Mirror of https://github.com/DS4SD/docling.git
commit 557efde7dc (parent 73cec158c6)

apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
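The hunks below match what ruff's autofixer produces when unsafe fixes are enabled; the exact invocation is not recorded in the commit, but it would be something along the lines of "ruff check --fix --unsafe-fixes". As a minimal sketch of the recurring patterns (the rule codes are an assumption inferred from the patterns, not part of the commit):

    # Illustrative before/after pairs for the main fix families in this diff.
    # Rule codes (E711, C419, RUF015) are assumed attributions.
    xs = [1, None, 2]

    flag_before = any([x == None for x in xs])   # "== None" (E711) inside any([...]) (C419)
    flag_after = any(x is None for x in xs)      # identity check plus lazy generator

    first_before = [x for x in xs if x == 2][0]  # builds the whole list first (RUF015)
    first_after = next(x for x in xs if x == 2)  # stops at the first match

    assert flag_before == flag_after
    assert first_before == first_after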
@@ -81,8 +81,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         title, section headers, text, lists, and tables.
         """

-        content = ""
-
         in_list = False
         in_table = False

@@ -268,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1

         return 0

     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]

         return None
@@ -323,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": False,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             return {
@@ -331,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": True,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             # Fallback if no match
@@ -328,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise RuntimeError("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))

@@ -146,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 item for item in element.next_siblings if isinstance(item, Tag)
             ]
             if element.next_sibling is None or any(
-                [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                item.name in TAGS_FOR_NODE_ITEMS for item in siblings
             ):
                 text = text.strip()
                 if text and tag.name in ["div"]:
@@ -126,7 +126,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         enum_list_item_value = 0
         new_list = None
         bullet_type = "None"
-        list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -368,8 +367,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height

-        text_content = []  # type: ignore
-
         max_levels = 10
         parents = {}  # type: ignore
         for i in range(max_levels):
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
             )

             slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)

         def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
             handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0

@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         doc_info: etree.DocInfo = self.tree.docinfo
         if doc_info.system_url and any(
-            [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+            kwd in doc_info.system_url for kwd in JATS_DTD_URL
         ):
             self.valid = True
             return
         for ent in doc_info.internalDTD.iterentities():
             if ent.system_url and any(
-                [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                kwd in ent.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 # TODO: once superscript is supported, add label with formatting
                 aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

         # Get author names and affiliation names
         for author_node in meta.xpath(
@@ -1472,9 +1472,7 @@ class XmlTable:
                 if cw == 0:
                     offset_w0.append(col["offset"][ic])

-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))
@@ -430,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr

     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -59,13 +59,11 @@ class EasyOcrModel(BaseOcrModel):
             device = decide_device(accelerator_options.device)
             # Enable easyocr GPU if running on CUDA, MPS
             use_gpu = any(
-                [
                     device.startswith(x)
                     for x in [
                         AcceleratorDevice.CUDA.value,
                         AcceleratorDevice.MPS.value,
                     ]
-                ]
             )
         else:
             warnings.warn(
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):

     @property
     def registered_kind(self) -> list[str]:
-        return list(opt.kind for opt in self._classes.keys())
+        return [opt.kind for opt in self._classes.keys()]

     def get_enum(self) -> enum.Enum:
         return enum.Enum(
@@ -121,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                 generation_time = time.time() - start_time
                 page_tags = output

+                _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
@@ -166,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                 num_tokens = len(generated_ids[0])
                 page_tags = generated_texts

+                _log.debug(
+                    f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                )
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
@@ -64,7 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         )

     def _get_name_and_version(self) -> Tuple[str, str]:
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore

         cmd = [self.options.tesseract_cmd, "--version"]
@@ -187,7 +187,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._tesseract_languages = df[0].tolist()[1:]

         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(l.startswith("script/") for l in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -76,7 +76,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)

-            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+            if any(l.startswith("script/") for l in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]


 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
@@ -544,7 +544,7 @@
    "source": [
     "doc = backend.convert()\n",
     "\n",
-    "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+    "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
     "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
    ]
   },
@@ -84,7 +84,7 @@ def main():
             )
         }
     )
-    result = doc_converter.convert(input_doc_path)
+    doc_converter.convert(input_doc_path)


 if __name__ == "__main__":
@@ -807,10 +807,12 @@
     }
    ],
    "source": [
+    "from typing import Optional\n",
+    "\n",
     "from azure.search.documents.models import VectorizableTextQuery\n",
     "\n",
     "\n",
-    "def generate_chat_response(prompt: str, system_message: str = None):\n",
+    "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
     "    \"\"\"\n",
     "    Generates a single-turn chat response using Azure OpenAI Chat.\n",
     "    If you need multi-turn conversation or follow-up queries, you'll have to\n",
@@ -51,10 +51,9 @@ def main():
         }
     )

-    start_time = time.time()
-
     conv_res = doc_converter.convert(input_doc_path)
     conv_doc = conv_res.document
+    doc_filename = conv_res.input.file

     # Save markdown with embedded pictures in original text
     md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
@@ -38,7 +38,7 @@ def test_asciidocs_examples():

         if os.path.exists(gname):
             with open(gname) as fr:
-                true_mddoc = fr.read()
+                fr.read()

             # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
         else:
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -79,7 +79,7 @@ def test_pages(documents) -> None:
         documents: The paths and converted documents.
     """
     # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
     in_doc = InputDocument(
         path_or_stream=path,
         format=InputFormat.XLSX,
@@ -90,7 +90,7 @@ def test_pages(documents) -> None:
     assert backend.page_count() == 3

     # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
     assert len(doc.pages) == 3

     # page sizes as number of cells
@@ -129,7 +129,7 @@ def test_tables(tables):
     """Test the table parser."""
     # CHECK table in file tables_20180000016.xml
     file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
     assert file_table.num_rows == 13
     assert file_table.num_cols == 10
     assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):

     # CHECK application doc number 20200022300
     file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)

@@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):

     # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
     file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)

@@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):

     # CHECK application doc number 20110039701 for complex long tables
     file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     assert doc.name == file_name
     assert len(doc.tables) == 17

@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):

     # CHECK application doc number 06442728
     file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)

@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):

     # CHECK application doc number 20010031492
     file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)

@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):

     # CHECK application doc number 057006474
     file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)

@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -14,7 +14,7 @@ from docling.document_converter import PdfFormatOption
 def test_in_doc_from_valid_path():
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True


 def test_in_doc_from_invalid_path():
@@ -22,7 +22,7 @@ def test_in_doc_from_invalid_path():

     doc = _make_input_doc(test_doc_path)

-    assert doc.valid == False
+    assert doc.valid is False


 def test_in_doc_from_valid_buf():
@@ -30,7 +30,7 @@ def test_in_doc_from_valid_buf():
     stream = DocumentStream(name="my_doc.pdf", stream=buf)

     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True


 def test_in_doc_from_invalid_buf():
@@ -38,7 +38,7 @@ def test_in_doc_from_invalid_buf():
     stream = DocumentStream(name="my_doc.pdf", stream=buf)

     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False


 def test_image_in_pdf_backend():
@@ -82,7 +82,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True

     limits.page_range = (9, 9)

@@ -92,7 +92,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True

     limits.page_range = (11, 12)

@@ -102,7 +102,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == False
+    assert doc.valid is False


 def test_guess_format(tmp_path):
@@ -187,17 +187,17 @@ def test_guess_format(tmp_path):
     )
     doc_path = temp_dir / "docling_test.xml"
     doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     buf = BytesIO(Path(doc_path).open("rb").read())
     stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None

     # Invalid USPTO patent (as plain text)
     stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     doc_path = temp_dir / "pftaps_wrong.txt"
     doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None

     # Valid Docling JSON
     test_str = '{"name": ""}'
@@ -291,7 +291,7 @@ def verify_conversion_result_v1(
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
@@ -375,7 +375,7 @@ def verify_conversion_result_v2(
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])