Apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-04-14 15:01:05 +02:00
parent 73cec158c6
commit 557efde7dc
30 changed files with 65 additions and 69 deletions

View File

@ -81,8 +81,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
title, section headers, text, lists, and tables. title, section headers, text, lists, and tables.
""" """
content = ""
in_list = False in_list = False
in_table = False in_table = False
@ -268,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
def _get_current_level(self, parents): def _get_current_level(self, parents):
for k, v in parents.items(): for k, v in parents.items():
if v == None and k > 0: if v is None and k > 0:
return k - 1 return k - 1
return 0 return 0
def _get_current_parent(self, parents): def _get_current_parent(self, parents):
for k, v in parents.items(): for k, v in parents.items():
if v == None and k > 0: if v is None and k > 0:
return parents[k - 1] return parents[k - 1]
return None return None
@ -323,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker, "marker": marker,
"text": text.strip(), "text": text.strip(),
"numbered": False, "numbered": False,
"indent": 0 if indent == None else len(indent), "indent": 0 if indent is None else len(indent),
} }
else: else:
return { return {
@ -331,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker, "marker": marker,
"text": text.strip(), "text": text.strip(),
"numbered": True, "numbered": True,
"indent": 0 if indent == None else len(indent), "indent": 0 if indent is None else len(indent),
} }
else: else:
# Fallback if no match # Fallback if no match

View File

@ -328,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim")) t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"]) latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s: if not latex_s:
raise RuntimeError("Not support lim %s" % t_dict["e"]) raise RuntimeError("Not support lim {}".format(t_dict["e"]))
else: else:
return latex_s.format(lim=t_dict.get("lim")) return latex_s.format(lim=t_dict.get("lim"))

View File

@ -146,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
item for item in element.next_siblings if isinstance(item, Tag) item for item in element.next_siblings if isinstance(item, Tag)
] ]
if element.next_sibling is None or any( if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings] item.name in TAGS_FOR_NODE_ITEMS for item in siblings
): ):
text = text.strip() text = text.strip()
if text and tag.name in ["div"]: if text and tag.name in ["div"]:

View File

@ -126,7 +126,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enum_list_item_value = 0 enum_list_item_value = 0
new_list = None new_list = None
bullet_type = "None" bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size) prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@ -368,8 +367,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
slide_width = pptx_obj.slide_width slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height slide_height = pptx_obj.slide_height
text_content = [] # type: ignore
max_levels = 10 max_levels = 10
parents = {} # type: ignore parents = {} # type: ignore
for i in range(max_levels): for i in range(max_levels):
@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
) )
slide_size = Size(width=slide_width, height=slide_height) slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size) doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size): def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc, slide_size) handle_groups(shape, parent_slide, slide_ind, doc, slide_size)

View File

@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _get_level(self) -> int: def _get_level(self) -> int:
"""Return the first None index.""" """Return the first None index."""
for k, v in self.parents.items(): for k, v in self.parents.items():
if k >= 0 and v == None: if k >= 0 and v is None:
return k return k
return 0 return 0

View File

@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
doc_info: etree.DocInfo = self.tree.docinfo doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any( if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL] kwd in doc_info.system_url for kwd in JATS_DTD_URL
): ):
self.valid = True self.valid = True
return return
for ent in doc_info.internalDTD.iterentities(): for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any( if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL] kwd in ent.system_url for kwd in JATS_DTD_URL
): ):
self.valid = True self.valid = True
return return
@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
# TODO: once superscript is supported, add label with formatting # TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ") aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff) affiliation_names.append(aff)
affiliation_ids_names = { affiliation_ids_names = dict(
id: name zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) )
}
# Get author names and affiliation names # Get author names and affiliation names
for author_node in meta.xpath( for author_node in meta.xpath(

View File

@ -1472,9 +1472,7 @@ class XmlTable:
if cw == 0: if cw == 0:
offset_w0.append(col["offset"][ic]) offset_w0.append(col["offset"][ic])
min_colinfo["offset"] = sorted( min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
list(set(col["offset"] + min_colinfo["offset"]))
)
# add back the 0 width cols to offset list # add back the 0 width cols to offset list
offset_w0 = list(set(offset_w0)) offset_w0 = list(set(offset_w0))

View File

@ -430,7 +430,7 @@ def convert(
settings.debug.visualize_ocr = debug_visualize_ocr settings.debug.visualize_ocr = debug_visualize_ocr
if from_formats is None: if from_formats is None:
from_formats = [e for e in InputFormat] from_formats = list(InputFormat)
parsed_headers: Optional[Dict[str, str]] = None parsed_headers: Optional[Dict[str, str]] = None
if headers is not None: if headers is not None:

View File

@ -89,14 +89,13 @@ def download(
"Cannot simultaneously set 'all' parameter and specify models to download." "Cannot simultaneously set 'all' parameter and specify models to download."
) )
if not quiet: if not quiet:
FORMAT = "%(message)s"
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format="[blue]%(message)s[/blue]", format="[blue]%(message)s[/blue]",
datefmt="[%X]", datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)], handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
) )
to_download = models or ([m for m in _AvailableModels] if all else _default_models) to_download = models or (list(_AvailableModels) if all else _default_models)
output_dir = download_models( output_dir = download_models(
output_dir=output_dir, output_dir=output_dir,
force=force, force=force,

View File

@ -172,7 +172,7 @@ class DocumentConverter:
format_options: Optional[Dict[InputFormat, FormatOption]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None,
): ):
self.allowed_formats = ( self.allowed_formats = (
allowed_formats if allowed_formats is not None else [e for e in InputFormat] allowed_formats if allowed_formats is not None else list(InputFormat)
) )
self.format_to_options = { self.format_to_options = {
format: ( format: (

View File

@ -59,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
device = decide_device(accelerator_options.device) device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS # Enable easyocr GPU if running on CUDA, MPS
use_gpu = any( use_gpu = any(
[ device.startswith(x)
device.startswith(x) for x in [
for x in [ AcceleratorDevice.CUDA.value,
AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value,
AcceleratorDevice.MPS.value,
]
] ]
) )
else: else:

View File

@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
@property @property
def registered_kind(self) -> list[str]: def registered_kind(self) -> list[str]:
return list(opt.kind for opt in self._classes.keys()) return [opt.kind for opt in self._classes.keys()]
def get_enum(self) -> enum.Enum: def get_enum(self) -> enum.Enum:
return enum.Enum( return enum.Enum(

View File

@ -121,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
generation_time = time.time() - start_time generation_time = time.time() - start_time
page_tags = output page_tags = output
_log.debug(f"Generation time {generation_time:.2f} seconds.")
# inference_time = time.time() - start_time # inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time # tokens_per_second = num_tokens / generation_time
# print("") # print("")

View File

@ -166,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
num_tokens = len(generated_ids[0]) num_tokens = len(generated_ids[0])
page_tags = generated_texts page_tags = generated_texts
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
# inference_time = time.time() - start_time # inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time # tokens_per_second = num_tokens / generation_time
# print("") # print("")

View File

@ -64,7 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
) )
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None: if self._name is not None and self._version is not None:
return self._name, self._version # type: ignore return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"] cmd = [self.options.tesseract_cmd, "--version"]
@ -187,7 +187,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._tesseract_languages = df[0].tolist()[1:] self._tesseract_languages = df[0].tolist()[1:]
# Decide the script prefix # Decide the script prefix
if any([l.startswith("script/") for l in self._tesseract_languages]): if any(l.startswith("script/") for l in self._tesseract_languages):
script_prefix = "script/" script_prefix = "script/"
else: else:
script_prefix = "" script_prefix = ""

View File

@ -76,7 +76,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version) _log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
if any([l.startswith("script/") for l in self._tesserocr_languages]): if any(l.startswith("script/") for l in self._tesserocr_languages):
self.script_prefix = "script/" self.script_prefix = "script/"
else: else:
self.script_prefix = "" self.script_prefix = ""

View File

@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
if isinstance(iterator, List): if isinstance(iterator, List):
iterator = iter(iterator) iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1)) yield [first, *list(islice(iterator, chunk_size - 1))]
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str: def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:

View File

@ -544,7 +544,7 @@
"source": [ "source": [
"doc = backend.convert()\n", "doc = backend.convert()\n",
"\n", "\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n", "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')" "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
] ]
}, },

View File

@ -84,7 +84,7 @@ def main():
) )
} }
) )
result = doc_converter.convert(input_doc_path) doc_converter.convert(input_doc_path)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -807,10 +807,12 @@
} }
], ],
"source": [ "source": [
"from typing import Optional\n",
"\n",
"from azure.search.documents.models import VectorizableTextQuery\n", "from azure.search.documents.models import VectorizableTextQuery\n",
"\n", "\n",
"\n", "\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n", "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
" \"\"\"\n", " \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n", " Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n", " If you need multi-turn conversation or follow-up queries, you'll have to\n",

View File

@ -51,10 +51,9 @@ def main():
} }
) )
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path) conv_res = doc_converter.convert(input_doc_path)
conv_doc = conv_res.document conv_doc = conv_res.document
doc_filename = conv_res.input.file
# Save markdown with embedded pictures in original text # Save markdown with embedded pictures in original text
md_filename = output_dir / f"{doc_filename}-with-images-orig.md" md_filename = output_dir / f"{doc_filename}-with-images-orig.md"

View File

@ -38,7 +38,7 @@ def test_asciidocs_examples():
if os.path.exists(gname): if os.path.exists(gname):
with open(gname) as fr: with open(gname) as fr:
true_mddoc = fr.read() fr.read()
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else: else:

View File

@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image( page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
) )
# im.show() # im.show()

View File

@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image( page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
) )
# im.show() # im.show()

View File

@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0) page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image( page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
) )
# im.show() # im.show()

View File

@ -79,7 +79,7 @@ def test_pages(documents) -> None:
documents: The paths and converted documents. documents: The paths and converted documents.
""" """
# number of pages from the backend method # number of pages from the backend method
path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0] path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=path, path_or_stream=path,
format=InputFormat.XLSX, format=InputFormat.XLSX,
@ -90,7 +90,7 @@ def test_pages(documents) -> None:
assert backend.page_count() == 3 assert backend.page_count() == 3
# number of pages from the converted document # number of pages from the converted document
doc = [item for path, item in documents if path.stem == "test-01"][0] doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 3 assert len(doc.pages) == 3
# page sizes as number of cells # page sizes as number of cells

View File

@ -129,7 +129,7 @@ def test_tables(tables):
"""Test the table parser.""" """Test the table parser."""
# CHECK table in file tables_20180000016.xml # CHECK table in file tables_20180000016.xml
file_name = "tables_ipa20180000016.xml" file_name = "tables_ipa20180000016.xml"
file_table = [item[1] for item in tables if item[0].name == file_name][0] file_table = next(item[1] for item in tables if item[0].name == file_name)
assert file_table.num_rows == 13 assert file_table.num_rows == 13
assert file_table.num_cols == 10 assert file_table.num_cols == 10
assert len(file_table.table_cells) == 130 assert len(file_table.table_cells) == 130
@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20200022300 # CHECK application doc number 20200022300
file_name = "ipa20200022300.xml" file_name = "ipa20200022300.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE: if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem) _generate_groundtruth(doc, Path(file_name).stem)
@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
file_name = "ipa20180000016.xml" file_name = "ipa20180000016.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE: if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem) _generate_groundtruth(doc, Path(file_name).stem)
@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20110039701 for complex long tables # CHECK application doc number 20110039701 for complex long tables
file_name = "ipa20110039701.xml" file_name = "ipa20110039701.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
assert doc.name == file_name assert doc.name == file_name
assert len(doc.tables) == 17 assert len(doc.tables) == 17
@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
# CHECK application doc number 06442728 # CHECK application doc number 06442728
file_name = "pg06442728.xml" file_name = "pg06442728.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE: if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem) _generate_groundtruth(doc, Path(file_name).stem)
@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
# CHECK application doc number 20010031492 # CHECK application doc number 20010031492
file_name = "pa20010031492.xml" file_name = "pa20010031492.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE: if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem) _generate_groundtruth(doc, Path(file_name).stem)
@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
# CHECK application doc number 057006474 # CHECK application doc number 057006474
file_name = "pftaps057006474.txt" file_name = "pftaps057006474.txt"
doc = [item[1] for item in patents if item[0].name == file_name][0] doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE: if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem) _generate_groundtruth(doc, Path(file_name).stem)

View File

@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image( page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
) )
# im.show() # im.show()

View File

@ -14,7 +14,7 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path) doc = _make_input_doc(test_doc_path)
assert doc.valid == True assert doc.valid is True
def test_in_doc_from_invalid_path(): def test_in_doc_from_invalid_path():
@ -22,7 +22,7 @@ def test_in_doc_from_invalid_path():
doc = _make_input_doc(test_doc_path) doc = _make_input_doc(test_doc_path)
assert doc.valid == False assert doc.valid is False
def test_in_doc_from_valid_buf(): def test_in_doc_from_valid_buf():
@ -30,7 +30,7 @@ def test_in_doc_from_valid_buf():
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream) doc = _make_input_doc_from_stream(stream)
assert doc.valid == True assert doc.valid is True
def test_in_doc_from_invalid_buf(): def test_in_doc_from_invalid_buf():
@ -38,7 +38,7 @@ def test_in_doc_from_invalid_buf():
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream) doc = _make_input_doc_from_stream(stream)
assert doc.valid == False assert doc.valid is False
def test_image_in_pdf_backend(): def test_image_in_pdf_backend():
@ -82,7 +82,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend, backend=PyPdfiumDocumentBackend,
limits=limits, limits=limits,
) )
assert doc.valid == True assert doc.valid is True
limits.page_range = (9, 9) limits.page_range = (9, 9)
@ -92,7 +92,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend, backend=PyPdfiumDocumentBackend,
limits=limits, limits=limits,
) )
assert doc.valid == True assert doc.valid is True
limits.page_range = (11, 12) limits.page_range = (11, 12)
@ -102,7 +102,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend, backend=PyPdfiumDocumentBackend,
limits=limits, limits=limits,
) )
assert doc.valid == False assert doc.valid is False
def test_guess_format(tmp_path): def test_guess_format(tmp_path):
@ -187,17 +187,17 @@ def test_guess_format(tmp_path):
) )
doc_path = temp_dir / "docling_test.xml" doc_path = temp_dir / "docling_test.xml"
doc_path.write_text(xml_content, encoding="utf-8") doc_path.write_text(xml_content, encoding="utf-8")
assert dci._guess_format(doc_path) == None assert dci._guess_format(doc_path) is None
buf = BytesIO(Path(doc_path).open("rb").read()) buf = BytesIO(Path(doc_path).open("rb").read())
stream = DocumentStream(name="docling_test.xml", stream=buf) stream = DocumentStream(name="docling_test.xml", stream=buf)
assert dci._guess_format(stream) == None assert dci._guess_format(stream) is None
# Invalid USPTO patent (as plain text) # Invalid USPTO patent (as plain text)
stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz")) stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
assert dci._guess_format(stream) == None assert dci._guess_format(stream) is None
doc_path = temp_dir / "pftaps_wrong.txt" doc_path = temp_dir / "pftaps_wrong.txt"
doc_path.write_text("xyz", encoding="utf-8") doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) == None assert dci._guess_format(doc_path) is None
# Valid Docling JSON # Valid Docling JSON
test_str = '{"name": ""}' test_str = '{"name": ""}'

View File

@ -291,7 +291,7 @@ def verify_conversion_result_v1(
input_path: Path, input_path: Path,
doc_result: ConversionResult, doc_result: ConversionResult,
generate: bool = False, generate: bool = False,
ocr_engine: str = None, ocr_engine: Optional[str] = None,
fuzzy: bool = False, fuzzy: bool = False,
): ):
PageList = TypeAdapter(List[Page]) PageList = TypeAdapter(List[Page])
@ -375,7 +375,7 @@ def verify_conversion_result_v2(
input_path: Path, input_path: Path,
doc_result: ConversionResult, doc_result: ConversionResult,
generate: bool = False, generate: bool = False,
ocr_engine: str = None, ocr_engine: Optional[str] = None,
fuzzy: bool = False, fuzzy: bool = False,
): ):
PageList = TypeAdapter(List[Page]) PageList = TypeAdapter(List[Page])