diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 91a7e39e..3c418103 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -81,8 +81,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): title, section headers, text, lists, and tables. """ - content = "" - in_list = False in_table = False @@ -268,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend): def _get_current_level(self, parents): for k, v in parents.items(): - if v == None and k > 0: + if v is None and k > 0: return k - 1 return 0 def _get_current_parent(self, parents): for k, v in parents.items(): - if v == None and k > 0: + if v is None and k > 0: return parents[k - 1] return None @@ -323,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): "marker": marker, "text": text.strip(), "numbered": False, - "indent": 0 if indent == None else len(indent), + "indent": 0 if indent is None else len(indent), } else: return { @@ -331,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): "marker": marker, "text": text.strip(), "numbered": True, - "indent": 0 if indent == None else len(indent), + "indent": 0 if indent is None else len(indent), } else: # Fallback if no match diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index d1e5453d..f927885b 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -328,7 +328,7 @@ class oMath2Latex(Tag2Method): t_dict = self.process_children_dict(elm, include=("e", "lim")) latex_s = LIM_FUNC.get(t_dict["e"]) if not latex_s: - raise RuntimeError("Not support lim %s" % t_dict["e"]) + raise RuntimeError("Not support lim {}".format(t_dict["e"])) else: return latex_s.format(lim=t_dict.get("lim")) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 83226d7e..aa2637f2 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -146,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): item for item in element.next_siblings if isinstance(item, Tag) ] if element.next_sibling is None or any( - [item.name in TAGS_FOR_NODE_ITEMS for item in siblings] + item.name in TAGS_FOR_NODE_ITEMS for item in siblings ): text = text.strip() if text and tag.name in ["div"]: diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index c0ef2bce..3b9a6bb7 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -126,7 +126,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB enum_list_item_value = 0 new_list = None bullet_type = "None" - list_text = "" list_label = GroupLabel.LIST doc_label = DocItemLabel.LIST_ITEM prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size) @@ -368,8 +367,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB slide_width = pptx_obj.slide_width slide_height = pptx_obj.slide_height - text_content = [] # type: ignore - max_levels = 10 parents = {} # type: ignore for i in range(max_levels): @@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB ) slide_size = Size(width=slide_width, height=slide_height) - parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size) + doc.add_page(page_no=slide_ind + 1, size=slide_size) def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size): handle_groups(shape, parent_slide, slide_ind, doc, slide_size) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index c77b0783..49ccefef 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _get_level(self) -> int: """Return the first None index.""" for k, v in self.parents.items(): - if k >= 0 and v == None: + if k >= 0 and v is None: return k return 0 diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py index 17271fb8..e4b29957 100755 --- a/docling/backend/xml/jats_backend.py +++ b/docling/backend/xml/jats_backend.py @@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): doc_info: etree.DocInfo = self.tree.docinfo if doc_info.system_url and any( - [kwd in doc_info.system_url for kwd in JATS_DTD_URL] + kwd in doc_info.system_url for kwd in JATS_DTD_URL ): self.valid = True return for ent in doc_info.internalDTD.iterentities(): if ent.system_url and any( - [kwd in ent.system_url for kwd in JATS_DTD_URL] + kwd in ent.system_url for kwd in JATS_DTD_URL ): self.valid = True return @@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): # TODO: once superscript is supported, add label with formatting aff = aff.removeprefix(f"{label[0].text}, ") affiliation_names.append(aff) - affiliation_ids_names = { - id: name - for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) - } + affiliation_ids_names = dict( + zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) + ) # Get author names and affiliation names for author_node in meta.xpath( diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py index 29b41846..b0f8031f 100644 --- a/docling/backend/xml/uspto_backend.py +++ b/docling/backend/xml/uspto_backend.py @@ -1472,9 +1472,7 @@ class XmlTable: if cw == 0: offset_w0.append(col["offset"][ic]) - min_colinfo["offset"] = sorted( - list(set(col["offset"] + min_colinfo["offset"])) - ) + min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"])) # add back the 0 width cols to offset list offset_w0 = list(set(offset_w0)) diff --git a/docling/cli/main.py b/docling/cli/main.py index f60f11cb..1f9047da 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -430,7 +430,7 @@ def convert( settings.debug.visualize_ocr = debug_visualize_ocr if from_formats is None: - from_formats = [e for e in InputFormat] + from_formats = list(InputFormat) parsed_headers: Optional[Dict[str, str]] = None if headers is not None: diff --git a/docling/cli/models.py b/docling/cli/models.py index 80672714..982bbdda 100644 --- a/docling/cli/models.py +++ b/docling/cli/models.py @@ -89,14 +89,13 @@ def download( "Cannot simultaneously set 'all' parameter and specify models to download." ) if not quiet: - FORMAT = "%(message)s" logging.basicConfig( level=logging.INFO, format="[blue]%(message)s[/blue]", datefmt="[%X]", handlers=[RichHandler(show_level=False, show_time=False, markup=True)], ) - to_download = models or ([m for m in _AvailableModels] if all else _default_models) + to_download = models or (list(_AvailableModels) if all else _default_models) output_dir = download_models( output_dir=output_dir, force=force, diff --git a/docling/document_converter.py b/docling/document_converter.py index 4e37f409..125681f3 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -172,7 +172,7 @@ class DocumentConverter: format_options: Optional[Dict[InputFormat, FormatOption]] = None, ): self.allowed_formats = ( - allowed_formats if allowed_formats is not None else [e for e in InputFormat] + allowed_formats if allowed_formats is not None else list(InputFormat) ) self.format_to_options = { format: ( diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index c714af85..b40ca506 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -59,12 +59,10 @@ class EasyOcrModel(BaseOcrModel): device = decide_device(accelerator_options.device) # Enable easyocr GPU if running on CUDA, MPS use_gpu = any( - [ - device.startswith(x) - for x in [ - AcceleratorDevice.CUDA.value, - AcceleratorDevice.MPS.value, - ] + device.startswith(x) + for x in [ + AcceleratorDevice.CUDA.value, + AcceleratorDevice.MPS.value, ] ) else: diff --git a/docling/models/factories/base_factory.py b/docling/models/factories/base_factory.py index 542fc7e6..208f0cab 100644 --- a/docling/models/factories/base_factory.py +++ b/docling/models/factories/base_factory.py @@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta): @property def registered_kind(self) -> list[str]: - return list(opt.kind for opt in self._classes.keys()) + return [opt.kind for opt in self._classes.keys()] def get_enum(self) -> enum.Enum: return enum.Enum( diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py index 8516cee5..63f8fc95 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/hf_mlx_model.py @@ -121,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel): generation_time = time.time() - start_time page_tags = output + _log.debug(f"Generation time {generation_time:.2f} seconds.") + # inference_time = time.time() - start_time # tokens_per_second = num_tokens / generation_time # print("") diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py index 3d203b5e..29276fc4 100644 --- a/docling/models/hf_vlm_model.py +++ b/docling/models/hf_vlm_model.py @@ -166,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel): num_tokens = len(generated_ids[0]) page_tags = generated_texts + _log.debug( + f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." + ) + # inference_time = time.time() - start_time # tokens_per_second = num_tokens / generation_time # print("") diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 156045e9..0467e70f 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -64,7 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel): ) def _get_name_and_version(self) -> Tuple[str, str]: - if self._name != None and self._version != None: + if self._name is not None and self._version is not None: return self._name, self._version # type: ignore cmd = [self.options.tesseract_cmd, "--version"] @@ -187,7 +187,7 @@ class TesseractOcrCliModel(BaseOcrModel): self._tesseract_languages = df[0].tolist()[1:] # Decide the script prefix - if any([l.startswith("script/") for l in self._tesseract_languages]): + if any(l.startswith("script/") for l in self._tesseract_languages): script_prefix = "script/" else: script_prefix = "" diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index ef8c806f..892928d3 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -76,7 +76,7 @@ class TesseractOcrModel(BaseOcrModel): _log.debug("Initializing TesserOCR: %s", tesseract_version) lang = "+".join(self.options.lang) - if any([l.startswith("script/") for l in self._tesserocr_languages]): + if any(l.startswith("script/") for l in self._tesserocr_languages): self.script_prefix = "script/" else: self.script_prefix = "" diff --git a/docling/utils/utils.py b/docling/utils/utils.py index 1261f860..11b9fddb 100644 --- a/docling/utils/utils.py +++ b/docling/utils/utils.py @@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size): if isinstance(iterator, List): iterator = iter(iterator) for first in iterator: # Take the first element from the iterator - yield [first] + list(islice(iterator, chunk_size - 1)) + yield [first, *list(islice(iterator, chunk_size - 1))] def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str: diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb index 3af38b4f..60872c35 100644 --- a/docs/examples/backend_xml_rag.ipynb +++ b/docs/examples/backend_xml_rag.ipynb @@ -544,7 +544,7 @@ "source": [ "doc = backend.convert()\n", "\n", - "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n", + "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n", "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')" ] }, diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index e9972d02..beb1575a 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -84,7 +84,7 @@ def main(): ) } ) - result = doc_converter.convert(input_doc_path) + doc_converter.convert(input_doc_path) if __name__ == "__main__": diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index d863313d..6e77352d 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -807,10 +807,12 @@ } ], "source": [ + "from typing import Optional\n", + "\n", "from azure.search.documents.models import VectorizableTextQuery\n", "\n", "\n", - "def generate_chat_response(prompt: str, system_message: str = None):\n", + "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n", " \"\"\"\n", " Generates a single-turn chat response using Azure OpenAI Chat.\n", " If you need multi-turn conversation or follow-up queries, you'll have to\n", diff --git a/docs/examples/translate.py b/docs/examples/translate.py index 6aa9321f..229d5451 100644 --- a/docs/examples/translate.py +++ b/docs/examples/translate.py @@ -51,10 +51,9 @@ def main(): } ) - start_time = time.time() - conv_res = doc_converter.convert(input_doc_path) conv_doc = conv_res.document + doc_filename = conv_res.input.file # Save markdown with embedded pictures in original text md_filename = output_dir / f"{doc_filename}-with-images-orig.md" diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index 6d316129..fc047baf 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -38,7 +38,7 @@ def test_asciidocs_examples(): if os.path.exists(gname): with open(gname) as fr: - true_mddoc = fr.read() + fr.read() # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" else: diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 26c20ff8..d6f804cc 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path): page_backend: DoclingParsePageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper - im = page_backend.get_page_image( + page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) ) # im.show() diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index 4d5529cd..972f3b55 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path): page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper - im = page_backend.get_page_image( + page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) ) # im.show() diff --git a/tests/test_backend_docling_parse_v4.py b/tests/test_backend_docling_parse_v4.py index caa4cc1a..35c4eab7 100644 --- a/tests/test_backend_docling_parse_v4.py +++ b/tests/test_backend_docling_parse_v4.py @@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path): page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper - im = page_backend.get_page_image( + page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) ) # im.show() diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 0ce2ec57..65f636e0 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -79,7 +79,7 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0] + path = next(item for item in get_xlsx_paths() if item.stem == "test-01") in_doc = InputDocument( path_or_stream=path, format=InputFormat.XLSX, @@ -90,7 +90,7 @@ def test_pages(documents) -> None: assert backend.page_count() == 3 # number of pages from the converted document - doc = [item for path, item in documents if path.stem == "test-01"][0] + doc = next(item for path, item in documents if path.stem == "test-01") assert len(doc.pages) == 3 # page sizes as number of cells diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py index a43adda1..ace6d3a2 100644 --- a/tests/test_backend_patent_uspto.py +++ b/tests/test_backend_patent_uspto.py @@ -129,7 +129,7 @@ def test_tables(tables): """Test the table parser.""" # CHECK table in file tables_20180000016.xml file_name = "tables_ipa20180000016.xml" - file_table = [item[1] for item in tables if item[0].name == file_name][0] + file_table = next(item[1] for item in tables if item[0].name == file_name) assert file_table.num_rows == 13 assert file_table.num_cols == 10 assert len(file_table.table_cells) == 130 @@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents): # CHECK application doc number 20200022300 file_name = "ipa20200022300.xml" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) if GENERATE: _generate_groundtruth(doc, Path(file_name).stem) @@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents): # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables file_name = "ipa20180000016.xml" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) if GENERATE: _generate_groundtruth(doc, Path(file_name).stem) @@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents): # CHECK application doc number 20110039701 for complex long tables file_name = "ipa20110039701.xml" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) assert doc.name == file_name assert len(doc.tables) == 17 @@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents): # CHECK application doc number 06442728 file_name = "pg06442728.xml" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) if GENERATE: _generate_groundtruth(doc, Path(file_name).stem) @@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents): # CHECK application doc number 20010031492 file_name = "pa20010031492.xml" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) if GENERATE: _generate_groundtruth(doc, Path(file_name).stem) @@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents): # CHECK application doc number 057006474 file_name = "pftaps057006474.txt" - doc = [item[1] for item in patents if item[0].name == file_name][0] + doc = next(item[1] for item in patents if item[0].name == file_name) if GENERATE: _generate_groundtruth(doc, Path(file_name).stem) diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index 55f55437..317cdeed 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path): page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper - im = page_backend.get_page_image( + page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) ) # im.show() diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 661fe93d..94a68873 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -14,7 +14,7 @@ from docling.document_converter import PdfFormatOption def test_in_doc_from_valid_path(): test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") doc = _make_input_doc(test_doc_path) - assert doc.valid == True + assert doc.valid is True def test_in_doc_from_invalid_path(): @@ -22,7 +22,7 @@ def test_in_doc_from_invalid_path(): doc = _make_input_doc(test_doc_path) - assert doc.valid == False + assert doc.valid is False def test_in_doc_from_valid_buf(): @@ -30,7 +30,7 @@ def test_in_doc_from_valid_buf(): stream = DocumentStream(name="my_doc.pdf", stream=buf) doc = _make_input_doc_from_stream(stream) - assert doc.valid == True + assert doc.valid is True def test_in_doc_from_invalid_buf(): @@ -38,7 +38,7 @@ def test_in_doc_from_invalid_buf(): stream = DocumentStream(name="my_doc.pdf", stream=buf) doc = _make_input_doc_from_stream(stream) - assert doc.valid == False + assert doc.valid is False def test_image_in_pdf_backend(): @@ -82,7 +82,7 @@ def test_in_doc_with_page_range(): backend=PyPdfiumDocumentBackend, limits=limits, ) - assert doc.valid == True + assert doc.valid is True limits.page_range = (9, 9) @@ -92,7 +92,7 @@ def test_in_doc_with_page_range(): backend=PyPdfiumDocumentBackend, limits=limits, ) - assert doc.valid == True + assert doc.valid is True limits.page_range = (11, 12) @@ -102,7 +102,7 @@ def test_in_doc_with_page_range(): backend=PyPdfiumDocumentBackend, limits=limits, ) - assert doc.valid == False + assert doc.valid is False def test_guess_format(tmp_path): @@ -187,17 +187,17 @@ def test_guess_format(tmp_path): ) doc_path = temp_dir / "docling_test.xml" doc_path.write_text(xml_content, encoding="utf-8") - assert dci._guess_format(doc_path) == None + assert dci._guess_format(doc_path) is None buf = BytesIO(Path(doc_path).open("rb").read()) stream = DocumentStream(name="docling_test.xml", stream=buf) - assert dci._guess_format(stream) == None + assert dci._guess_format(stream) is None # Invalid USPTO patent (as plain text) stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz")) - assert dci._guess_format(stream) == None + assert dci._guess_format(stream) is None doc_path = temp_dir / "pftaps_wrong.txt" doc_path.write_text("xyz", encoding="utf-8") - assert dci._guess_format(doc_path) == None + assert dci._guess_format(doc_path) is None # Valid Docling JSON test_str = '{"name": ""}' diff --git a/tests/verify_utils.py b/tests/verify_utils.py index ec75f29a..ab3412e3 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -291,7 +291,7 @@ def verify_conversion_result_v1( input_path: Path, doc_result: ConversionResult, generate: bool = False, - ocr_engine: str = None, + ocr_engine: Optional[str] = None, fuzzy: bool = False, ): PageList = TypeAdapter(List[Page]) @@ -375,7 +375,7 @@ def verify_conversion_result_v2( input_path: Path, doc_result: ConversionResult, generate: bool = False, - ocr_engine: str = None, + ocr_engine: Optional[str] = None, fuzzy: bool = False, ): PageList = TypeAdapter(List[Page])