mirror of https://github.com/DS4SD/docling.git

apply ruff unsafe fixes
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
parent 73cec158c6
commit 557efde7dc
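
For context: these are ruff's unsafe autofixes, rewrites the tool only applies when explicitly requested (typically with an invocation along the lines of "ruff check --fix --unsafe-fixes", though the exact command is not recorded in the commit), because they can change behavior in edge cases. The hunks below are dominated by a handful of recurring rewrites: comparisons like == None and == True replaced with identity checks, redundant list brackets dropped inside any(...), single-element list indexing replaced with next(...), printf-style formatting replaced with str.format, and unused assignments removed. A condensed sketch of these patterns follows the diff.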
@@ -81,8 +81,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         title, section headers, text, lists, and tables.
         """
-
         content = ""
-
         in_list = False
         in_table = False
 
@@ -268,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
 
     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1
 
         return 0
 
     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]
 
         return None
@@ -323,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": False,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             return {
@@ -331,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": True,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             # Fallback if no match
@@ -328,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise RuntimeError("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))
 
@@ -146,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 item for item in element.next_siblings if isinstance(item, Tag)
             ]
             if element.next_sibling is None or any(
-                [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                item.name in TAGS_FOR_NODE_ITEMS for item in siblings
             ):
                 text = text.strip()
                 if text and tag.name in ["div"]:
@@ -126,7 +126,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         enum_list_item_value = 0
-        new_list = None
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -368,8 +367,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height
 
-        text_content = []  # type: ignore
-
         max_levels = 10
         parents = {}  # type: ignore
         for i in range(max_levels):
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         )
 
         slide_size = Size(width=slide_width, height=slide_height)
-        parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+        doc.add_page(page_no=slide_ind + 1, size=slide_size)
 
         def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
             handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0
 
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
 
             doc_info: etree.DocInfo = self.tree.docinfo
             if doc_info.system_url and any(
-                [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+                kwd in doc_info.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return
             for ent in doc_info.internalDTD.iterentities():
                 if ent.system_url and any(
-                    [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                    kwd in ent.system_url for kwd in JATS_DTD_URL
                 ):
                     self.valid = True
                     return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             # TODO: once superscript is supported, add label with formatting
             aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )
 
         # Get author names and affiliation names
         for author_node in meta.xpath(
@@ -1472,9 +1472,7 @@ class XmlTable:
             if cw == 0:
                 offset_w0.append(col["offset"][ic])
 
-        min_colinfo["offset"] = sorted(
-            list(set(col["offset"] + min_colinfo["offset"]))
-        )
+        min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
 
         # add back the 0 width cols to offset list
        offset_w0 = list(set(offset_w0))
@@ -430,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr
 
     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)
 
     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -59,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
             device = decide_device(accelerator_options.device)
             # Enable easyocr GPU if running on CUDA, MPS
             use_gpu = any(
-                [
-                    device.startswith(x)
-                    for x in [
-                        AcceleratorDevice.CUDA.value,
-                        AcceleratorDevice.MPS.value,
-                    ]
-                ]
+                device.startswith(x)
+                for x in [
+                    AcceleratorDevice.CUDA.value,
+                    AcceleratorDevice.MPS.value,
+                ]
             )
         else:
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
 
     @property
     def registered_kind(self) -> list[str]:
-        return list(opt.kind for opt in self._classes.keys())
+        return [opt.kind for opt in self._classes.keys()]
 
     def get_enum(self) -> enum.Enum:
         return enum.Enum(
@@ -121,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                 generation_time = time.time() - start_time
                 page_tags = output
 
+                _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
@@ -166,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                 num_tokens = len(generated_ids[0])
                 page_tags = generated_texts
 
+                _log.debug(
+                    f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                )
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
@@ -64,7 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         )
 
     def _get_name_and_version(self) -> Tuple[str, str]:
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore
 
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -187,7 +187,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._tesseract_languages = df[0].tolist()[1:]
 
         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(l.startswith("script/") for l in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -76,7 +76,7 @@ class TesseractOcrModel(BaseOcrModel):
         _log.debug("Initializing TesserOCR: %s", tesseract_version)
         lang = "+".join(self.options.lang)
 
-        if any([l.startswith("script/") for l in self._tesserocr_languages]):
+        if any(l.startswith("script/") for l in self._tesserocr_languages):
             self.script_prefix = "script/"
         else:
             self.script_prefix = ""
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]
 
 
 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
@@ -544,7 +544,7 @@
    "source": [
     "doc = backend.convert()\n",
     "\n",
-    "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+    "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
     "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
    ]
   },
@@ -84,7 +84,7 @@ def main():
             )
         }
     )
-    result = doc_converter.convert(input_doc_path)
+    doc_converter.convert(input_doc_path)
 
 
 if __name__ == "__main__":
@@ -807,10 +807,12 @@
    }
   ],
   "source": [
+    "from typing import Optional\n",
+    "\n",
    "from azure.search.documents.models import VectorizableTextQuery\n",
    "\n",
    "\n",
-    "def generate_chat_response(prompt: str, system_message: str = None):\n",
+    "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
    "    \"\"\"\n",
    "    Generates a single-turn chat response using Azure OpenAI Chat.\n",
    "    If you need multi-turn conversation or follow-up queries, you'll have to\n",
@@ -51,10 +51,9 @@ def main():
         }
     )
 
     start_time = time.time()
-
     conv_res = doc_converter.convert(input_doc_path)
     conv_doc = conv_res.document
     doc_filename = conv_res.input.file
 
     # Save markdown with embedded pictures in original text
     md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
@@ -38,7 +38,7 @@ def test_asciidocs_examples():
 
         if os.path.exists(gname):
             with open(gname) as fr:
-                true_mddoc = fr.read()
+                fr.read()
 
             # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
         else:
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -79,7 +79,7 @@ def test_pages(documents) -> None:
         documents: The paths and converted documents.
     """
     # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
     in_doc = InputDocument(
         path_or_stream=path,
         format=InputFormat.XLSX,
@@ -90,7 +90,7 @@ def test_pages(documents) -> None:
     assert backend.page_count() == 3
 
     # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
     assert len(doc.pages) == 3
 
     # page sizes as number of cells
@@ -129,7 +129,7 @@ def test_tables(tables):
     """Test the table parser."""
     # CHECK table in file tables_20180000016.xml
     file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
     assert file_table.num_rows == 13
     assert file_table.num_cols == 10
     assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20200022300
     file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
     file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20110039701 for complex long tables
     file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     assert doc.name == file_name
     assert len(doc.tables) == 17
 
@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
 
     # CHECK application doc number 06442728
     file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
 
     # CHECK application doc number 20010031492
     file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
 
     # CHECK application doc number 057006474
     file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
@@ -14,7 +14,7 @@ from docling.document_converter import PdfFormatOption
 def test_in_doc_from_valid_path():
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_path():
@@ -22,7 +22,7 @@ def test_in_doc_from_invalid_path():
 
     doc = _make_input_doc(test_doc_path)
 
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_in_doc_from_valid_buf():
@@ -30,7 +30,7 @@ def test_in_doc_from_valid_buf():
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_buf():
@@ -38,7 +38,7 @@ def test_in_doc_from_invalid_buf():
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_image_in_pdf_backend():
@@ -82,7 +82,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (9, 9)
 
@@ -92,7 +92,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (11, 12)
 
@@ -102,7 +102,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_guess_format(tmp_path):
@@ -187,17 +187,17 @@ def test_guess_format(tmp_path):
     )
     doc_path = temp_dir / "docling_test.xml"
     doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     buf = BytesIO(Path(doc_path).open("rb").read())
     stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
 
     # Invalid USPTO patent (as plain text)
     stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     doc_path = temp_dir / "pftaps_wrong.txt"
     doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
 
     # Valid Docling JSON
     test_str = '{"name": ""}'
@@ -291,7 +291,7 @@ def verify_conversion_result_v1(
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
@@ -375,7 +375,7 @@ def verify_conversion_result_v2(
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
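
As promised above, here are the recurring patterns condensed into a before/after sketch. The ruff rule codes are my attribution and the names (items, value, first, chunk) are placeholders; none of this code appears in the commit itself:

    from itertools import islice

    items = ["a", "ab", "abc"]
    value = None

    # E711: compare to None with "is", not "=="
    # before: if value == None: ...
    if value is None:
        pass

    # C419: any()/all() accept a generator directly; the list brackets are redundant
    # before: any([s.startswith("a") for s in items])
    any(s.startswith("a") for s in items)

    # RUF015: don't build a whole list just to take its first element
    # before: first = [s for s in items if len(s) > 1][0]
    first = next(s for s in items if len(s) > 1)

    # RUF005: unpack into the list literal instead of concatenating
    # before: chunk = [first] + list(islice(iter(items), 2))
    chunk = [first, *islice(iter(items), 2)]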