Mirror of https://github.com/DS4SD/docling.git, synced 2025-07-27 04:24:45 +00:00

commit 73cec158c6 (parent d74e407526)

    apply ruff lint fixes

    Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
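The diff below is long but mechanical: nearly every hunk applies one of a small set of automatic ruff fixes. As a hedged summary, here is a condensed sketch of the recurring before/after patterns (the function and class names are invented for illustration and do not appear in docling):

    # Condensed sketch of the lint-fix patterns applied throughout this commit.
    from collections.abc import Iterable  # was: from typing import Iterable
    from functools import lru_cache


    @lru_cache  # was: @lru_cache()
    def load_lines(path: str) -> tuple[str, ...]:
        with open(path, encoding="utf-8") as f:  # was: open(path, "r", encoding="utf-8")
            return tuple(f.readlines())


    class Sniffer:  # was: class Sniffer(object)
        def check(self, delimiter: str, allowed: Iterable[str]) -> None:
            if delimiter not in set(allowed):  # was: if not delimiter in ...
                # f"..." without placeholders becomes "...", and
                # "{}".format(x) / f"{repr(x)}" become f"{x}" / f"{x!r}"
                raise RuntimeError(f"unknown delimiter {delimiter!r}")

        def report(self, n: int) -> None:
            for i in range(n):  # was: range(0, n)
                print(f"row {i}")

In the hunks that follow, lines prefixed with "-" are removed and lines prefixed with "+" are added; unprefixed lines are unchanged context.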
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path):
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
+ with open(self.path_or_stream, encoding="utf-8") as f:
self.lines = f.readlines()
self.valid = True

@@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

return doc

- def _parse(self, doc: DoclingDocument):
+ def _parse(self, doc: DoclingDocument): # noqa: C901
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.

@@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}

- for i in range(0, 10):
+ for i in range(10):
parents[i] = None
indents[i] = None

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
- if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+ if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)
@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import Iterable, List, Optional, Union
+ from typing import List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size

@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,14 +1,14 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
- from PIL import Image, ImageDraw
+ from PIL import Image
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
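The three hunks above are the same one-line migration repeated across the PDF backends: `Iterable` moves from `typing` to `collections.abc`, since the `typing` aliases for container ABCs are deprecated since Python 3.9. A minimal sketch of the pattern, with an invented function:

    from collections.abc import Iterable  # preferred home of the ABC since Python 3.9
    from typing import Optional


    def count_cells(cells: Iterable[str], limit: Optional[int] = None) -> int:
        total = sum(1 for _ in cells)
        return total if limit is None else min(total, limit)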
@@ -1,12 +1,8 @@
- # -*- coding: utf-8 -*-
-
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025
"""

- from __future__ import unicode_literals
-
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""

@@ -79,7 +75,6 @@ CHR_BO = {
}

T = {
"\u2192": "\\rightarrow ",
# Greek letters
"\U0001d6fc": "\\alpha ",
"\U0001d6fd": "\\beta ",

@@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
return default


- class Tag2Method(object):
+ class Tag2Method:
def call_method(self, elm, stag=None):
getmethod = self.tag2meth.get
if stag is None:

@@ -157,7 +157,7 @@ class Pr(Tag2Method):
def do_common(self, elm):
stag = elm.tag.replace(OMML_NS, "")
if stag in self.__val_tags:
- t = elm.get("{0}val".format(OMML_NS))
+ t = elm.get(f"{OMML_NS}val")
self.__innerdict[stag] = t
return None

@@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
"""
the Pre-Sub-Superscript object -- Not support yet
"""
- pass

def do_sub(self, elm):
text = self.process_children(elm)

@@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
- raise NotSupport("Not support lim %s" % t_dict["e"])
+ raise RuntimeError("Not support lim %s" % t_dict["e"])
else:
return latex_s.format(lim=t_dict.get("lim"))

@@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
"""
_str = []
_base_str = []
- found_text = elm.findtext("./{0}t".format(OMML_NS))
+ found_text = elm.findtext(f"./{OMML_NS}t")
if found_text:
for s in found_text:
out_latex_str = self.process_unicode(s)
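The `latex_dict`/`oMath2Latex` hunks combine several of the rewrites: dropping a Python-2 coding cookie and `from __future__ import unicode_literals`, replacing `"{0}...".format(ns)` calls with f-strings, and (in later hunks) swapping `{repr(x)}`/`{str(x)}` inside f-strings for the `!r`/`!s` conversion flags. The `.format()`-to-f-string change is purely cosmetic; a small self-contained illustration (the namespace value is a stand-in, not the real OMML namespace):

    OMML_NS = "{http://example/ns}"  # stand-in value for illustration

    tag_old = "{0}val".format(OMML_NS)  # before
    tag_new = f"{OMML_NS}val"           # after
    assert tag_old == tag_new

    value = "x"
    assert f"got {repr(value)}" == f"got {value!r}" == "got 'x'"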
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
- for i in range(0, self.max_levels):
+ for i in range(self.max_levels):
self.parents[i] = None

try:

@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
- f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+ f"Error processing child from tag {tag.name}: {exc_child!r}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(

@@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
- marker = f"{str(index_in_list)}."
+ marker = f"{index_in_list!s}."
enumerated = True
doc.add_list_item(
text=text,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# otherwise they represent emphasis (bold or italic)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
+ with open(self.path_or_stream, encoding="utf-8") as f:
md_content = f.read()
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.

@@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
- label=label, name=f"list", parent=parent_item
+ label=label, name="list", parent=parent_item
)

elif (

@@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
- _log.debug("HTML Block: {}".format(element))
+ _log.debug(f"HTML Block: {element}")
if (
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it

@@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
if not isinstance(element, str):
self._close_table(doc)
- _log.debug("Some other element: {}".format(element))
+ _log.debug(f"Some other element: {element}")

processed_block_types = (
marko.block.Heading,
@@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

return prov

- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0

@@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
- label=list_label, name=f"list", parent=parent_slide
+ label=list_label, name="list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(

@@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

max_levels = 10
parents = {}  # type: ignore
- for i in range(0, max_levels):
+ for i in range(max_levels):
parents[i] = None

# Loop through each slide
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
- _log.debug(f" skipped since repeated content")
+ _log.debug(" skipped since repeated content")
col_idx += cell.grid_span
continue
else:

@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
- except (UnidentifiedImageError, OSError) as e:
+ except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[level - 1],
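The MsWord hunks above include another recurring cleanup: `except`-clause bindings (`as e`) are dropped when the handler never uses the exception object. A hedged sketch of the pattern; `load_image` is invented, not docling's API:

    from io import BytesIO

    from PIL import Image, UnidentifiedImageError


    def load_image(data: bytes):
        try:
            return Image.open(BytesIO(data))
        except (UnidentifiedImageError, OSError):  # was: `... as e:` with `e` unused
            # A handler that needs details can re-add the binding or log exc_info.
            return None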
@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import Iterable, Optional, Set, Union
+ from typing import Optional, Set, Union

from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell

@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
self.valid = True  # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
- except PdfiumError as e:
+ except PdfiumError:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True,
@@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

return

- def _parse_element_citation(self, node: etree._Element) -> str:
+ def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
citation: Citation = {
"author_names": "",
"title": "",

@@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
- "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+ "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
)

# Flatten the citation to string

@@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

try:
self._add_table(doc, parent, table)
- except Exception as e:
- _log.warning(f"Skipping unsupported table in {str(self.file)}")
- pass
+ except Exception:
+ _log.warning(f"Skipping unsupported table in {self.file!s}")

return
@@ -162,7 +162,6 @@ class PatentUspto(ABC):
Returns:
The patent parsed as a docling document.
"""
- pass


class PatentUsptoIce(PatentUspto):

@@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
if tag in [member.value for member in self.Element]:
if (
tag == self.Element.HEADING.value
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
):
level_attr: str = attributes.get("LVL", "")
new_level: int = int(level_attr) if level_attr.isnumeric() else 1

@@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
# headers except claims statement
elif (
self.Element.HEADING.value in self.property
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
and text.strip()
):
self.parents[self.level + 1] = self.doc.add_heading(

@@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -1526,7 +1525,7 @@ class XmlTable:

return ncols_max

- def _parse_table(self, table: Tag) -> TableData:
+ def _parse_table(self, table: Tag) -> TableData: # noqa: C901
"""Parse the content of a table tag.

Args:

@@ -1721,7 +1720,7 @@ class HtmlEntity:
"0": "⁰",
"+": "⁺",
"-": "⁻",
- "−": "⁻",
+ "−": "⁻", # noqa: RUF001
"=": "⁼",
"(": "⁽",
")": "⁾",

@@ -1745,7 +1744,7 @@ class HtmlEntity:
"0": "₀",
"+": "₊",
"-": "₋",
- "−": "₋",
+ "−": "₋", # noqa: RUF001
"=": "₌",
"(": "₍",
")": "₎",
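The `HtmlEntity` tables and the JATS citation hunk keep intentionally non-ASCII characters, the true minus sign `−` (U+2212) and the en dash `–`, and silence ruff's ambiguous-character warning with `# noqa: RUF001` instead of "correcting" the data, because those characters genuinely occur in patent and journal sources and must map differently from ASCII `-`. A tiny sketch of why the lookalikes have to stay distinct keys:

    # U+2212 (minus sign) and ASCII hyphen-minus are different keys that happen
    # to map to the same superscript character here.
    SUPERSCRIPT = {
        "-": "⁻",
        "−": "⁻",  # noqa: RUF001  (intentional: real minus sign from source text)
    }

    assert "−" != "-"
    assert SUPERSCRIPT["−"] == SUPERSCRIPT["-"] == "⁻"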
@@ -6,8 +6,9 @@ import sys
import tempfile
import time
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Annotated, Dict, Iterable, List, Optional, Type
+ from typing import Annotated, Dict, List, Optional, Type

import rich.table
import typer

@@ -288,7 +289,7 @@ def convert(
...,
help=(
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
- f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+ f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
f"Use the option --show-external-plugins to see the options allowed with external plugins."
),
),

@@ -62,7 +62,7 @@ def download(
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
- help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+ help="Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
@@ -10,7 +10,7 @@ from docling_core.types.doc import (
TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
- from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+ from docling_core.types.io import (
DocumentStream,
)
from PIL.Image import Image

@@ -243,7 +243,7 @@ class Page(BaseModel):
if self._backend is None:
return self._image_cache.get(scale, None)

- if not scale in self._image_cache:
+ if scale not in self._image_cache:
if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
else:
@@ -1,13 +1,13 @@
import csv
import logging
import re
+ from collections.abc import Iterable
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import (
TYPE_CHECKING,
Dict,
- Iterable,
List,
Literal,
Optional,

@@ -18,31 +18,9 @@ from typing import (

import filetype
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import (
BaseText,
Figure,
GlmTableCell,
PageDimensions,
PageReference,
Prov,
Ref,
)
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
from docling_core.types.legacy_doc.base import TableCell
from docling_core.types.legacy_doc.document import (
CCSDocumentDescription as DsDocumentDescription,
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel

@@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
- from docling.utils.utils import create_file_hash, create_hash
+ from docling.utils.utils import create_file_hash

if TYPE_CHECKING:
from docling.document_converter import FormatOption
@@ -1,11 +1,11 @@
import hashlib
import logging
import math
import sys
import time
+ from collections.abc import Iterable, Iterator
from functools import partial
from pathlib import Path
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+ from typing import Dict, List, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -254,7 +254,7 @@ class DocumentConverter:

if not had_result and raises_on_error:
raise ConversionError(
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+ "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
)

def _convert(

@@ -266,7 +266,7 @@ class DocumentConverter:
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size,  # pass format_options
):
- _log.info(f"Going to convert document batch...")
+ _log.info("Going to convert document batch...")

# parallel processing only within input_batch
# with ThreadPoolExecutor(
@@ -1,4 +1,4 @@
- from typing import Iterable
+ from collections.abc import Iterable

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult

@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
- from typing import Any, Generic, Iterable, Optional, Protocol, Type
+ from collections.abc import Iterable
+ from typing import Generic, Optional, Protocol, Type

from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar

@@ -1,12 +1,12 @@
import copy
import logging
from abc import abstractmethod
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional, Type
+ from typing import List, Optional, Type

import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
import re
from collections import Counter
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Literal, Optional, Tuple, Union
+ from typing import List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Literal, Optional, Tuple, Union
+ from typing import List, Literal, Optional, Union

import numpy as np
from docling_core.types.doc import (

@@ -1,8 +1,9 @@
import logging
import warnings
import zipfile
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional, Type
+ from typing import List, Optional, Type

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
progress: bool = False,
) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
- from easyocr.config import detection_models as det_models_dict
- from easyocr.config import recognition_models as rec_models_dict
+ from easyocr.config import (
+ detection_models as det_models_dict,
+ recognition_models as rec_models_dict,
+ )

if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
logger = logging.getLogger(__name__)


- @lru_cache()
+ @lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)

@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
return factory


- @lru_cache()
+ @lru_cache
def get_picture_description_factory(
allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
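The factory hunks use the fact that since Python 3.8 `functools.lru_cache` works directly as a bare decorator, so `@lru_cache()` and `@lru_cache` are equivalent when no `maxsize`/`typed` arguments are passed. A minimal sketch (the dict stands in for docling's plugin factories):

    from functools import lru_cache


    @lru_cache  # equivalent to @lru_cache() with default arguments
    def get_factory() -> dict:
        return {"plugins": []}


    assert get_factory() is get_factory()  # one cached instance for every call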
@@ -1,18 +1,16 @@
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Optional

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)

@@ -1,16 +1,15 @@
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Optional

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

@@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
device = decide_device(accelerator_options.device)
self.device = device

- _log.debug("Available device for HuggingFace VLM: {}".format(device))
+ _log.debug(f"Available device for HuggingFace VLM: {device}")

repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -1,8 +1,9 @@
import copy
import logging
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Union
+ from typing import Optional

from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

@@ -1,8 +1,9 @@
import logging
import sys
import tempfile
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Tuple, Type
+ from typing import Optional, Type

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):

if self.enabled:
if "darwin" != sys.platform:
- raise RuntimeError(f"OcrMac is only supported on Mac.")
+ raise RuntimeError("OcrMac is only supported on Mac.")
install_errmsg = (
"ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. "

@@ -1,6 +1,7 @@
import logging
import re
- from typing import Iterable, List
+ from collections.abc import Iterable
+ from typing import List

from pydantic import BaseModel

@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines)

# Text normalization
- sanitized_text = sanitized_text.replace("⁄", "/")
- sanitized_text = sanitized_text.replace("’", "'")
- sanitized_text = sanitized_text.replace("‘", "'")
+ sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+ sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+ sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
sanitized_text = sanitized_text.replace("“", '"')
sanitized_text = sanitized_text.replace("”", '"')
sanitized_text = sanitized_text.replace("•", "·")
@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional
+ from typing import Optional

from PIL import ImageDraw
from pydantic import BaseModel

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type, Union
+ from typing import Optional, Type, Union

from PIL import Image

@@ -1,12 +1,11 @@
import logging
from abc import abstractmethod
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Any, Iterable, List, Optional, Type, Union
+ from typing import List, Optional, Type, Union

from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureItem,
)
from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type, Union
+ from typing import Optional, Type, Union

from PIL import Image

@@ -1,6 +1,7 @@
import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type
+ from typing import Optional, Type

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -1,12 +1,7 @@
import copy
import random
from pathlib import Path
from typing import Dict, List

from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,

@@ -17,13 +12,10 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
- from docling_ibm_models.reading_order.reading_order_rb import (
- PageElement as ReadingOrderPageElement,
- ReadingOrderPredictor,
- )
+ from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict

from docling.datamodel.base_models import (

@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -1,13 +1,13 @@
import copy
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Union
+ from typing import Optional

import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCellUnit,
)
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

@@ -3,9 +3,10 @@ import io
import logging
import os
import tempfile
+ from collections.abc import Iterable
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
- from typing import Iterable, List, Optional, Tuple, Type
+ from typing import List, Optional, Tuple, Type

import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,6 +1,7 @@
import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type
+ from typing import Optional, Type

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -3,9 +3,10 @@ import logging
import time
import traceback
from abc import ABC, abstractmethod
- from typing import Any, Callable, Iterable, List
+ from collections.abc import Iterable
+ from typing import Any, Callable, List

- from docling_core.types.doc import DoclingDocument, NodeItem
+ from docling_core.types.doc import NodeItem

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend

@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
- for i in range(0, conv_res.input.page_count):
+ for i in range(conv_res.input.page_count):
start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
@@ -1,5 +1,4 @@
import logging
import sys
import warnings
from pathlib import Path
from typing import Optional, cast

@@ -1,5 +1,4 @@
import logging
import warnings
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast

@@ -1,8 +1,8 @@
import logging
- from typing import Any, Dict, Iterable, List, Tuple, Union
+ from collections.abc import Iterable
+ from typing import Any, Dict, List, Tuple, Union

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

from docling.datamodel.document import ConversionResult, Page

@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects


- def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
origin = DocumentOrigin(
mimetype="application/pdf",
filename=doc_glm["file-info"]["filename"],

@@ -18,7 +18,7 @@ class UnionFind:

def __init__(self, elements):
self.parent = {elem: elem for elem in elements}
- self.rank = {elem: 0 for elem in elements}
+ self.rank = dict.fromkeys(elements, 0)

def find(self, x):
if self.parent[x] != x:
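The `UnionFind` hunk replaces a dict comprehension whose value does not depend on the key with `dict.fromkeys`. The two spellings are equivalent for an immutable value like `0`; with a mutable value, `fromkeys` would share a single object across all keys, which is why the rewrite is only safe here. Sketch:

    elements = ["a", "b", "c"]

    assert {elem: 0 for elem in elements} == dict.fromkeys(elements, 0)

    # The caveat: fromkeys shares one object when the value is mutable.
    shared = dict.fromkeys(elements, [])
    shared["a"].append(1)
    assert shared["b"] == [1]  # every key points at the same list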
@@ -37,7 +37,7 @@ def download_models(
output_dir.mkdir(exist_ok=True, parents=True)

if with_layout:
- _log.info(f"Downloading layout model...")
+ _log.info("Downloading layout model...")
LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder,
force=force,

@@ -45,7 +45,7 @@ def download_models(
)

if with_tableformer:
- _log.info(f"Downloading tableformer model...")
+ _log.info("Downloading tableformer model...")
TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force,

@@ -53,7 +53,7 @@ def download_models(
)

if with_picture_classifier:
- _log.info(f"Downloading picture classifier model...")
+ _log.info("Downloading picture classifier model...")
DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force,

@@ -61,7 +61,7 @@ def download_models(
)

if with_code_formula:
- _log.info(f"Downloading code formula model...")
+ _log.info("Downloading code formula model...")
CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force,

@@ -69,7 +69,7 @@ def download_models(
)

if with_smolvlm:
- _log.info(f"Downloading SmolVlm model...")
+ _log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,

@@ -78,7 +78,7 @@ def download_models(
)

if with_granite_vision:
- _log.info(f"Downloading Granite Vision model...")
+ _log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,

@@ -87,7 +87,7 @@ def download_models(
)

if with_easyocr:
- _log.info(f"Downloading easyocr models...")
+ _log.info("Downloading easyocr models...")
EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force,
@@ -383,7 +383,7 @@
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
- "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+ "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n",
" if res:\n",

@@ -1,8 +1,8 @@
import json
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable

import yaml
from docling_core.types.doc import ImageRefMode

@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)
@@ -3,7 +3,6 @@ import logging
import time
from pathlib import Path

- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,

@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
- from docling.models.ocr_mac_model import OcrMacOptions
- from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
- from docling.models.tesseract_ocr_model import TesseractOcrOptions

_log = logging.getLogger(__name__)
@@ -3,8 +3,8 @@
# It does not run the actual formula understanding model.

import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable

from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

@@ -3,8 +3,9 @@
# It does not run the actual picture classifier model.

import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Any, Iterable
+ from typing import Any

from docling_core.types.doc import (
DoclingDocument,

@@ -4,7 +4,7 @@ from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

- from docling.datamodel.base_models import FigureElement, InputFormat, Table
+ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -1,14 +1,9 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -153,10 +153,10 @@
"source": [
"for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n",
- " print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+ " print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n",
" enriched_text = chunker.serialize(chunk=chunk)\n",
- " print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+ " print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n",
" print()"
]

@@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
- " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+ " print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
- " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+ " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n",
" print()"
]
@@ -2,17 +2,11 @@ import json
import time
from pathlib import Path

import yaml

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
- print("Processing... {}".format(source))
+ print(f"Processing... {source}")
print("================================================")
print("")

@@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text)

res.document.save_as_html(
- filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+ filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)
@@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
- " f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+ " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",

@@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
- " f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+ " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",

@@ -351,7 +351,7 @@
"for source in sources:\n",
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
- " print(f\"- text: {repr(doc_chunk.text)}\")\n",
+ " print(f\"- text: {doc_chunk.text!r}\")\n",
" if doc_chunk.meta.origin:\n",
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
" if doc_chunk.meta.headings:\n",
@@ -119,7 +119,7 @@
" device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n",
"else:\n",
- " raise EnvironmentError(\n",
+ " raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )"
]

@@ -226,7 +226,6 @@
}
],
"source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",

@@ -345,7 +344,7 @@
"\n",
" openai_api_key = os.getenv(openai_api_key_var)\n",
" if not openai_api_key:\n",
- " raise EnvironmentError(\n",
+ " raise OSError(\n",
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
" \"Please define it before running this script.\"\n",
" )"

@@ -387,7 +386,6 @@
"outputs": [],
"source": [
"import weaviate.classes.config as wc\n",
"from weaviate.classes.config import DataType, Property\n",
"\n",
"# Define the collection name\n",
"collection_name = \"docling\"\n",
@@ -25,7 +25,7 @@ def main():
document = mdb.convert()

out_path = Path("scratch")
- print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}")
+ print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

# Export Docling document format to markdowndoc:
fn = os.path.basename(path)

@@ -1,13 +1,10 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -63,7 +63,7 @@ def main():
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
- f"\nSaved markdown output to: {str(out_path)}"
+ f"\nSaved markdown output to: {out_path!s}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:
@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -2,9 +2,9 @@ import logging
import time
from pathlib import Path

- from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+ from docling_core.types.doc import ImageRefMode, TableItem, TextItem

- from docling.datamodel.base_models import FigureElement, InputFormat, Table
+ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
- granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -202,12 +202,16 @@ select = [
]

ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
]

#extend-select = []

@@ -217,7 +221,7 @@ ignore = [
"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests

[tool.ruff.lint.mccabe]
- max-complexity = 15
+ max-complexity = 20

# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
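Note how the configuration and the code changes interlock: the pyproject hunk raises the McCabe complexity ceiling to 20, and the earlier hunks add `# noqa: C901` to the handful of parser functions that still exceed it, rather than refactoring them in a lint-only commit. As a hedged sketch of how such a scoped suppression reads (the function is invented and deliberately trivial):

    def dispatch(token: str) -> int:  # noqa: C901 - suppression scoped to this function
        # In docling this sits on long parser functions (e.g. _parse,
        # handle_text_elements, _parse_table) whose branching exceeds the limit.
        if token == "a":
            return 1
        elif token == "b":
            return 2
        return 0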
@@ -37,7 +37,7 @@ def test_asciidocs_examples():
print("\n\n", pred_mddoc)

if os.path.exists(gname):
- with open(gname, "r") as fr:
+ with open(gname) as fr:
true_mddoc = fr.read()

# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"

@@ -1,5 +1,3 @@
- import json
- import os
from pathlib import Path

from pytest import warns

@@ -16,7 +14,7 @@ GENERATE = GEN_TEST_DATA

def get_csv_paths():
# Define the directory you want to search
- directory = Path(f"./tests/data/csv/")
+ directory = Path("./tests/data/csv/")

# List all CSV files in the directory and its subdirectories
return sorted(directory.rglob("*.csv"))
@@ -32,7 +32,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA


def get_pubmed_paths():
- directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+ directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
xml_files = sorted(directory.rglob("*.xml"))
return xml_files

@@ -1,4 +1,3 @@
- import os
from pathlib import Path

from docling.backend.msword_backend import MsWordDocumentBackend
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"An interleaver receives incoming data frames of size N. The interleaver "
- "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+ "indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
"then effectively rearranges (permutes) the data by permuting the rows of the "
- "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+ "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
"equal to N₂, βj is a constant which may be different for each row, and each "
- "αj is a relative prime number relative to P. After permuting, the "
+ "αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
"interleaver outputs the data in a different order than received (e.g., "
"receives sequentially row by row, outputs sequentially each column by column)."
)

@@ -32,7 +32,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -1,4 +1,3 @@
- import os
from pathlib import Path

from docling.datamodel.base_models import InputFormat

@@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -2,7 +2,6 @@ from pathlib import Path

from docling_core.types.doc import PictureClassificationData

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -1,7 +1,6 @@
from pathlib import Path

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions

@@ -3,7 +3,6 @@ from pathlib import Path
from typing import List

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (

@@ -4,7 +4,6 @@ from pathlib import Path

import pytest

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -3,8 +3,6 @@ from pathlib import Path

import pytest

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
- from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -216,7 +216,7 @@ def verify_picture_image_v2(


def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
- assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match."
+ assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."

assert len(doc_true.tables) == len(doc_pred.tables), (
"document has different count of tables than expected."

@@ -230,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
assert isinstance(pred_item, DocItem), "Test item is not a DocItem"

# Validate type
- assert true_item.label == pred_item.label, f"Object label does not match."
+ assert true_item.label == pred_item.label, "Object label does not match."

# Validate provenance
assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"

@@ -337,16 +337,16 @@ def verify_conversion_result_v1(
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else:  # default branch in test
- with open(pages_path, "r") as fr:
+ with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read())

- with open(json_path, "r") as fr:
+ with open(json_path) as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

- with open(md_path, "r") as fr:
+ with open(md_path) as fr:
doc_true_md = fr.read()

- with open(dt_path, "r") as fr:
+ with open(dt_path) as fr:
doc_true_dt = fr.read()

if not fuzzy:

@@ -419,16 +419,16 @@ def verify_conversion_result_v2(
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else:  # default branch in test
- with open(pages_path, "r") as fr:
+ with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read())

- with open(json_path, "r") as fr:
+ with open(json_path) as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())

- with open(md_path, "r") as fr:
+ with open(md_path) as fr:
doc_true_md = fr.read()

- with open(dt_path, "r") as fr:
+ with open(dt_path) as fr:
doc_true_dt = fr.read()

if not fuzzy: