style & quality applied

Signed-off-by: felix <felixdittrich92@gmail.com>
2025-07-29 21:44:32 +00:00 · 2025-03-21 21:49:50 +01:00 · 2025-03-21 21:49:50 +01:00 · a19cf81f98
commit a19cf81f98
parent 7c87467ea5
7 changed files with 1509 additions and 47 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -161,18 +161,12 @@ class OnnxtrOcrOptions(OcrOptions):

    det_arch: str = "fast_base"
    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
-    det_bs: int = (
-        1  # NOTE: Should be 1 because docling seems not to support batch processing yet
-    )
    reco_bs: int = 512
    auto_correct_orientation: bool = False
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    paragraph_break: float = 0.035
    load_in_8_bit: bool = False
-    det_engine_cfg: Dict[str, Any] = {}
-    reco_engine_cfg: Dict[str, Any] = {}
-    clf_engine_cfg: Dict[str, Any] = {}

    model_config = ConfigDict(
        extra="forbid",
--- a/docling/models/onnxtr_model.py
+++ b/docling/models/onnxtr_model.py
@ -1,8 +1,10 @@
 import logging
+import os
 from pathlib import Path
 from typing import Iterable, Optional, Type

 import numpy
+import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell

@ -42,7 +44,15 @@ class OnnxtrOcrModel(BaseOcrModel):

        if self.enabled:
            try:
-                from onnxtr.models import ocr_predictor, EngineConfig, from_hub  # type: ignore
+                from onnxtr.models import (  # type: ignore
+                    EngineConfig,
+                    from_hub,
+                    ocr_predictor,
+                )
+
+                # We disable multiprocessing for OnnxTR,
+                # because the speedup is minimal and it can cause memory leaks on Windows
+                os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
            except ImportError:
                raise ImportError(
                    "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
@ -62,6 +72,7 @@ class OnnxtrOcrModel(BaseOcrModel):
                config = {
                    "assume_straight_pages": True,
                    "straighten_pages": False,
+                    # This should be disabled when docling supports polygons
                    "export_as_straight_boxes": True,
                    "disable_crop_orientation": False,
                    "disable_page_orientation": False,
@ -78,15 +89,22 @@ class OnnxtrOcrModel(BaseOcrModel):
                    if self.options.reco_arch.count("/") == 1
                    else self.options.reco_arch
                ),
+                det_bs=1,  # NOTE: Should always be 1, because docling handles batching
                preserve_aspect_ratio=self.options.preserve_aspect_ratio,
                symmetric_pad=self.options.symmetric_pad,
                paragraph_break=self.options.paragraph_break,
                load_in_8_bit=self.options.load_in_8_bit,
                **config,
+                # TODO: Allow specification of the engine configs in the options
+                det_engine_cfg=None,
+                reco_engine_cfg=None,
+                clf_engine_cfg=None,
            )

    def _to_absolute_and_docling_format(
-        self, geom: list[list[float]], img_shape: tuple[int, int]
+        self,
+        geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
+        img_shape: tuple[int, int],
    ) -> tuple[int, int, int, int]:
        """
        Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
@ -109,14 +127,11 @@ class OnnxtrOcrModel(BaseOcrModel):
            (xmin, ymin), (xmax, ymax) = geom
            x1, y1 = scale_point(xmin, ymin)
            x2, y2 = scale_point(xmax, ymax)
-        elif len(geom) == 4:
+        # 4-Point polygon
+        else:
            abs_points = [scale_point(*point) for point in geom]
            x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
            x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
-        else:
-            raise ValueError(
-                f"Invalid geometry format: {geom}. Expected either 2 or 4 points."
-            )

        return x1, y1, x2, y2

--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@ -1,9 +1,9 @@
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.ocr_mac_model import OcrMacModel
+from docling.models.onnxtr_model import OnnxtrOcrModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
-from docling.models.onnxtr_model import OnnxtrOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel

--- a/docs/examples/onnxtr_with_custom_models.py
+++ b/docs/examples/onnxtr_with_custom_models.py
@ -0,0 +1,40 @@
+from docling.datamodel.pipeline_options import OnnxtrOcrOptions, PdfPipelineOptions
+from docling.document_converter import (
+    ConversionResult,
+    DocumentConverter,
+    InputFormat,
+    PdfFormatOption,
+)
+
+
+def main():
+    # Source document to convert
+    source = "https://arxiv.org/pdf/2408.09869v4"
+
+    ocr_options = OnnxtrOcrOptions(
+        det_arch="db_mobilenet_v3_large",
+        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",  # Model will be downloaded from Hugging Face Hub
+        auto_correct_orientation=True,  # This can be used to correct the orientation of the pages
+    )
+
+    pipeline_options = PdfPipelineOptions(
+        ocr_options=ocr_options,
+    )
+
+    # Convert the document
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            ),
+        },
+    )
+
+    conversion_result: ConversionResult = converter.convert(source=source)
+    doc = conversion_result.document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -44,7 +44,7 @@ packages = [{ include = "docling" }]
 ######################
 # actual dependencies:
 ######################
-python = "^3.9"
+python = "^3.10"
 pydantic = "^2.0.0"
 docling-core = {extras = ["chunking"], version = "^2.24.1"}
 docling-ibm-models = "^3.4.0"
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
 lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
-onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
+onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
 onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -11,9 +11,9 @@ from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    OcrOptions,
+    OnnxtrOcrOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
-    OnnxtrOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )