From 35f185f545c4ebe5a5360f9de8ef243f74128f62 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Fri, 21 Mar 2025 21:09:16 +0100
Subject: [PATCH] feat(ocr): add OnnxTR OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
---
 docling/datamodel/pipeline_options.py |  26 ++++
 docling/models/onnxtr_model.py        | 174 ++++++++++++++++++++++++++
 docling/models/plugins/defaults.py    |   2 +
 pyproject.toml                        |   5 +-
 tests/test_e2e_ocr_conversion.py      |   2 +
 5 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 docling/models/onnxtr_model.py

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 654e04df..3bef70da 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -151,6 +151,32 @@ class RapidOcrOptions(OcrOptions):
     )
 
 
+class OnnxtrOcrOptions(OcrOptions):
+    """Options for the OnnxTR OCR engine, read by ``OnnxtrOcrModel``."""
+
+    kind: ClassVar[Literal["onnxtr"]] = "onnxtr"
+
+    lang: List[str] = ["en", "fr"]
+    confidence_score: float = 0.5
+    # Architectures: a value containing exactly one "/" is loaded via OnnxTR `from_hub`
+    det_arch: str = "fast_base"
+    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
+    det_bs: int = 1  # NOTE: Should be 1 because docling seems not to support batch processing yet
+    reco_bs: int = 512
+    auto_correct_orientation: bool = False  # True: predictor straightens pages instead of assuming straight ones
+    preserve_aspect_ratio: bool = True  # forwarded unchanged to `ocr_predictor`
+    symmetric_pad: bool = True  # forwarded unchanged to `ocr_predictor`
+    paragraph_break: float = 0.035  # forwarded unchanged to `ocr_predictor`
+    load_in_8_bit: bool = False  # forwarded unchanged to `ocr_predictor`
+    det_engine_cfg: Dict[str, Any] = {}
+    reco_engine_cfg: Dict[str, Any] = {}
+    clf_engine_cfg: Dict[str, Any] = {}
+
+    model_config = ConfigDict(
+        extra="forbid",  # presumably pydantic: unknown option names are rejected - TODO confirm
+    )
+
+
 class EasyOcrOptions(OcrOptions):
     """Options for the EasyOCR engine."""
 
diff --git a/docling/models/onnxtr_model.py b/docling/models/onnxtr_model.py
new file mode 100644
index 00000000..e1eb4e5d
--- /dev/null
+++ b/docling/models/onnxtr_model.py
@@ -0,0 +1,174 @@
+import logging
+from pathlib import Path
+from typing import Iterable, Optional, Type
+
+import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    OcrOptions,
+    OnnxtrOcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class OnnxtrOcrModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: OnnxtrOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """Build the OnnxTR predictor from ``options``; OnnxTR is imported only when enabled.
+
+        Raises:
+            ImportError: If the model is enabled but the ``onnxtr`` package is missing.
+        """
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: OnnxtrOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            try:
+                from onnxtr.models import ocr_predictor, from_hub  # type: ignore
+            except ImportError as err:
+                raise ImportError(
+                    "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from err
+
+            # Straightening the page already fixes crop orientation, so the crop
+            # orientation step is only disabled in that mode.
+            straighten = self.options.auto_correct_orientation
+            config = {
+                "assume_straight_pages": not straighten,
+                "straighten_pages": straighten,
+                "export_as_straight_boxes": True,
+                "disable_crop_orientation": straighten,
+                "disable_page_orientation": False,
+            }
+
+            det_arch, reco_arch = self.options.det_arch, self.options.reco_arch  # "user/repo" => hub model
+            self.reader = ocr_predictor(
+                det_arch=from_hub(det_arch) if det_arch.count("/") == 1 else det_arch,
+                reco_arch=from_hub(reco_arch) if reco_arch.count("/") == 1 else reco_arch,
+                det_bs=self.options.det_bs,
+                reco_bs=self.options.reco_bs,
+                preserve_aspect_ratio=self.options.preserve_aspect_ratio,
+                symmetric_pad=self.options.symmetric_pad,
+                paragraph_break=self.options.paragraph_break,
+                load_in_8_bit=self.options.load_in_8_bit,
+                **config,
+            )
+
+    def _to_absolute_and_docling_format(self, geom: list[list[float]], img_shape: tuple[int, int]) -> tuple[int, int, int, int]:
+        """
+        Convert a relative box or polygon into an absolute, axis-aligned [x1, y1, x2, y2] box.
+
+        Args:
+            geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
+            img_shape (tuple[int, int]): (height, width) of the image
+
+        Returns:
+            tuple: (x1, y1, x2, y2), rounded to int after dividing out ``self.scale``
+
+        Raises:
+            ValueError: If ``geom`` holds neither 2 nor 4 points.
+        """
+        height, width = img_shape
+        inv_scale = 1 / self.scale
+
+        def to_abs(pt) -> tuple[int, int]:
+            """Project one relative (x, y) point to rounded absolute coordinates."""
+            rel_x, rel_y = pt
+            return int(round(rel_x * width * inv_scale)), int(round(rel_y * height * inv_scale))
+
+        if len(geom) not in (2, 4):
+            raise ValueError(f"Invalid geometry format: {geom}. Expected either 2 or 4 points.")
+
+        corners = [to_abs(pt) for pt in geom]
+        if len(corners) == 2:
+            return (*corners[0], *corners[1])
+        xs, ys = zip(*corners)
+        return min(xs), min(ys), max(xs), max(ys)
+
+
+    def __call__(self, conv_res: ConversionResult, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Run OnnxTR over each page's OCR regions and merge the words into ``page.cells``.
+
+        Word boxes come back relative to the rendered crop, so they are shifted by the OCR
+        rect's top-left corner; words below ``options.confidence_score`` are discarded.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+                continue
+
+            with TimeRecorder(conv_res, "ocr"):
+                ocr_rects = self.get_ocr_rects(page)
+                all_ocr_cells = []
+
+                for ocr_rect in ocr_rects:
+                    if ocr_rect.area() == 0:
+                        continue
+
+                    with page._backend.get_page_image(scale=self.scale, cropbox=ocr_rect) as high_res_image:
+                        im_width, im_height = high_res_image.size
+                        result = self.reader([numpy.array(high_res_image)])
+
+                    for ocr_page in result.pages if result is not None else ():
+                        words = (w for b in ocr_page.blocks for ln in b.lines for w in ln.words)
+                        for ix, word in enumerate(words):
+                            if word.confidence < self.options.confidence_score:
+                                continue
+                            # Geometry is relative to the crop: add the crop origin back
+                            x1, y1, x2, y2 = self._to_absolute_and_docling_format(word.geometry, img_shape=(im_height, im_width))
+                            box = (x1 + ocr_rect.l, y1 + ocr_rect.t, x2 + ocr_rect.l, y2 + ocr_rect.t)
+                            all_ocr_cells.append(
+                                TextCell(
+                                    index=ix,
+                                    text=word.value,
+                                    orig=word.value,
+                                    from_ocr=True,
+                                    confidence=word.confidence,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(box, origin=CoordOrigin.TOPLEFT)
+                                    ),
+                                )
+                            )
+
+                # Post-process the cells
+                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+
+            # DEBUG code:
+            if settings.debug.visualize_ocr:
+                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+            yield page
+
+
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return OnnxtrOcrOptions  # the options class this model's __init__ accepts
diff --git a/docling/models/plugins/defaults.py b/docling/models/plugins/defaults.py
index 00873579..1775e23a 100644
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@@ -3,6 +3,7 @@ from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
+from docling.models.onnxtr_model import OnnxtrOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 
@@ -13,6 +14,7 @@ def ocr_engines():
             EasyOcrModel,
             OcrMacModel,
             RapidOcrModel,
+            OnnxtrOcrModel,
             TesseractOcrModel,
             TesseractOcrCliModel,
         ]
diff --git a/pyproject.toml b/pyproject.toml
index dd48a9d2..5dbbe74b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,11 +72,12 @@ openpyxl = "^3.1.5"
 lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
+onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
 onnxruntime = [
   # 1.19.2 is the last version with python3.9 support,
   # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
-  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
-  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
+  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
+  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
 ]
 
 transformers = [
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 985a6250..c110906a 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
     OcrOptions,
     PdfPipelineOptions,
     RapidOcrOptions,
+    OnnxtrOcrOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -62,6 +63,7 @@ def test_e2e_conversions():
         TesseractOcrOptions(),
         TesseractCliOcrOptions(),
         EasyOcrOptions(force_full_page_ocr=True),
+        OnnxtrOcrOptions(force_full_page_ocr=True),
         TesseractOcrOptions(force_full_page_ocr=True),
         TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
         TesseractCliOcrOptions(force_full_page_ocr=True),