feat(ocr): Add OnnxTR as possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
2025-07-30 14:04:27 +00:00 · 2025-03-22 13:28:17 +01:00 · 2025-03-22 13:28:17 +01:00 · e4ab4ce576
commit e4ab4ce576
parent 268fa98821
3 changed files with 22 additions and 15 deletions
--- a/docling/models/onnxtr_model.py
+++ b/docling/models/onnxtr_model.py
@ -1,7 +1,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Iterable, Optional, Tuple, Type, Union

 import numpy
 import numpy as np
@ -11,14 +11,12 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
    AcceleratorOptions,
    OcrOptions,
    OnnxtrOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@ -104,25 +102,25 @@ class OnnxtrOcrModel(BaseOcrModel):
                clf_engine_cfg=engine_cfg,
            )

-    def _to_absolute_and_docling_format(
+    def _to_absolute_docling_format(
        self,
-        geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
-        img_shape: tuple[int, int],
-    ) -> tuple[int, int, int, int]:
+        geom: Union[Tuple[Tuple[float, float], Tuple[float, float]], np.ndarray],
+        img_shape: Tuple[int, int],
+    ) -> Tuple[int, int, int, int]:
        """
        Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.

        Args:
-            geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
-            img_shape (tuple[int, int]): (height, width) of the image
+            geom: Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
+            img_shape: (height, width) of the image

        Returns:
-            tuple: (x1, y1, x2, y2)
+            top-left and bottom-right coordinates in absolute format (x1, y1, x2, y2)
        """
        h, w = img_shape
        scale_inv = 1 / self.scale  # Precompute inverse for efficiency

-        def scale_point(x: float, y: float) -> tuple[int, int]:
+        def scale_point(x: float, y: float) -> Tuple[int, int]:
            """Scale and round a point to absolute coordinates."""
            return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))

@ -187,7 +185,7 @@ class OnnxtrOcrModel(BaseOcrModel):
                                            confidence=word.confidence,
                                            rect=BoundingRectangle.from_bounding_box(
                                                BoundingBox.from_tuple(
-                                                    self._to_absolute_and_docling_format(
+                                                    self._to_absolute_docling_format(
                                                        word.geometry,
                                                        img_shape=(im_height, im_width),
                                                    ),
--- a/docs/examples/onnxtr_with_custom_models.py
+++ b/docs/examples/onnxtr_with_custom_models.py
@ -11,10 +11,19 @@ def main():
    # Source document to convert
    source = "https://arxiv.org/pdf/2408.09869v4"

+    # Available detection & recognition models can be found at
+    # https://github.com/felixdittrich92/OnnxTR
+
+    # Alternatively, choose a model from the Hugging Face Hub
+    # Collection: https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
+
    ocr_options = OnnxtrOcrOptions(
+        # Text detection model
        det_arch="db_mobilenet_v3_large",
-        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",  # Model will be downloaded from Hugging Face Hub
-        auto_correct_orientation=False,  # This can be set to `True` to auto-correct the orientation of the pages
+        # Text recognition model - from Hugging Face Hub
+        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
+        # This can be set to `True` to auto-correct the orientation of the pages
+        auto_correct_orientation=False,
    )

    pipeline_options = PdfPipelineOptions(
--- a/pyproject.toml
+++ b/pyproject.toml
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
 lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
-onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
+onnxtr = { extras= ["gpu"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
 onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0