From cfc42458ae645af83679e6a68e818bfb2cd313e0 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Fri, 21 Mar 2025 22:17:51 +0100
Subject: [PATCH] [Feature] Add OnnxTR as possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
---
 docling/datamodel/pipeline_options.py      | 11 ++++-
 docling/models/onnxtr_model.py             | 56 ++++++++++++----------
 docs/examples/onnxtr_with_custom_models.py |  2 +-
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d610ada3..5d809c58 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -157,16 +157,23 @@ class OnnxtrOcrOptions(OcrOptions):
     kind: ClassVar[Literal["onnxtr"]] = "onnxtr"
 
     lang: List[str] = ["en", "fr"]
-    confidence_score: float = 0.5
+    # word confidence threshold for the recognition model
+    confidence_score: float = 0.7
+    # objectness score threshold for the detection model ('fast algorithm')
+    objectness_score: float = 0.3
 
+    # NOTE: The architectures below can also be Hugging Face Hub models
     det_arch: str = "fast_base"
-    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
+    reco_arch: str = "crnn_vgg16_bn"
     reco_bs: int = 512
     auto_correct_orientation: bool = False
     preserve_aspect_ratio: bool = True
     symmetric_pad: bool = True
     paragraph_break: float = 0.035
     load_in_8_bit: bool = False
+    # Ref.: https://onnxruntime.ai/docs/api/python/api_summary.html
+    providers: list[tuple[str, dict[str, Any]]] | list[str] | None = None
+    session_options: Any = None
 
     model_config = ConfigDict(
         extra="forbid",
diff --git a/docling/models/onnxtr_model.py b/docling/models/onnxtr_model.py
index 11a899cd..2423752f 100644
--- a/docling/models/onnxtr_model.py
+++ b/docling/models/onnxtr_model.py
@@ -50,7 +50,7 @@ class OnnxtrOcrModel(BaseOcrModel):
                     ocr_predictor,
                 )
 
-                # We diable multiprocessing for OnnxTR,
+                # We disable multiprocessing for OnnxTR,
                 # because the speed up is minimal and it can raise memory leaks on windows
                 os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
             except ImportError:
@@ -72,12 +72,16 @@ class OnnxtrOcrModel(BaseOcrModel):
                 config = {
                     "assume_straight_pages": True,
                     "straighten_pages": False,
-                    # This should be disabled when docling supports polygons
-                    "export_as_straight_boxes": True,
+                    "export_as_straight_boxes": False,
                     "disable_crop_orientation": False,
                     "disable_page_orientation": False,
                 }
 
+            engine_cfg = EngineConfig(
+                providers=self.options.providers,
+                session_options=self.options.session_options,
+            )
+
             self.reader = ocr_predictor(
                 det_arch=(
                     from_hub(self.options.det_arch)
@@ -95,10 +99,9 @@ class OnnxtrOcrModel(BaseOcrModel):
                 paragraph_break=self.options.paragraph_break,
                 load_in_8_bit=self.options.load_in_8_bit,
                 **config,
-                # TODO: Allow specification of the engine configs in the options
-                det_engine_cfg=None,
-                reco_engine_cfg=None,
-                clf_engine_cfg=None,
+                det_engine_cfg=engine_cfg,
+                reco_engine_cfg=engine_cfg,
+                clf_engine_cfg=engine_cfg,
             )
 
     def _to_absolute_and_docling_format(
@@ -170,24 +173,29 @@ class OnnxtrOcrModel(BaseOcrModel):
                                 for line in block.lines
                                 for word in line.words
                             ):
-                                all_ocr_cells.append(
-                                    TextCell(
-                                        index=ix,
-                                        text=word.value,
-                                        orig=word.value,
-                                        from_ocr=True,
-                                        confidence=word.confidence,
-                                        rect=BoundingRectangle.from_bounding_box(
-                                            BoundingBox.from_tuple(
-                                                self._to_absolute_and_docling_format(
-                                                    word.geometry,
-                                                    img_shape=(im_height, im_width),
-                                                ),
-                                                origin=CoordOrigin.TOPLEFT,
-                                            )
-                                        ),
+                                if (
+                                    word.confidence >= self.options.confidence_score
+                                    and word.objectness_score
+                                    >= self.options.objectness_score
+                                ):
+                                    all_ocr_cells.append(
+                                        TextCell(
+                                            index=ix,
+                                            text=word.value,
+                                            orig=word.value,
+                                            from_ocr=True,
+                                            confidence=word.confidence,
+                                            rect=BoundingRectangle.from_bounding_box(
+                                                BoundingBox.from_tuple(
+                                                    self._to_absolute_and_docling_format(
+                                                        word.geometry,
+                                                        img_shape=(im_height, im_width),
+                                                    ),
+                                                    origin=CoordOrigin.TOPLEFT,
+                                                )
+                                            ),
+                                        )
                                     )
-                                )
 
                 # Post-process the cells
                 page.cells = self.post_process_cells(all_ocr_cells, page.cells)
diff --git a/docs/examples/onnxtr_with_custom_models.py b/docs/examples/onnxtr_with_custom_models.py
index 9853502b..f459da61 100644
--- a/docs/examples/onnxtr_with_custom_models.py
+++ b/docs/examples/onnxtr_with_custom_models.py
@@ -14,7 +14,7 @@ def main():
     ocr_options = OnnxtrOcrOptions(
         det_arch="db_mobilenet_v3_large",
         reco_arch="Felix92/onnxtr-parseq-multilingual-v1",  # Model will be downloaded from Hugging Face Hub
-        auto_correct_orientation=True,  # This can be used to correct the orientation of the pages
+        auto_correct_orientation=False,  # This can be set to `True` to auto-correct the orientation of the pages
     )
 
     pipeline_options = PdfPipelineOptions(