[Feature] Add OnnxTR as a possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
2025-07-30 14:04:27 +00:00 · 2025-03-21 22:17:51 +01:00 · 2025-03-21 22:17:51 +01:00 · cfc42458ae
commit cfc42458ae
parent a19cf81f98
3 changed files with 42 additions and 27 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -157,16 +157,23 @@ class OnnxtrOcrOptions(OcrOptions):
    kind: ClassVar[Literal["onnxtr"]] = "onnxtr"

    lang: List[str] = ["en", "fr"]
-    confidence_score: float = 0.5
+    # word confidence threshold for the recognition model
+    confidence_score: float = 0.7
+    # minimum objectness score from the detection model for a word to be kept ('fast' algorithm)
+    objectness_score: float = 0.3

+    # NOTE: det_arch and reco_arch can also be Hugging Face Hub models
    det_arch: str = "fast_base"
-    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
+    reco_arch: str = "crnn_vgg16_bn"
    reco_bs: int = 512
    auto_correct_orientation: bool = False
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    paragraph_break: float = 0.035
    load_in_8_bit: bool = False
+    # Ref.: https://onnxruntime.ai/docs/api/python/api_summary.html
+    providers: list[tuple[str, dict[str, Any]]] | list[str] | None = None
+    session_options: Any = None

    model_config = ConfigDict(
        extra="forbid",
--- a/docling/models/onnxtr_model.py
+++ b/docling/models/onnxtr_model.py
@ -50,7 +50,7 @@ class OnnxtrOcrModel(BaseOcrModel):
                    ocr_predictor,
                )

-                # We diable multiprocessing for OnnxTR,
+                # We disable multiprocessing for OnnxTR,
                # because the speed up is minimal and it can raise memory leaks on windows
                os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
            except ImportError:
@ -72,12 +72,16 @@ class OnnxtrOcrModel(BaseOcrModel):
                config = {
                    "assume_straight_pages": True,
                    "straighten_pages": False,
-                    # This should be disabled when docling supports polygons
-                    "export_as_straight_boxes": True,
+                    "export_as_straight_boxes": False,
                    "disable_crop_orientation": False,
                    "disable_page_orientation": False,
                }

+            engine_cfg = EngineConfig(
+                providers=self.options.providers,
+                session_options=self.options.session_options,
+            )
+
            self.reader = ocr_predictor(
                det_arch=(
                    from_hub(self.options.det_arch)
@ -95,10 +99,9 @@ class OnnxtrOcrModel(BaseOcrModel):
                paragraph_break=self.options.paragraph_break,
                load_in_8_bit=self.options.load_in_8_bit,
                **config,
-                # TODO: Allow specification of the engine configs in the options
-                det_engine_cfg=None,
-                reco_engine_cfg=None,
-                clf_engine_cfg=None,
+                det_engine_cfg=engine_cfg,
+                reco_engine_cfg=engine_cfg,
+                clf_engine_cfg=engine_cfg,
            )

    def _to_absolute_and_docling_format(
@ -170,24 +173,29 @@ class OnnxtrOcrModel(BaseOcrModel):
                                for line in block.lines
                                for word in line.words
                            ):
-                                all_ocr_cells.append(
-                                    TextCell(
-                                        index=ix,
-                                        text=word.value,
-                                        orig=word.value,
-                                        from_ocr=True,
-                                        confidence=word.confidence,
-                                        rect=BoundingRectangle.from_bounding_box(
-                                            BoundingBox.from_tuple(
-                                                self._to_absolute_and_docling_format(
-                                                    word.geometry,
-                                                    img_shape=(im_height, im_width),
-                                                ),
-                                                origin=CoordOrigin.TOPLEFT,
-                                            )
-                                        ),
+                                if (
+                                    word.confidence >= self.options.confidence_score
+                                    and word.objectness_score
+                                    >= self.options.objectness_score
+                                ):
+                                    all_ocr_cells.append(
+                                        TextCell(
+                                            index=ix,
+                                            text=word.value,
+                                            orig=word.value,
+                                            from_ocr=True,
+                                            confidence=word.confidence,
+                                            rect=BoundingRectangle.from_bounding_box(
+                                                BoundingBox.from_tuple(
+                                                    self._to_absolute_and_docling_format(
+                                                        word.geometry,
+                                                        img_shape=(im_height, im_width),
+                                                    ),
+                                                    origin=CoordOrigin.TOPLEFT,
+                                                )
+                                            ),
+                                        )
                                    )
-                                )

                # Post-process the cells
                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
--- a/docs/examples/onnxtr_with_custom_models.py
+++ b/docs/examples/onnxtr_with_custom_models.py
@ -14,7 +14,7 @@ def main():
    ocr_options = OnnxtrOcrOptions(
        det_arch="db_mobilenet_v3_large",
        reco_arch="Felix92/onnxtr-parseq-multilingual-v1",  # Model will be downloaded from Hugging Face Hub
-        auto_correct_orientation=True,  # This can be used to correct the orientation of the pages
+        auto_correct_orientation=False,  # This can be set to `True` to auto-correct the orientation of the pages
    )

    pipeline_options = PdfPipelineOptions(