From cfc42458ae645af83679e6a68e818bfb2cd313e0 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 21 Mar 2025 22:17:51 +0100 Subject: [PATCH] [Feature] Add OnnxTR as possible OCR engine Signed-off-by: felix --- docling/datamodel/pipeline_options.py | 11 ++++- docling/models/onnxtr_model.py | 56 ++++++++++++---------- docs/examples/onnxtr_with_custom_models.py | 2 +- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index d610ada3..5d809c58 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -157,16 +157,23 @@ class OnnxtrOcrOptions(OcrOptions): kind: ClassVar[Literal["onnxtr"]] = "onnxtr" lang: List[str] = ["en", "fr"] - confidence_score: float = 0.5 + # word confidence threshold for the recognition model + confidence_score: float = 0.7 + # detection model objectness score threshold 'fast algorithm' + objectness_score: float = 0.3 + # NOTE: This can also be a Hugging Face Hub model det_arch: str = "fast_base" - reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model + reco_arch: str = "crnn_vgg16_bn" reco_bs: int = 512 auto_correct_orientation: bool = False preserve_aspect_ratio: bool = True symmetric_pad: bool = True paragraph_break: float = 0.035 load_in_8_bit: bool = False + # Ref.: https://onnxruntime.ai/docs/api/python/api_summary.html + providers: list[tuple[str, dict[str, Any]]] | list[str] | None = None + session_options: Any = None model_config = ConfigDict( extra="forbid", diff --git a/docling/models/onnxtr_model.py b/docling/models/onnxtr_model.py index 11a899cd..2423752f 100644 --- a/docling/models/onnxtr_model.py +++ b/docling/models/onnxtr_model.py @@ -50,7 +50,7 @@ class OnnxtrOcrModel(BaseOcrModel): ocr_predictor, ) - # We diable multiprocessing for OnnxTR, + # We disable multiprocessing for OnnxTR, # because the speed up is minimal and it can raise memory leaks on windows 
os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE" except ImportError: @@ -72,12 +72,16 @@ class OnnxtrOcrModel(BaseOcrModel): config = { "assume_straight_pages": True, "straighten_pages": False, - # This should be disabled when docling supports polygons - "export_as_straight_boxes": True, + "export_as_straight_boxes": False, "disable_crop_orientation": False, "disable_page_orientation": False, } + engine_cfg = EngineConfig( + providers=self.options.providers, + session_options=self.options.session_options, + ) + self.reader = ocr_predictor( det_arch=( from_hub(self.options.det_arch) @@ -95,10 +99,9 @@ class OnnxtrOcrModel(BaseOcrModel): paragraph_break=self.options.paragraph_break, load_in_8_bit=self.options.load_in_8_bit, **config, - # TODO: Allow specification of the engine configs in the options - det_engine_cfg=None, - reco_engine_cfg=None, - clf_engine_cfg=None, + det_engine_cfg=engine_cfg, + reco_engine_cfg=engine_cfg, + clf_engine_cfg=engine_cfg, ) def _to_absolute_and_docling_format( @@ -170,24 +173,29 @@ class OnnxtrOcrModel(BaseOcrModel): for line in block.lines for word in line.words ): - all_ocr_cells.append( - TextCell( - index=ix, - text=word.value, - orig=word.value, - from_ocr=True, - confidence=word.confidence, - rect=BoundingRectangle.from_bounding_box( - BoundingBox.from_tuple( - self._to_absolute_and_docling_format( - word.geometry, - img_shape=(im_height, im_width), - ), - origin=CoordOrigin.TOPLEFT, - ) - ), + if ( + word.confidence >= self.options.confidence_score + and word.objectness_score + >= self.options.objectness_score + ): + all_ocr_cells.append( + TextCell( + index=ix, + text=word.value, + orig=word.value, + from_ocr=True, + confidence=word.confidence, + rect=BoundingRectangle.from_bounding_box( + BoundingBox.from_tuple( + self._to_absolute_and_docling_format( + word.geometry, + img_shape=(im_height, im_width), + ), + origin=CoordOrigin.TOPLEFT, + ) + ), + ) ) - ) # Post-process the cells page.cells = 
self.post_process_cells(all_ocr_cells, page.cells) diff --git a/docs/examples/onnxtr_with_custom_models.py b/docs/examples/onnxtr_with_custom_models.py index 9853502b..f459da61 100644 --- a/docs/examples/onnxtr_with_custom_models.py +++ b/docs/examples/onnxtr_with_custom_models.py @@ -14,7 +14,7 @@ def main(): ocr_options = OnnxtrOcrOptions( det_arch="db_mobilenet_v3_large", reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub - auto_correct_orientation=True, # This can be used to correct the orientation of the pages + auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages ) pipeline_options = PdfPipelineOptions(