[Feature] Add OnnxTR as possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
felix 2025-03-21 22:17:51 +01:00
parent a19cf81f98
commit cfc42458ae
3 changed files with 42 additions and 27 deletions

View File

@ -157,16 +157,23 @@ class OnnxtrOcrOptions(OcrOptions):
kind: ClassVar[Literal["onnxtr"]] = "onnxtr" kind: ClassVar[Literal["onnxtr"]] = "onnxtr"
lang: List[str] = ["en", "fr"] lang: List[str] = ["en", "fr"]
confidence_score: float = 0.5 # word confidence threshold for the recognition model
confidence_score: float = 0.7
# detection model objectness score threshold 'fast algorithm'
objectness_score: float = 0.3
# NOTE: This can be also a hf hub model
det_arch: str = "fast_base" det_arch: str = "fast_base"
reco_arch: str = "crnn_vgg16_bn" # NOTE: This can be also a hf hub model reco_arch: str = "crnn_vgg16_bn"
reco_bs: int = 512 reco_bs: int = 512
auto_correct_orientation: bool = False auto_correct_orientation: bool = False
preserve_aspect_ratio: bool = True preserve_aspect_ratio: bool = True
symmetric_pad: bool = True symmetric_pad: bool = True
paragraph_break: float = 0.035 paragraph_break: float = 0.035
load_in_8_bit: bool = False load_in_8_bit: bool = False
# Ref.: https://onnxruntime.ai/docs/api/python/api_summary.html
providers: list[tuple[str, dict[str, Any]]] | list[str] | None = None
session_options: Any = None
model_config = ConfigDict( model_config = ConfigDict(
extra="forbid", extra="forbid",

View File

@ -50,7 +50,7 @@ class OnnxtrOcrModel(BaseOcrModel):
ocr_predictor, ocr_predictor,
) )
# We diable multiprocessing for OnnxTR, # We disable multiprocessing for OnnxTR,
# because the speed up is minimal and it can raise memory leaks on windows # because the speed up is minimal and it can raise memory leaks on windows
os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE" os.environ["ONNXTR_MULTIPROCESSING_DISABLE"] = "TRUE"
except ImportError: except ImportError:
@ -72,12 +72,16 @@ class OnnxtrOcrModel(BaseOcrModel):
config = { config = {
"assume_straight_pages": True, "assume_straight_pages": True,
"straighten_pages": False, "straighten_pages": False,
# This should be disabled when docling supports polygons "export_as_straight_boxes": False,
"export_as_straight_boxes": True,
"disable_crop_orientation": False, "disable_crop_orientation": False,
"disable_page_orientation": False, "disable_page_orientation": False,
} }
engine_cfg = EngineConfig(
providers=self.options.providers,
session_options=self.options.session_options,
)
self.reader = ocr_predictor( self.reader = ocr_predictor(
det_arch=( det_arch=(
from_hub(self.options.det_arch) from_hub(self.options.det_arch)
@ -95,10 +99,9 @@ class OnnxtrOcrModel(BaseOcrModel):
paragraph_break=self.options.paragraph_break, paragraph_break=self.options.paragraph_break,
load_in_8_bit=self.options.load_in_8_bit, load_in_8_bit=self.options.load_in_8_bit,
**config, **config,
# TODO: Allow specification of the engine configs in the options det_engine_cfg=engine_cfg,
det_engine_cfg=None, reco_engine_cfg=engine_cfg,
reco_engine_cfg=None, clf_engine_cfg=engine_cfg,
clf_engine_cfg=None,
) )
def _to_absolute_and_docling_format( def _to_absolute_and_docling_format(
@ -170,24 +173,29 @@ class OnnxtrOcrModel(BaseOcrModel):
for line in block.lines for line in block.lines
for word in line.words for word in line.words
): ):
all_ocr_cells.append( if (
TextCell( word.confidence >= self.options.confidence_score
index=ix, and word.objectness_score
text=word.value, >= self.options.objectness_score
orig=word.value, ):
from_ocr=True, all_ocr_cells.append(
confidence=word.confidence, TextCell(
rect=BoundingRectangle.from_bounding_box( index=ix,
BoundingBox.from_tuple( text=word.value,
self._to_absolute_and_docling_format( orig=word.value,
word.geometry, from_ocr=True,
img_shape=(im_height, im_width), confidence=word.confidence,
), rect=BoundingRectangle.from_bounding_box(
origin=CoordOrigin.TOPLEFT, BoundingBox.from_tuple(
) self._to_absolute_and_docling_format(
), word.geometry,
img_shape=(im_height, im_width),
),
origin=CoordOrigin.TOPLEFT,
)
),
)
) )
)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) page.cells = self.post_process_cells(all_ocr_cells, page.cells)

View File

@ -14,7 +14,7 @@ def main():
ocr_options = OnnxtrOcrOptions( ocr_options = OnnxtrOcrOptions(
det_arch="db_mobilenet_v3_large", det_arch="db_mobilenet_v3_large",
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub
auto_correct_orientation=True, # This can be used to correct the orientation of the pages auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages
) )
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(