feat(ocr): Add OnnxTR as possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
felix 2025-03-22 13:28:17 +01:00
parent 268fa98821
commit e4ab4ce576
3 changed files with 22 additions and 15 deletions

View File

@ -1,7 +1,7 @@
import logging import logging
import os import os
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Iterable, Optional, Tuple, Type, Union
import numpy import numpy
import numpy as np import numpy as np
@ -11,14 +11,12 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
OcrOptions, OcrOptions,
OnnxtrOcrOptions, OnnxtrOcrOptions,
) )
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -104,25 +102,25 @@ class OnnxtrOcrModel(BaseOcrModel):
clf_engine_cfg=engine_cfg, clf_engine_cfg=engine_cfg,
) )
def _to_absolute_and_docling_format( def _to_absolute_docling_format(
self, self,
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray, geom: Union[Tuple[Tuple[float, float], Tuple[float, float]], np.ndarray],
img_shape: tuple[int, int], img_shape: Tuple[int, int],
) -> tuple[int, int, int, int]: ) -> Tuple[int, int, int, int]:
""" """
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format. Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
Args: Args:
geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]] geom: Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
img_shape (tuple[int, int]): (height, width) of the image img_shape: (height, width) of the image
Returns: Returns:
tuple: (x1, y1, x2, y2) top-left and bottom-right coordinates in absolute format (x1, y1, x2, y2)
""" """
h, w = img_shape h, w = img_shape
scale_inv = 1 / self.scale # Precompute inverse for efficiency scale_inv = 1 / self.scale # Precompute inverse for efficiency
def scale_point(x: float, y: float) -> tuple[int, int]: def scale_point(x: float, y: float) -> Tuple[int, int]:
"""Scale and round a point to absolute coordinates.""" """Scale and round a point to absolute coordinates."""
return int(round(x * w * scale_inv)), int(round(y * h * scale_inv)) return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
@ -187,7 +185,7 @@ class OnnxtrOcrModel(BaseOcrModel):
confidence=word.confidence, confidence=word.confidence,
rect=BoundingRectangle.from_bounding_box( rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple( BoundingBox.from_tuple(
self._to_absolute_and_docling_format( self._to_absolute_docling_format(
word.geometry, word.geometry,
img_shape=(im_height, im_width), img_shape=(im_height, im_width),
), ),

View File

@ -11,10 +11,19 @@ def main():
# Source document to convert # Source document to convert
source = "https://arxiv.org/pdf/2408.09869v4" source = "https://arxiv.org/pdf/2408.09869v4"
# Available detection & recognition models can be found at
# https://github.com/felixdittrich92/OnnxTR
# Alternatively, choose a model from the Hugging Face Hub
# Collection: https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
ocr_options = OnnxtrOcrOptions( ocr_options = OnnxtrOcrOptions(
# Text detection model
det_arch="db_mobilenet_v3_large", det_arch="db_mobilenet_v3_large",
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub # Text recognition model - from Hugging Face Hub
auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
# This can be set to `True` to auto-correct the orientation of the pages
auto_correct_orientation=False,
) )
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(

View File

@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0" lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true } ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" } rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" } onnxtr = { extras= ["gpu"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
onnxruntime = [ onnxruntime = [
# 1.19.2 is the last version with python3.9 support, # 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0