feat(ocr): Add OnnxTR as possible OCR engine

Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
felix 2025-03-22 13:28:17 +01:00
parent 268fa98821
commit e4ab4ce576
3 changed files with 22 additions and 15 deletions

View File

@ -1,7 +1,7 @@
import logging
import os
from pathlib import Path
from typing import Iterable, Optional, Type
from typing import Iterable, Optional, Tuple, Type, Union
import numpy
import numpy as np
@ -11,14 +11,12 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
OcrOptions,
OnnxtrOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -104,25 +102,25 @@ class OnnxtrOcrModel(BaseOcrModel):
clf_engine_cfg=engine_cfg,
)
def _to_absolute_and_docling_format(
def _to_absolute_docling_format(
self,
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
img_shape: tuple[int, int],
) -> tuple[int, int, int, int]:
geom: Union[Tuple[Tuple[float, float], Tuple[float, float]], np.ndarray],
img_shape: Tuple[int, int],
) -> Tuple[int, int, int, int]:
"""
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
Args:
geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
img_shape (tuple[int, int]): (height, width) of the image
geom: Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
img_shape: (height, width) of the image
Returns:
tuple: (x1, y1, x2, y2)
top-left and bottom-right coordinates in absolute format (x1, y1, x2, y2)
"""
h, w = img_shape
scale_inv = 1 / self.scale # Precompute inverse for efficiency
def scale_point(x: float, y: float) -> tuple[int, int]:
def scale_point(x: float, y: float) -> Tuple[int, int]:
"""Scale and round a point to absolute coordinates."""
return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
@ -187,7 +185,7 @@ class OnnxtrOcrModel(BaseOcrModel):
confidence=word.confidence,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
self._to_absolute_and_docling_format(
self._to_absolute_docling_format(
word.geometry,
img_shape=(im_height, im_width),
),

View File

@ -11,10 +11,19 @@ def main():
# Source document to convert
source = "https://arxiv.org/pdf/2408.09869v4"
# Available detection & recognition models can be found at
# https://github.com/felixdittrich92/OnnxTR
# Alternatively, choose a model from the Hugging Face Hub
# Collection: https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
ocr_options = OnnxtrOcrOptions(
# Text detection model
det_arch="db_mobilenet_v3_large",
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub
auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages
# Text recognition model - from Hugging Face Hub
reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
# This can be set to `True` to auto-correct the orientation of the pages
auto_correct_orientation=False,
)
pipeline_options = PdfPipelineOptions(

View File

@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
onnxtr = { extras= ["gpu"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
onnxruntime = [
# 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0