mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat(ocr): Add OnnxTR as possible OCR engine
Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
parent
268fa98821
commit
e4ab4ce576
@ -1,7 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type
|
||||
from typing import Iterable, Optional, Tuple, Type, Union
|
||||
|
||||
import numpy
|
||||
import numpy as np
|
||||
@ -11,14 +11,12 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
OcrOptions,
|
||||
OnnxtrOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -104,25 +102,25 @@ class OnnxtrOcrModel(BaseOcrModel):
|
||||
clf_engine_cfg=engine_cfg,
|
||||
)
|
||||
|
||||
def _to_absolute_and_docling_format(
|
||||
def _to_absolute_docling_format(
|
||||
self,
|
||||
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
|
||||
img_shape: tuple[int, int],
|
||||
) -> tuple[int, int, int, int]:
|
||||
geom: Union[Tuple[Tuple[float, float], Tuple[float, float]], np.ndarray],
|
||||
img_shape: Tuple[int, int],
|
||||
) -> Tuple[int, int, int, int]:
|
||||
"""
|
||||
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
|
||||
|
||||
Args:
|
||||
geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
|
||||
img_shape (tuple[int, int]): (height, width) of the image
|
||||
geom: Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
|
||||
img_shape: (height, width) of the image
|
||||
|
||||
Returns:
|
||||
tuple: (x1, y1, x2, y2)
|
||||
top-left and bottom-right coordinates in absolute format (x1, y1, x2, y2)
|
||||
"""
|
||||
h, w = img_shape
|
||||
scale_inv = 1 / self.scale # Precompute inverse for efficiency
|
||||
|
||||
def scale_point(x: float, y: float) -> tuple[int, int]:
|
||||
def scale_point(x: float, y: float) -> Tuple[int, int]:
|
||||
"""Scale and round a point to absolute coordinates."""
|
||||
return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
|
||||
|
||||
@ -187,7 +185,7 @@ class OnnxtrOcrModel(BaseOcrModel):
|
||||
confidence=word.confidence,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
self._to_absolute_and_docling_format(
|
||||
self._to_absolute_docling_format(
|
||||
word.geometry,
|
||||
img_shape=(im_height, im_width),
|
||||
),
|
||||
|
@ -11,10 +11,19 @@ def main():
|
||||
# Source document to convert
|
||||
source = "https://arxiv.org/pdf/2408.09869v4"
|
||||
|
||||
# Available detection & recognition models can be found at
|
||||
# https://github.com/felixdittrich92/OnnxTR
|
||||
|
||||
# Alternatively, choose a model from the Hugging Face Hub
|
||||
# Collection: https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
|
||||
|
||||
ocr_options = OnnxtrOcrOptions(
|
||||
# Text detection model
|
||||
det_arch="db_mobilenet_v3_large",
|
||||
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub
|
||||
auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages
|
||||
# Text recognition model - from Hugging Face Hub
|
||||
reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
|
||||
# This can be set to `True` to auto-correct the orientation of the pages
|
||||
auto_correct_orientation=False,
|
||||
)
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
|
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
|
||||
lxml = ">=4.0.0,<6.0.0"
|
||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
|
||||
onnxtr = { extras= ["gpu"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
|
||||
onnxruntime = [
|
||||
# 1.19.2 is the last version with python3.9 support,
|
||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||
|
Loading…
Reference in New Issue
Block a user