mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat(ocr): Add OnnxTR as possible OCR engine
Signed-off-by: felix <felixdittrich92@gmail.com>
This commit is contained in:
parent
268fa98821
commit
e4ab4ce576
@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type
|
from typing import Iterable, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -11,14 +11,12 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
|
|||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
OnnxtrOcrOptions,
|
OnnxtrOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -104,25 +102,25 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
clf_engine_cfg=engine_cfg,
|
clf_engine_cfg=engine_cfg,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _to_absolute_and_docling_format(
|
def _to_absolute_docling_format(
|
||||||
self,
|
self,
|
||||||
geom: tuple[tuple[float, float], tuple[float, float]] | np.ndarray,
|
geom: Union[Tuple[Tuple[float, float], Tuple[float, float]], np.ndarray],
|
||||||
img_shape: tuple[int, int],
|
img_shape: Tuple[int, int],
|
||||||
) -> tuple[int, int, int, int]:
|
) -> Tuple[int, int, int, int]:
|
||||||
"""
|
"""
|
||||||
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
|
Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
|
geom: Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
|
||||||
img_shape (tuple[int, int]): (height, width) of the image
|
img_shape: (height, width) of the image
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (x1, y1, x2, y2)
|
top-left and bottom-right coordinates in absolute format (x1, y1, x2, y2)
|
||||||
"""
|
"""
|
||||||
h, w = img_shape
|
h, w = img_shape
|
||||||
scale_inv = 1 / self.scale # Precompute inverse for efficiency
|
scale_inv = 1 / self.scale # Precompute inverse for efficiency
|
||||||
|
|
||||||
def scale_point(x: float, y: float) -> tuple[int, int]:
|
def scale_point(x: float, y: float) -> Tuple[int, int]:
|
||||||
"""Scale and round a point to absolute coordinates."""
|
"""Scale and round a point to absolute coordinates."""
|
||||||
return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
|
return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
|
||||||
|
|
||||||
@ -187,7 +185,7 @@ class OnnxtrOcrModel(BaseOcrModel):
|
|||||||
confidence=word.confidence,
|
confidence=word.confidence,
|
||||||
rect=BoundingRectangle.from_bounding_box(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
BoundingBox.from_tuple(
|
BoundingBox.from_tuple(
|
||||||
self._to_absolute_and_docling_format(
|
self._to_absolute_docling_format(
|
||||||
word.geometry,
|
word.geometry,
|
||||||
img_shape=(im_height, im_width),
|
img_shape=(im_height, im_width),
|
||||||
),
|
),
|
||||||
|
@ -11,10 +11,19 @@ def main():
|
|||||||
# Source document to convert
|
# Source document to convert
|
||||||
source = "https://arxiv.org/pdf/2408.09869v4"
|
source = "https://arxiv.org/pdf/2408.09869v4"
|
||||||
|
|
||||||
|
# Available detection & recognition models can be found at
|
||||||
|
# https://github.com/felixdittrich92/OnnxTR
|
||||||
|
|
||||||
|
# Or choose a model from the Hugging Face Hub
|
||||||
|
# Collection: https://huggingface.co/collections/Felix92/onnxtr-66bf213a9f88f7346c90e842
|
||||||
|
|
||||||
ocr_options = OnnxtrOcrOptions(
|
ocr_options = OnnxtrOcrOptions(
|
||||||
|
# Text detection model
|
||||||
det_arch="db_mobilenet_v3_large",
|
det_arch="db_mobilenet_v3_large",
|
||||||
reco_arch="Felix92/onnxtr-parseq-multilingual-v1", # Model will be downloaded from Hugging Face Hub
|
# Text recognition model - from Hugging Face Hub
|
||||||
auto_correct_orientation=False, # This can be set to `True` to auto-correct the orientation of the pages
|
reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
|
||||||
|
# This can be set to `True` to auto-correct the orientation of the pages
|
||||||
|
auto_correct_orientation=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
|
@ -72,7 +72,7 @@ openpyxl = "^3.1.5"
|
|||||||
lxml = ">=4.0.0,<6.0.0"
|
lxml = ">=4.0.0,<6.0.0"
|
||||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
||||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||||
onnxtr = { extras= ["gpu", "viz"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
|
onnxtr = { extras= ["gpu"], version = "^0.6.2", optional = true, markers = "python_version >= '3.10'" }
|
||||||
onnxruntime = [
|
onnxruntime = [
|
||||||
# 1.19.2 is the last version with python3.9 support,
|
# 1.19.2 is the last version with python3.9 support,
|
||||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||||
|
Loading…
Reference in New Issue
Block a user