From 35f185f545c4ebe5a5360f9de8ef243f74128f62 Mon Sep 17 00:00:00 2001
From: felix
Date: Fri, 21 Mar 2025 21:09:16 +0100
Subject: [PATCH] init

Signed-off-by: felix
---
Review notes: the line structure of this patch was restored and the hunks
below were edited by hand, so the blob ids on the "index" lines were not
regenerated.

 docling/datamodel/pipeline_options.py |  26 ++++
 docling/models/onnxtr_model.py        | 219 ++++++++++++++++++++++++++
 docling/models/plugins/defaults.py    |   2 +
 pyproject.toml                        |   1 +
 tests/test_e2e_ocr_conversion.py      |   2 +
 5 files changed, 250 insertions(+)
 create mode 100644 docling/models/onnxtr_model.py

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 654e04df..3bef70da 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -151,6 +151,32 @@ class RapidOcrOptions(OcrOptions):
     )
 
 
+class OnnxtrOcrOptions(OcrOptions):
+    """Options for the Onnxtr engine."""
+
+    kind: ClassVar[Literal["onnxtr"]] = "onnxtr"
+
+    lang: List[str] = ["en", "fr"]
+    confidence_score: float = 0.5  # words scoring below this are discarded
+
+    det_arch: str = "fast_base"
+    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
+    det_bs: int = 1  # NOTE: Should be 1 because docling seems not to support batch processing yet
+    reco_bs: int = 512
+    auto_correct_orientation: bool = False
+    preserve_aspect_ratio: bool = True
+    symmetric_pad: bool = True
+    paragraph_break: float = 0.035
+    load_in_8_bit: bool = False
+    det_engine_cfg: Dict[str, Any] = {}
+    reco_engine_cfg: Dict[str, Any] = {}
+    clf_engine_cfg: Dict[str, Any] = {}
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
 class EasyOcrOptions(OcrOptions):
     """Options for the EasyOCR engine."""
 
diff --git a/docling/models/onnxtr_model.py b/docling/models/onnxtr_model.py
new file mode 100644
index 00000000..e1eb4e5d
--- /dev/null
+++ b/docling/models/onnxtr_model.py
@@ -0,0 +1,219 @@
+import logging
+from pathlib import Path
+from typing import Iterable, Optional, Type
+
+import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    OcrOptions,
+    OnnxtrOcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class OnnxtrOcrModel(BaseOcrModel):
+    """OCR model that runs the OnnxTR predictor on the OCR regions of a page."""
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: OnnxtrOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """Set up the OnnxTR predictor when the engine is enabled.
+
+        Raises:
+            ImportError: If the engine is enabled but OnnxTR is not installed.
+        """
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: OnnxtrOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            try:
+                from onnxtr.models import EngineConfig, from_hub, ocr_predictor  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )
+
+            if options.auto_correct_orientation:
+                config = {
+                    "assume_straight_pages": False,
+                    "straighten_pages": True,
+                    "export_as_straight_boxes": True,
+                    # Disable crop orientation because we straighten the pages already
+                    "disable_crop_orientation": True,
+                    "disable_page_orientation": False,
+                }
+            else:
+                config = {
+                    "assume_straight_pages": True,
+                    "straighten_pages": False,
+                    "export_as_straight_boxes": True,
+                    "disable_crop_orientation": False,
+                    "disable_page_orientation": False,
+                }
+
+            # An arch name with exactly one "/" is loaded from the Hugging Face hub.
+            det_arch = self.options.det_arch
+            if det_arch.count("/") == 1:
+                det_arch = from_hub(det_arch)
+            reco_arch = self.options.reco_arch
+            if reco_arch.count("/") == 1:
+                reco_arch = from_hub(reco_arch)
+
+            self.reader = ocr_predictor(
+                det_arch=det_arch,
+                reco_arch=reco_arch,
+                det_bs=self.options.det_bs,
+                reco_bs=self.options.reco_bs,
+                preserve_aspect_ratio=self.options.preserve_aspect_ratio,
+                symmetric_pad=self.options.symmetric_pad,
+                paragraph_break=self.options.paragraph_break,
+                load_in_8_bit=self.options.load_in_8_bit,
+                det_engine_cfg=EngineConfig(**self.options.det_engine_cfg),
+                reco_engine_cfg=EngineConfig(**self.options.reco_engine_cfg),
+                clf_engine_cfg=EngineConfig(**self.options.clf_engine_cfg),
+                **config,
+            )
+
+    def _to_absolute_and_docling_format(
+        self, geom: list[list[float]], img_shape: tuple[int, int]
+    ) -> tuple[int, int, int, int]:
+        """Convert a relative OnnxTR geometry to an absolute (x1, y1, x2, y2) box.
+
+        Coordinates are multiplied by the image size and divided by
+        ``self.scale``, which undoes the upscaling applied when rendering.
+
+        Args:
+            geom: Either [[xmin, ymin], [xmax, ymax]] or a polygon
+                [[x1, y1], ..., [x4, y4]], in relative coordinates.
+            img_shape: (height, width) of the image the geometry refers to.
+
+        Returns:
+            The axis-aligned box (x1, y1, x2, y2), rounded to integers.
+
+        Raises:
+            ValueError: If ``geom`` has neither 2 nor 4 points.
+        """
+        h, w = img_shape
+        scale_inv = 1 / self.scale  # Precompute inverse for efficiency
+
+        def scale_point(x: float, y: float) -> tuple[int, int]:
+            """Scale and round a point to absolute coordinates."""
+            return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))
+
+        if len(geom) == 2:
+            (xmin, ymin), (xmax, ymax) = geom
+            x1, y1 = scale_point(xmin, ymin)
+            x2, y2 = scale_point(xmax, ymax)
+        elif len(geom) == 4:
+            abs_points = [scale_point(*point) for point in geom]
+            x1, y1 = min(p[0] for p in abs_points), min(p[1] for p in abs_points)
+            x2, y2 = max(p[0] for p in abs_points), max(p[1] for p in abs_points)
+        else:
+            raise ValueError(
+                f"Invalid geometry format: {geom}. Expected either 2 or 4 points."
+            )
+
+        return x1, y1, x2, y2
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Run OCR on each valid page of the batch and yield every page."""
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+                continue
+
+            with TimeRecorder(conv_res, "ocr"):
+                ocr_rects = self.get_ocr_rects(page)
+                all_ocr_cells = []
+
+                for ocr_rect in ocr_rects:
+                    if ocr_rect.area() == 0:
+                        continue
+
+                    with page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    ) as high_res_image:
+                        im_width, im_height = high_res_image.size
+                        result = self.reader([numpy.array(high_res_image)])
+
+                    if result is not None:
+                        for p in result.pages:
+                            for ix, word in enumerate(
+                                word
+                                for block in p.blocks
+                                for line in block.lines
+                                for word in line.words
+                            ):
+                                # Drop words below the configured threshold.
+                                if word.confidence < self.options.confidence_score:
+                                    continue
+
+                                # The geometry is relative to the cropped image,
+                                # so shift it back into page coordinates.
+                                x1, y1, x2, y2 = self._to_absolute_and_docling_format(
+                                    word.geometry, img_shape=(im_height, im_width)
+                                )
+                                all_ocr_cells.append(
+                                    TextCell(
+                                        index=ix,
+                                        text=word.value,
+                                        orig=word.value,
+                                        from_ocr=True,
+                                        confidence=word.confidence,
+                                        rect=BoundingRectangle.from_bounding_box(
+                                            BoundingBox.from_tuple(
+                                                (
+                                                    x1 + ocr_rect.l,
+                                                    y1 + ocr_rect.t,
+                                                    x2 + ocr_rect.l,
+                                                    y2 + ocr_rect.t,
+                                                ),
+                                                origin=CoordOrigin.TOPLEFT,
+                                            )
+                                        ),
+                                    )
+                                )
+
+                # Post-process the cells
+                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+
+            # DEBUG code:
+            if settings.debug.visualize_ocr:
+                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+            yield page
+
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        """Return the options class this engine is configured with."""
+        return OnnxtrOcrOptions
diff --git a/docling/models/plugins/defaults.py b/docling/models/plugins/defaults.py
index 00873579..1775e23a 100644
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@@ -3,6 +3,7 @@ from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
+from docling.models.onnxtr_model import OnnxtrOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 
@@ -13,6 +14,7 @@ def ocr_engines():
             EasyOcrModel,
             OcrMacModel,
             RapidOcrModel,
+            OnnxtrOcrModel,
             TesseractOcrModel,
             TesseractOcrCliModel,
         ]
diff --git a/pyproject.toml b/pyproject.toml
index dd48a9d2..5dbbe74b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,7 @@ openpyxl = "^3.1.5"
 lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
+onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
 onnxruntime = [
   # 1.19.2 is the last version with python3.9 support,
   # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 985a6250..c110906a 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
     OcrOptions,
     PdfPipelineOptions,
     RapidOcrOptions,
+    OnnxtrOcrOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -62,6 +63,7 @@ def test_e2e_conversions():
         TesseractOcrOptions(),
         TesseractCliOcrOptions(),
         EasyOcrOptions(force_full_page_ocr=True),
+        OnnxtrOcrOptions(force_full_page_ocr=True),
         TesseractOcrOptions(force_full_page_ocr=True),
         TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
         TesseractCliOcrOptions(force_full_page_ocr=True),