init

Signed-off-by: felix <felixdittrich92@gmail.com>
2025-07-30 14:04:27 +00:00 · 2025-03-21 21:09:16 +01:00 · 2025-03-21 21:09:16 +01:00 · 35f185f545
commit 35f185f545
parent b3d111a3cd
5 changed files with 207 additions and 2 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -151,6 +151,32 @@ class RapidOcrOptions(OcrOptions):
    )
 class OnnxtrOcrOptions(OcrOptions):
    """Options for the Onnxtr engine.

    Extends the generic ``OcrOptions`` with the settings of the OnnxTR predictor.
    Unknown fields are rejected because ``model_config`` sets ``extra="forbid"``.
    """
    # Engine identifier of this options class.
    kind: ClassVar[Literal["onnxtr"]] = "onnxtr"
    # Language codes (English and French by default).
    # NOTE(review): confirm how the OnnxTR model makes use of this list.
    lang: List[str] = ["en", "fr"]
    # Confidence value for recognized words — presumably a minimum confidence
    # threshold; TODO confirm against the model implementation.
    confidence_score: float = 0.5
    # Text detection architecture handed to the OnnxTR predictor.
    det_arch: str = "fast_base"
    # Text recognition architecture handed to the OnnxTR predictor.
    reco_arch: str = "crnn_vgg16_bn"  # NOTE: This can be also a hf hub model
    # Batch size of the detection stage.
    det_bs: int = 1  # NOTE: Should be 1 because docling seems not to support batch processing yet
    # Batch size of the recognition stage.
    reco_bs: int = 512
    # When True the predictor does not assume straight pages and straightens them first.
    auto_correct_orientation: bool = False
    # The following four values are predictor settings with the same names.
    preserve_aspect_ratio: bool = True
    symmetric_pad: bool = True
    paragraph_break: float = 0.035
    load_in_8_bit: bool = False
    # Engine configuration for the detection, recognition and classification models.
    # NOTE(review): presumably keyword arguments for OnnxTR's EngineConfig — confirm.
    det_engine_cfg: Dict[str, Any] = {}
    reco_engine_cfg: Dict[str, Any] = {}
    clf_engine_cfg: Dict[str, Any] = {}
    # Reject option names that are not declared above.
    model_config = ConfigDict(
        extra="forbid",
    )
 class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""
--- a/docling/models/onnxtr_model.py
+++ b/docling/models/onnxtr_model.py
@ -0,0 +1,174 @@
 import logging
 from pathlib import Path
 from typing import Iterable, Optional, Type
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    OcrOptions,
    OnnxtrOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class OnnxtrOcrModel(BaseOcrModel):
    """OCR model that runs the OnnxTR ``ocr_predictor`` on the OCR regions of every page."""

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: OnnxtrOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Build the OnnxTR predictor described by ``options``.

        Args:
            enabled: When False no predictor is created and ``__call__`` passes pages through.
            artifacts_path: Forwarded to the base class.
            options: OnnxTR settings (architectures, batch sizes, engine configs, ...).
            accelerator_options: Forwarded to the base class.

        Raises:
            ImportError: If the model is enabled but ``onnxtr`` cannot be imported.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: OnnxtrOcrOptions
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        if not self.enabled:
            return

        try:
            from onnxtr.models import EngineConfig, from_hub, ocr_predictor  # type: ignore
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "OnnxTR is not installed. Please install it via `pip install 'onnxtr[gpu]'` to use this OCR engine. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            ) from err

        def resolve_arch(arch: str):
            # A name with exactly one "/" is treated as a hub model id and loaded via from_hub.
            return from_hub(arch) if arch.count("/") == 1 else arch

        straighten = bool(self.options.auto_correct_orientation)
        config = {
            "assume_straight_pages": not straighten,
            "straighten_pages": straighten,
            "export_as_straight_boxes": True,
            # Disable crop orientation only when the pages are straightened already
            "disable_crop_orientation": straighten,
            "disable_page_orientation": False,
        }

        # Only configured (non-empty) engine settings are forwarded, so the
        # predictor keeps its own defaults for everything left untouched.
        engine_kwargs = {
            name: EngineConfig(**cfg)
            for name, cfg in (
                ("det_engine_cfg", self.options.det_engine_cfg),
                ("reco_engine_cfg", self.options.reco_engine_cfg),
                ("clf_engine_cfg", self.options.clf_engine_cfg),
            )
            if cfg
        }

        self.reader = ocr_predictor(
            det_arch=resolve_arch(self.options.det_arch),
            reco_arch=resolve_arch(self.options.reco_arch),
            preserve_aspect_ratio=self.options.preserve_aspect_ratio,
            symmetric_pad=self.options.symmetric_pad,
            paragraph_break=self.options.paragraph_break,
            load_in_8_bit=self.options.load_in_8_bit,
            det_bs=self.options.det_bs,
            reco_bs=self.options.reco_bs,
            **engine_kwargs,
            **config,
        )

    def _to_absolute_and_docling_format(self, geom: list[list[float]], img_shape: tuple[int, int]) -> tuple[int, int, int, int]:
        """
        Convert a bounding box or polygon from relative to absolute coordinates and return in [x1, y1, x2, y2] format.

        The relative values are multiplied by the image size and divided by ``self.scale``,
        then rounded to integers. A 4-point polygon is reduced to its enclosing box.

        Args:
            geom (list): Either [[xmin, ymin], [xmax, ymax]] or [[x1, y1], ..., [x4, y4]]
            img_shape (tuple[int, int]): (height, width) of the image

        Returns:
            tuple: (x1, y1, x2, y2)

        Raises:
            ValueError: If ``geom`` has neither 2 nor 4 points.
        """
        h, w = img_shape
        scale_inv = 1 / self.scale  # Precompute inverse for efficiency

        def scale_point(x: float, y: float) -> tuple[int, int]:
            """Scale and round a point to absolute coordinates."""
            return int(round(x * w * scale_inv)), int(round(y * h * scale_inv))

        if len(geom) == 2:
            (xmin, ymin), (xmax, ymax) = geom
            x1, y1 = scale_point(xmin, ymin)
            x2, y2 = scale_point(xmax, ymax)
        elif len(geom) == 4:
            abs_points = [scale_point(*point) for point in geom]
            xs = [p[0] for p in abs_points]
            ys = [p[1] for p in abs_points]
            x1, y1 = min(xs), min(ys)
            x2, y2 = max(xs), max(ys)
        else:
            raise ValueError(f"Invalid geometry format: {geom}. Expected either 2 or 4 points.")
        return x1, y1, x2, y2

    def _recognize_rect(self, page: Page, ocr_rect: BoundingBox) -> list[TextCell]:
        """Run the predictor on one page region and return the accepted words as text cells."""
        assert page._backend is not None
        with page._backend.get_page_image(scale=self.scale, cropbox=ocr_rect) as high_res_image:
            im_width, im_height = high_res_image.size
            result = self.reader([numpy.array(high_res_image)])
        if result is None:
            return []

        cells = []
        for doc_page in result.pages:
            words = (
                word
                for block in doc_page.blocks
                for line in block.lines
                for word in line.words
            )
            for ix, word in enumerate(words):
                # Drop words the recognizer is not confident enough about.
                if word.confidence < self.options.confidence_score:
                    continue
                x1, y1, x2, y2 = self._to_absolute_and_docling_format(
                    word.geometry, img_shape=(im_height, im_width)
                )
                # The image is a crop of the page, so the box is shifted by the
                # region's top-left corner (assumes ocr_rect has a top-left origin).
                cells.append(
                    TextCell(
                        index=ix,
                        text=word.value,
                        orig=word.value,
                        from_ocr=True,
                        confidence=word.confidence,
                        rect=BoundingRectangle.from_bounding_box(
                            BoundingBox.from_tuple(
                                (
                                    x1 + ocr_rect.l,
                                    y1 + ocr_rect.t,
                                    x2 + ocr_rect.l,
                                    y2 + ocr_rect.t,
                                ),
                                origin=CoordOrigin.TOPLEFT,
                            )
                        ),
                    )
                )
        return cells

    def __call__(self, conv_res: ConversionResult, page_batch: Iterable[Page]) -> Iterable[Page]:
        """
        Run OCR on each page of ``page_batch`` and yield the pages in order.

        Pages are yielded unchanged when the model is disabled or the page backend
        is not valid; otherwise ``page.cells`` is replaced by the post-processed cells.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "ocr"):
                ocr_rects = self.get_ocr_rects(page)
                all_ocr_cells = []
                for ocr_rect in ocr_rects:
                    # Empty regions cannot contain text.
                    if ocr_rect.area() == 0:
                        continue
                    all_ocr_cells.extend(self._recognize_rect(page, ocr_rect))

                # Post-process the cells
                page.cells = self.post_process_cells(all_ocr_cells, page.cells)

            # DEBUG code:
            if settings.debug.visualize_ocr:
                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

            yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model is configured with."""
        return OnnxtrOcrOptions
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@ -3,6 +3,7 @@ from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.picture_description_api_model import PictureDescriptionApiModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.onnxtr_model import OnnxtrOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
@ -13,6 +14,7 @@ def ocr_engines():
            EasyOcrModel,
            OcrMacModel,
            RapidOcrModel,
            OnnxtrOcrModel,
            TesseractOcrModel,
            TesseractOcrCliModel,
        ]
--- a/pyproject.toml
+++ b/pyproject.toml
@ -72,11 +72,12 @@ openpyxl = "^3.1.5"
 lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
 onnxtr = { extras= ["gpu", "viz"], version = "^0.6.3", optional = true, markers = "python_version < '3.13'" }
 onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
 ]
 transformers = [
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
    OcrOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    OnnxtrOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
@ -62,6 +63,7 @@ def test_e2e_conversions():
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
        EasyOcrOptions(force_full_page_ocr=True),
        OnnxtrOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
        TesseractCliOcrOptions(force_full_page_ocr=True),