Add docTR OCR engine to the pipeline

2025-07-27 04:24:45 +00:00 · 2025-04-29 12:25:31 +00:00 · 2025-04-29 12:25:31 +00:00 · 2acce04305
commit 2acce04305
parent 976e92e289
3 changed files with 126 additions and 0 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -196,6 +196,15 @@ class TesseractOcrOptions(OcrOptions):
    )
 class DoctrOcrOptions(OcrOptions):
    """Options for the docTR OCR engine."""
    # Engine identifier for this options class (class-level, not a per-instance field).
    kind: ClassVar[Literal["doctr"]] = "doctr"
    # Optional list of languages; defaults to None.
    # NOTE(review): nothing visible in this change reads `lang` - confirm whether docTR needs it.
    lang: Optional[List[str]] = None
    # Name of the docTR architecture to load.
    # Presumably a text-detection architecture, given the default - confirm against DoctrOcrModel.
    model_name: str = "db_resnet50"
    # Whether pretrained weights should be loaded - presumably forwarded to docTR's
    # `ocr_predictor`; confirm in DoctrOcrModel.
    pretrained: bool = True
 class OcrMacOptions(OcrOptions):
    """Options for the Mac OCR engine."""
--- a/docling/models/doctr_ocr_model.py
+++ b/docling/models/doctr_ocr_model.py
@ -0,0 +1,115 @@
 import logging
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional, Type
 import numpy as np
 from io import BytesIO
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    OcrOptions,
    DoctrOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class DoctrOcrModel(BaseOcrModel):
    """OCR model that runs docTR's ``ocr_predictor`` on whole page images.

    Each valid page is rendered at scale 1, passed to docTR, and every
    recognised text line is turned into a ``TextCell`` that is merged into
    ``page.cells`` via ``post_process_cells``.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: OcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Create the model and, when enabled, load the docTR predictor.

        Args:
            enabled: When False no predictor is built and pages pass through.
            artifacts_path: Forwarded to the base class unchanged.
            options: OCR options; ``model_name`` and ``pretrained`` are read
                when present (see ``DoctrOcrOptions``).
            accelerator_options: Forwarded to the base class unchanged.

        Raises:
            ImportError: If ``python-doctr`` is not installed while enabled.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.model = None
        if not self.enabled:
            return

        try:
            from doctr.models import ocr_predictor
        except ImportError as err:
            raise ImportError(
                "The 'python-doctr' library is not installed. Install it via `pip install python-doctr`."
            ) from err

        _log.debug("Initializing Doctr OCR engine")
        # Honour the configured options instead of hard-coding them. The
        # fallbacks mirror the DoctrOcrOptions defaults so that a plain
        # OcrOptions instance still works.
        det_arch = getattr(options, "model_name", "db_resnet50")
        pretrained = getattr(options, "pretrained", True)
        self.model = ocr_predictor(det_arch=det_arch, pretrained=pretrained)

    def __call__(self, conv_res: ConversionResult, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every page of the batch and yield the pages in order.

        Pages are yielded untouched when the model is disabled or when the
        page backend reports itself as invalid.
        """
        if not self.enabled or self.model is None:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "ocr"):
                pil_image = page._backend.get_page_image(scale=1).convert("RGB")
                # The predictor accepts a list of HxWx3 uint8 arrays directly,
                # so no PNG encode/decode round trip is needed. np.array
                # (not asarray) yields a writable copy.
                result = self.model([np.array(pil_image)])

                all_cells = []
                if result.pages:
                    img_width, img_height = pil_image.size
                    all_cells = self._cells_from_doctr_page(
                        result.pages[0], img_width, img_height
                    )

                # Attach the OCR cells to the page
                page.cells = self.post_process_cells(all_cells, page.cells)

            if settings.debug.visualize_ocr:
                # The whole page is OCR'd, so there are no separate OCR
                # regions to draw; only the resulting cells are visualised.
                self.draw_ocr_rects_and_cells(conv_res, page, [])

            yield page

    @staticmethod
    def _cells_from_doctr_page(doc_page, img_width: float, img_height: float) -> list:
        """Convert the text lines of one docTR page into ``TextCell`` objects.

        docTR reports geometry relative to the page size (values in 0..1), so
        coordinates are multiplied by the rendered image size. The image is
        rendered at scale 1, so pixel coordinates are used as page coordinates
        (NOTE(review): assumes scale=1 pixels equal page units - confirm
        against the page backend).
        """
        cells = []
        for block in doc_page.blocks:
            for line in block.lines:
                words = line.words
                if not words:
                    # A line without words carries no text; skip it rather
                    # than emit an empty cell with confidence 0.
                    continue

                line_text = " ".join(w.value for w in words)
                conf = float(np.mean([w.confidence for w in words]))

                # Works for both the 2-point ((xmin, ymin), (xmax, ymax)) form
                # and a polygon: take the axis-aligned extent of all points.
                points = np.asarray(line.geometry, dtype=float).reshape(-1, 2)
                rel_left, rel_top = points.min(axis=0)
                rel_right, rel_bottom = points.max(axis=0)

                cells.append(
                    TextCell(
                        index=len(cells),
                        text=line_text,
                        orig=line_text,
                        from_ocr=True,
                        confidence=conf,
                        rect=BoundingRectangle.from_bounding_box(
                            BoundingBox.from_tuple(
                                coord=(
                                    float(rel_left) * img_width,
                                    float(rel_top) * img_height,
                                    float(rel_right) * img_width,
                                    float(rel_bottom) * img_height,
                                ),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        ),
                    )
                )
        return cells

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class associated with this OCR model."""
        return DoctrOcrOptions
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@ -5,6 +5,7 @@ from docling.models.picture_description_vlm_model import PictureDescriptionVlmMo
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.models.doctr_ocr_model import DoctrOcrModel
 def ocr_engines():
@ -15,6 +16,7 @@ def ocr_engines():
            RapidOcrModel,
            TesseractOcrModel,
            TesseractOcrCliModel,
            DoctrOcrModel,
        ]
    }