feat: add support for user-provided OCR model

An ocr_model field is added to the OcrOptions class; it holds a reference to a class that inherits from BaseOcrModel. If the given options are not one of the supported built-in OCR option types, the class from this field is used to build the OCR model. Signed-off-by: vdaleke <vdalekesmirnov@gmail.com>
2025-07-31 14:34:40 +00:00 · 2025-02-07 14:36:28 +03:00 · 2025-02-07 14:36:28 +03:00 · 898a497e71
commit 898a497e71
parent cf78d5b7b9
2 changed files with 16 additions and 8 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -2,11 +2,13 @@ import logging
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Type, Union

 from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict

+from docling.models.base_ocr_model import BaseOcrModel
+
 _log = logging.getLogger(__name__)


@ -85,6 +87,7 @@ class OcrOptions(BaseModel):
    bitmap_area_threshold: float = (
        0.05  # percentage of the area for a bitmap to processed with OCR
    )
+    ocr_model: Optional[Type[BaseOcrModel]] = None


 class RapidOcrOptions(OcrOptions):
@ -151,6 +154,7 @@ class TesseractCliOcrOptions(OcrOptions):

    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
+
    tesseract_cmd: str = "tesseract"
    path: Optional[str] = None

@ -164,6 +168,7 @@ class TesseractOcrOptions(OcrOptions):

    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
+
    path: Optional[str] = None

    model_config = ConfigDict(
@ -176,6 +181,7 @@ class OcrMacOptions(OcrOptions):

    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
+
    recognition: str = "accurate"
    framework: str = "vision"

@ -271,13 +277,7 @@ class PdfPipelineOptions(PipelineOptions):
    do_picture_description: bool = False  # True: run describe pictures in documents

    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[
-        EasyOcrOptions,
-        TesseractCliOcrOptions,
-        TesseractOcrOptions,
-        OcrMacOptions,
-        RapidOcrOptions,
-    ] = Field(EasyOcrOptions(), discriminator="kind")
+    ocr_options: OcrOptions = EasyOcrOptions()
    picture_description_options: Annotated[
        Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
        Field(discriminator="kind"),
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
+    OcrOptions,
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionVlmOptions,
@ -73,6 +74,7 @@ class StandardPdfPipeline(PaginatedPipeline):
        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+                " You can provide a custom OCR model class in the options."
            )

        self.build_pipe = [
@ -190,6 +192,12 @@ class StandardPdfPipeline(PaginatedPipeline):
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
+        elif isinstance(self.pipeline_options.ocr_options, OcrOptions):
+            if self.pipeline_options.ocr_options.ocr_model is not None:
+                return self.pipeline_options.ocr_options.ocr_model(
+                    enabled=self.pipeline_options.do_ocr,
+                    options=self.pipeline_options.ocr_options,
+                )
        return None

    def get_picture_description_model(