mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
feat: add support for user-provided OCR model
The ocr_model field added to the OcrOptions class with a reference to the BaseOcrModel inheritor class. In case the options are not one of supported model options, the class from this field is used. Signed-off-by: vdaleke <vdalekesmirnov@gmail.com>
This commit is contained in:
parent
cf78d5b7b9
commit
898a497e71
@ -2,11 +2,13 @@ import logging
|
||||
import os
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Type, Union
|
||||
|
||||
from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -85,6 +87,7 @@ class OcrOptions(BaseModel):
|
||||
bitmap_area_threshold: float = (
|
||||
0.05 # percentage of the area for a bitmap to processed with OCR
|
||||
)
|
||||
ocr_model: Optional[Type[BaseOcrModel]] = None
|
||||
|
||||
|
||||
class RapidOcrOptions(OcrOptions):
|
||||
@ -151,6 +154,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
|
||||
tesseract_cmd: str = "tesseract"
|
||||
path: Optional[str] = None
|
||||
|
||||
@ -164,6 +168,7 @@ class TesseractOcrOptions(OcrOptions):
|
||||
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
|
||||
path: Optional[str] = None
|
||||
|
||||
model_config = ConfigDict(
|
||||
@ -176,6 +181,7 @@ class OcrMacOptions(OcrOptions):
|
||||
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
|
||||
recognition: str = "accurate"
|
||||
framework: str = "vision"
|
||||
|
||||
@ -271,13 +277,7 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
do_picture_description: bool = False # True: run describe pictures in documents
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[
|
||||
EasyOcrOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
RapidOcrOptions,
|
||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
||||
ocr_options: OcrOptions = EasyOcrOptions()
|
||||
picture_description_options: Annotated[
|
||||
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
||||
Field(discriminator="kind"),
|
||||
|
@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
PictureDescriptionApiOptions,
|
||||
PictureDescriptionVlmOptions,
|
||||
@ -73,6 +74,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||
" You can provide a custom OCR model class in the options."
|
||||
)
|
||||
|
||||
self.build_pipe = [
|
||||
@ -190,6 +192,12 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, OcrOptions):
|
||||
if self.pipeline_options.ocr_options.ocr_model is not None:
|
||||
return self.pipeline_options.ocr_options.ocr_model(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
return None
|
||||
|
||||
def get_picture_description_model(
|
||||
|
Loading…
Reference in New Issue
Block a user