mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
feat: add support for user-provided OCR model
An ocr_model field is added to the OcrOptions class; it holds a reference to a class that inherits from BaseOcrModel. If the provided options are not one of the supported model options, the class from this field is used instead. Signed-off-by: vdaleke <vdalekesmirnov@gmail.com>
This commit is contained in:
parent
cf78d5b7b9
commit
898a497e71
@ -2,11 +2,13 @@ import logging
|
|||||||
import os
|
import os
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
from typing import Annotated, Any, Dict, List, Literal, Optional, Type, Union
|
||||||
|
|
||||||
from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
|
from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -85,6 +87,7 @@ class OcrOptions(BaseModel):
|
|||||||
bitmap_area_threshold: float = (
|
bitmap_area_threshold: float = (
|
||||||
0.05 # percentage of the area for a bitmap to processed with OCR
|
0.05 # percentage of the area for a bitmap to processed with OCR
|
||||||
)
|
)
|
||||||
|
ocr_model: Optional[Type[BaseOcrModel]] = None
|
||||||
|
|
||||||
|
|
||||||
class RapidOcrOptions(OcrOptions):
|
class RapidOcrOptions(OcrOptions):
|
||||||
@ -151,6 +154,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
|
||||||
tesseract_cmd: str = "tesseract"
|
tesseract_cmd: str = "tesseract"
|
||||||
path: Optional[str] = None
|
path: Optional[str] = None
|
||||||
|
|
||||||
@ -164,6 +168,7 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
kind: Literal["tesserocr"] = "tesserocr"
|
kind: Literal["tesserocr"] = "tesserocr"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
|
||||||
path: Optional[str] = None
|
path: Optional[str] = None
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
@ -176,6 +181,7 @@ class OcrMacOptions(OcrOptions):
|
|||||||
|
|
||||||
kind: Literal["ocrmac"] = "ocrmac"
|
kind: Literal["ocrmac"] = "ocrmac"
|
||||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||||
|
|
||||||
recognition: str = "accurate"
|
recognition: str = "accurate"
|
||||||
framework: str = "vision"
|
framework: str = "vision"
|
||||||
|
|
||||||
@ -271,13 +277,7 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
do_picture_description: bool = False # True: run describe pictures in documents
|
do_picture_description: bool = False # True: run describe pictures in documents
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
ocr_options: Union[
|
ocr_options: OcrOptions = EasyOcrOptions()
|
||||||
EasyOcrOptions,
|
|
||||||
TesseractCliOcrOptions,
|
|
||||||
TesseractOcrOptions,
|
|
||||||
OcrMacOptions,
|
|
||||||
RapidOcrOptions,
|
|
||||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
|
||||||
picture_description_options: Annotated[
|
picture_description_options: Annotated[
|
||||||
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
||||||
Field(discriminator="kind"),
|
Field(discriminator="kind"),
|
||||||
|
@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
|
|||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
|
OcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
PictureDescriptionApiOptions,
|
PictureDescriptionApiOptions,
|
||||||
PictureDescriptionVlmOptions,
|
PictureDescriptionVlmOptions,
|
||||||
@ -73,6 +74,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||||
|
" You can provide a custom OCR model class in the options."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
@ -190,6 +192,12 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
enabled=self.pipeline_options.do_ocr,
|
enabled=self.pipeline_options.do_ocr,
|
||||||
options=self.pipeline_options.ocr_options,
|
options=self.pipeline_options.ocr_options,
|
||||||
)
|
)
|
||||||
|
elif isinstance(self.pipeline_options.ocr_options, OcrOptions):
|
||||||
|
if self.pipeline_options.ocr_options.ocr_model is not None:
|
||||||
|
return self.pipeline_options.ocr_options.ocr_model(
|
||||||
|
enabled=self.pipeline_options.do_ocr,
|
||||||
|
options=self.pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_picture_description_model(
|
def get_picture_description_model(
|
||||||
|
Loading…
Reference in New Issue
Block a user