feat: add factory for ocr engines via plugins (#1010)

* add factory for ocr engines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply pre-commit after rebase

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add picture description factory

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix enable option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* switch to create methods

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* make `options` an explicit kwarg

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* keep old lock of docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add allow_external_plugins option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add factory return and ignore options type

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-03-18 13:58:05 +01:00
committed by GitHub
parent 3960b199d6
commit 6eaae3cba0
21 changed files with 485 additions and 158 deletions

View File

@@ -1,14 +1,22 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional
from typing import Any, Generic, Iterable, Optional, Protocol, Type
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import BaseOptions
from docling.datamodel.settings import settings
class BaseModelWithOptions(Protocol):
@classmethod
def get_options_type(cls) -> Type[BaseOptions]: ...
def __init__(self, *, options: BaseOptions, **kwargs): ...
class BasePageModel(ABC):
@abstractmethod
def __call__(

View File

@@ -2,7 +2,7 @@ import copy
import logging
from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List
from typing import Iterable, List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -13,15 +13,22 @@ from scipy.ndimage import binary_dilation, find_objects, label
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.base_model import BaseModelWithOptions, BasePageModel
_log = logging.getLogger(__name__)
class BaseOcrModel(BasePageModel):
def __init__(self, enabled: bool, options: OcrOptions):
class BaseOcrModel(BasePageModel, BaseModelWithOptions):
def __init__(
self,
*,
enabled: bool,
artifacts_path: Optional[Path],
options: OcrOptions,
accelerator_options: AcceleratorOptions,
):
self.enabled = enabled
self.options = options
@@ -186,3 +193,8 @@ class BaseOcrModel(BasePageModel):
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass
@classmethod
@abstractmethod
def get_options_type(cls) -> Type[OcrOptions]:
pass

View File

@@ -2,7 +2,7 @@ import logging
import warnings
import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
from typing import Iterable, List, Optional, Type
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,6 +14,7 @@ from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
@@ -34,7 +35,12 @@ class EasyOcrModel(BaseOcrModel):
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
super().__init__(
enabled=enabled,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: EasyOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -180,3 +186,7 @@ class EasyOcrModel(BaseOcrModel):
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return EasyOcrOptions

View File

@@ -0,0 +1,27 @@
import logging
from functools import lru_cache
from docling.models.factories.ocr_factory import OcrFactory
from docling.models.factories.picture_description_factory import (
PictureDescriptionFactory,
)
logger = logging.getLogger(__name__)
@lru_cache()
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
logger.info("Registered ocr engines: %r", factory.registered_kind)
return factory
@lru_cache()
def get_picture_description_factory(
allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
factory = PictureDescriptionFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
logger.info("Registered picture descriptions: %r", factory.registered_kind)
return factory

View File

@@ -0,0 +1,122 @@
import enum
import logging
from abc import ABCMeta
from typing import Generic, Optional, Type, TypeVar
from pluggy import PluginManager
from pydantic import BaseModel
from docling.datamodel.pipeline_options import BaseOptions
from docling.models.base_model import BaseModelWithOptions
A = TypeVar("A", bound=BaseModelWithOptions)
logger = logging.getLogger(__name__)
class FactoryMeta(BaseModel):
kind: str
plugin_name: str
module: str
class BaseFactory(Generic[A], metaclass=ABCMeta):
default_plugin_name = "docling"
def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
self.plugin_name = plugin_name
self.plugin_attr_name = plugin_attr_name
self._classes: dict[Type[BaseOptions], Type[A]] = {}
self._meta: dict[Type[BaseOptions], FactoryMeta] = {}
@property
def registered_kind(self) -> list[str]:
return list(opt.kind for opt in self._classes.keys())
def get_enum(self) -> enum.Enum:
return enum.Enum(
self.plugin_attr_name + "_enum",
names={kind: kind for kind in self.registered_kind},
type=str,
module=__name__,
)
@property
def classes(self):
return self._classes
@property
def registered_meta(self):
return self._meta
def create_instance(self, options: BaseOptions, **kwargs) -> A:
try:
_cls = self._classes[type(options)]
return _cls(options=options, **kwargs)
except KeyError:
raise RuntimeError(self._err_msg_on_class_not_found(options.kind))
def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
for opt_cls, _ in self._classes.items():
if opt_cls.kind == kind:
return opt_cls(*args, **kwargs)
raise RuntimeError(self._err_msg_on_class_not_found(kind))
def _err_msg_on_class_not_found(self, kind: str):
msg = []
for opt, cls in self._classes.items():
msg.append(f"\t{opt.kind!r} => {cls!r}")
msg_str = "\n".join(msg)
return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"
def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
opt_type = cls.get_options_type()
if opt_type in self._classes:
raise ValueError(
f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
)
self._classes[opt_type] = cls
self._meta[opt_type] = FactoryMeta(
kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
)
def load_from_plugins(
self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
):
plugin_name = plugin_name or self.plugin_name
plugin_manager = PluginManager(plugin_name)
plugin_manager.load_setuptools_entrypoints(plugin_name)
for plugin_name, plugin_module in plugin_manager.list_name_plugin():
plugin_module_name = str(plugin_module.__name__) # type: ignore
if not allow_external_plugins and not plugin_module_name.startswith(
"docling."
):
logger.warning(
f"The plugin {plugin_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
)
continue
attr = getattr(plugin_module, self.plugin_attr_name, None)
if callable(attr):
logger.info("Loading plugin %r", plugin_name)
config = attr()
self.process_plugin(config, plugin_name, plugin_module_name)
def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
for item in config[self.plugin_attr_name]:
try:
self.register(item, plugin_name, plugin_module_name)
except ValueError:
logger.warning("%r already registered", item)

View File

@@ -0,0 +1,11 @@
import logging
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.factories.base_factory import BaseFactory
logger = logging.getLogger(__name__)
class OcrFactory(BaseFactory[BaseOcrModel]):
def __init__(self, *args, **kwargs):
super().__init__("ocr_engines", *args, **kwargs)

View File

@@ -0,0 +1,11 @@
import logging
from docling.models.factories.base_factory import BaseFactory
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
logger = logging.getLogger(__name__)
class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
def __init__(self, *args, **kwargs):
super().__init__("picture_description", *args, **kwargs)

View File

@@ -1,13 +1,19 @@
import logging
import sys
import tempfile
from typing import Iterable, Optional, Tuple
from pathlib import Path
from typing import Iterable, Optional, Tuple, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrMacOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrMacOptions,
OcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
@@ -16,13 +22,26 @@ _log = logging.getLogger(__name__)
class OcrMacModel(BaseOcrModel):
def __init__(self, enabled: bool, options: OcrMacOptions):
super().__init__(enabled=enabled, options=options)
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: OcrMacOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(
enabled=enabled,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: OcrMacOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled:
if "darwin" != sys.platform:
raise RuntimeError(f"OcrMac is only supported on Mac.")
install_errmsg = (
"ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -121,3 +140,7 @@ class OcrMacModel(BaseOcrModel):
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return OcrMacOptions

View File

@@ -1,13 +1,18 @@
import base64
import io
import logging
from typing import Iterable, List, Optional
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union
import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionApiOptions,
PictureDescriptionBaseOptions,
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
@@ -46,13 +51,25 @@ class ApiResponse(BaseModel):
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionApiOptions
def __init__(
self,
enabled: bool,
enable_remote_services: bool,
artifacts_path: Optional[Union[Path, str]],
options: PictureDescriptionApiOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
super().__init__(
enabled=enabled,
enable_remote_services=enable_remote_services,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: PictureDescriptionApiOptions
if self.enabled:

View File

@@ -1,6 +1,7 @@
import logging
from abc import abstractmethod
from pathlib import Path
from typing import Any, Iterable, List, Optional, Union
from typing import Any, Iterable, List, Optional, Type, Union
from docling_core.types.doc import (
DoclingDocument,
@@ -13,20 +14,30 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
)
from PIL import Image
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
)
from docling.models.base_model import (
BaseItemAndImageEnrichmentModel,
BaseModelWithOptions,
ItemAndImageEnrichmentElement,
)
class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
class PictureDescriptionBaseModel(
BaseItemAndImageEnrichmentModel, BaseModelWithOptions
):
images_scale: float = 2.0
def __init__(
self,
*,
enabled: bool,
enable_remote_services: bool,
artifacts_path: Optional[Union[Path, str]],
options: PictureDescriptionBaseOptions,
accelerator_options: AcceleratorOptions,
):
self.enabled = enabled
self.options = options
@@ -62,3 +73,8 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
PictureDescriptionData(text=output, provenance=self.provenance)
)
yield item
@classmethod
@abstractmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
pass

View File

@@ -1,10 +1,11 @@
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Iterable, Optional, Type, Union
from PIL import Image
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionBaseOptions,
PictureDescriptionVlmOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
@@ -13,14 +14,25 @@ from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionVlmOptions
def __init__(
self,
enabled: bool,
enable_remote_services: bool,
artifacts_path: Optional[Union[Path, str]],
options: PictureDescriptionVlmOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
super().__init__(
enabled=enabled,
enable_remote_services=enable_remote_services,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: PictureDescriptionVlmOptions
if self.enabled:

View File

View File

@@ -0,0 +1,28 @@
from docling.models.easyocr_model import EasyOcrModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.picture_description_api_model import PictureDescriptionApiModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
def ocr_engines():
return {
"ocr_engines": [
EasyOcrModel,
OcrMacModel,
RapidOcrModel,
TesseractOcrModel,
TesseractOcrCliModel,
]
}
def picture_description():
return {
"picture_description": [
PictureDescriptionVlmModel,
PictureDescriptionApiModel,
]
}

View File

@@ -1,5 +1,6 @@
import logging
from typing import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -10,6 +11,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
OcrOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
@@ -24,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: RapidOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
super().__init__(
enabled=enabled,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: RapidOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -135,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return RapidOcrOptions

View File

@@ -3,8 +3,9 @@ import io
import logging
import os
import tempfile
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, List, Optional, Tuple
from typing import Iterable, List, Optional, Tuple, Type
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -12,7 +13,11 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractCliOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
@@ -22,8 +27,19 @@ _log = logging.getLogger(__name__)
class TesseractOcrCliModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
super().__init__(enabled=enabled, options=options)
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: TesseractCliOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(
enabled=enabled,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: TesseractCliOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -257,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return TesseractCliOcrOptions

View File

@@ -1,12 +1,17 @@
import logging
from typing import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
OcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
@@ -16,8 +21,19 @@ _log = logging.getLogger(__name__)
class TesseractOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractOcrOptions):
super().__init__(enabled=enabled, options=options)
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: TesseractOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(
enabled=enabled,
artifacts_path=artifacts_path,
options=options,
accelerator_options=accelerator_options,
)
self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -200,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return TesseractOcrOptions