[WIP] introducing extra backend abstraction and input formats

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-09-25 11:17:49 +02:00 · 2024-09-25 11:17:49 +02:00 · 95c539579d
commit 95c539579d
parent 850a521195
3 changed files with 44 additions and 23 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@ -10,6 +10,24 @@ if TYPE_CHECKING:
    from docling.datamodel.base_models import Cell
 class AbstractDocumentBackend(ABC):
    """Abstract base for document backends built from a path or a byte stream.

    Every method is abstract, so concrete backends must override all three.
    The bodies given here are shared default behavior that an override can
    reuse through ``super()``.
    """

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Remember the document source and its hash on the instance.

        Args:
            path_or_stream: Filesystem path or in-memory stream of the document.
            document_hash: Hash string identifying the document.
        """
        self.path_or_stream, self.document_hash = path_or_stream, document_hash

    @abstractmethod
    def is_valid(self) -> bool:
        """Report whether the backend holds a usable document.

        The base implementation does nothing and returns ``None``.
        """

    @abstractmethod
    def unload(self):
        """Release the document source held by this backend.

        An in-memory stream is closed; a path needs no cleanup. In both cases
        the reference is dropped afterwards.
        """
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            source.close()
        self.path_or_stream = None
 class PdfPageBackend(ABC):
    @abstractmethod
@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
        pass
-class PdfDocumentBackend(ABC):
+class PdfDocumentBackend(AbstractDocumentBackend):
    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        self.path_or_stream = path_or_stream
        self.document_hash = document_hash
    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        pass
@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
    @abstractmethod
    def page_count(self) -> int:
        pass
    @abstractmethod
    def is_valid(self) -> bool:
        pass
    @abstractmethod
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
    PARTIAL_SUCCESS = auto()
 class InputFormat(str, Enum):
    """Enumeration of document input formats.

    Members are also ``str`` instances because of the ``str`` mixin. Their
    values are generated by ``auto()`` in declaration order, so reordering
    members changes the values.
    """

    # NOTE(review): presumably used to pick a backend per format — confirm
    # against the callers that key lookups on these members.
    DOCX = auto()
    PPTX = auto()
    HTML = auto()
    IMAGE = auto()
    PDF = auto()
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
 from pydantic import BaseModel
 from typing_extensions import deprecated
-from docling.backend.abstract_backend import PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import (
    AssembledUnit,
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
    DocumentStream,
    ErrorItem,
    FigureElement,
    InputFormat,
    Page,
    PageElement,
    Table,
@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
    description={}, file_info=FileInfo(document_hash="123xyz")
 )  # TODO: Stub
 # Default backend class for each input format. ``None`` marks a format that
 # has no default backend registered; callers must check for it before
 # instantiating. Every InputFormat member has an entry so that a lookup
 # never raises KeyError.
 _input_format_default_backends: Dict[
    InputFormat, Optional[Type[AbstractDocumentBackend]]
 ] = {
    InputFormat.PDF: DoclingParseDocumentBackend,
    InputFormat.DOCX: None,
    InputFormat.PPTX: None,
    InputFormat.HTML: None,
    InputFormat.IMAGE: None,
 }
 class InputDocument(BaseModel):
    file: PurePath = None
@ -82,10 +90,13 @@ class InputDocument(BaseModel):
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
-        pdf_backend=DoclingParseDocumentBackend,
+        backend: Optional[Type[AbstractDocumentBackend]] = None,
    ):
        super().__init__()
        if not backend:
            backend = _input_format_default_backends[InputFormat.PDF]
        self.limits = limits or DocumentLimits()
        try:
@ -96,7 +107,7 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
+                    self._backend = backend(
                        path_or_stream=path_or_stream, document_hash=self.document_hash
                    )
@ -108,7 +119,7 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
+                    self._backend = backend(
                        path_or_stream=path_or_stream, document_hash=self.document_hash
                    )
@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
+                    path_or_stream=obj, limits=self.limits, backend=pdf_backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
-                    pdf_backend=pdf_backend,
+                    backend=pdf_backend,
                )
    @classmethod