[WIP] introducing extra backend abstraction and input formats

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-09-25 11:17:49 +02:00
parent 850a521195
commit 95c539579d
3 changed files with 44 additions and 23 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -10,6 +10,24 @@ if TYPE_CHECKING:
    from docling.datamodel.base_models import Cell


+class AbstractDocumentBackend(ABC):
+    @abstractmethod
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        self.path_or_stream = path_or_stream
+        self.document_hash = document_hash
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        pass
+
+    @abstractmethod
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+
 class PdfPageBackend(ABC):

    @abstractmethod
@@ -43,12 +61,7 @@ class PdfPageBackend(ABC):
        pass


-class PdfDocumentBackend(ABC):
-    @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        self.path_or_stream = path_or_stream
-        self.document_hash = document_hash
-
+class PdfDocumentBackend(AbstractDocumentBackend):
    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        pass
@@ -56,14 +69,3 @@ class PdfDocumentBackend(ABC):
    @abstractmethod
    def page_count(self) -> int:
        pass
-
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-
-    @abstractmethod
-    def unload(self):
-        if isinstance(self.path_or_stream, BytesIO):
-            self.path_or_stream.close()
-
-        self.path_or_stream = None
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -22,6 +22,14 @@ class ConversionStatus(str, Enum):
    PARTIAL_SUCCESS = auto()


+class InputFormat(str, Enum):
+    DOCX = auto()
+    PPTX = auto()
+    HTML = auto()
+    IMAGE = auto()
+    PDF = auto()
+
+
 class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -16,7 +16,7 @@ from docling_core.types.experimental.labels import PageLabel
 from pydantic import BaseModel
 from typing_extensions import deprecated

-from docling.backend.abstract_backend import PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import (
    AssembledUnit,
@@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
    DocumentStream,
    ErrorItem,
    FigureElement,
+    InputFormat,
    Page,
    PageElement,
    Table,
@@ -65,6 +66,13 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
    description={}, file_info=FileInfo(document_hash="123xyz")
 )  # TODO: Stub

+_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
+    InputFormat.PDF: DoclingParseDocumentBackend,
+    InputFormat.DOCX: None,
+    InputFormat.PPTX: None,
+    InputFormat.IMAGE: None,
+}
+

 class InputDocument(BaseModel):
    file: PurePath = None
@@ -82,10 +90,13 @@ class InputDocument(BaseModel):
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
-        pdf_backend=DoclingParseDocumentBackend,
+        backend: Optional[Type[AbstractDocumentBackend]] = None,
    ):
        super().__init__()

+        if not backend:
+            backend = _input_format_default_backends[InputFormat.PDF]
+
        self.limits = limits or DocumentLimits()

        try:
@@ -96,7 +107,7 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
+                    self._backend = backend(
                        path_or_stream=path_or_stream, document_hash=self.document_hash
                    )

@@ -108,7 +119,7 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(
+                    self._backend = backend(
                        path_or_stream=path_or_stream, document_hash=self.document_hash
                    )

@@ -435,14 +446,14 @@ class DocumentConversionInput(BaseModel):
        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
+                    path_or_stream=obj, limits=self.limits, backend=pdf_backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
-                    pdf_backend=pdf_backend,
+                    backend=pdf_backend,
                )

    @classmethod