Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-17 00:58:25 +00:00 · 2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):

    @classmethod
    @abstractmethod
-    def is_paginated(cls) -> bool:
+    def supports_pagination(cls) -> bool:
        pass

    @abstractmethod
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def is_valid(self) -> bool:
        return True

-    def is_paginated(cls) -> bool:
+    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
    def is_valid(self) -> bool:
        return self.valid

-    def is_paginated(cls) -> bool:
+    def supports_pagination(cls) -> bool:
        return True  # True? if so, how to handle pages...

    def unload(self):
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def is_valid(self) -> bool:
        return True

-    def is_paginated(cls) -> bool:
+    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
 from docling_core.types.experimental import BoundingBox, Size
 from PIL import Image

-from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.abstract_backend import PaginatedDocumentBackend
 from docling.datamodel.base_models import Cell, InputFormat


@@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
        pass


-class PdfDocumentBackend(AbstractDocumentBackend):
+class PdfDocumentBackend(PaginatedDocumentBackend):
    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        pass
@@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
        return {InputFormat.PDF}

    @classmethod
-    def is_paginated(cls) -> bool:
+    def supports_pagination(cls) -> bool:
        return True
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.pdf_document_converter import PdfDocumentConverter
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -195,9 +196,13 @@ def convert(
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-    doc_converter = PdfDocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=pdf_backend,
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options, backend=pdf_backend
+            )
+        }
    )

    # Define input files
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -168,5 +168,5 @@ class Page(BaseModel):
 class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    filename: str
+    name: str
    stream: BytesIO
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
    description=DescriptionItem(), name="dummy"
 )  # TODO: Stub

-_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
-    InputFormat.PDF: DoclingParseDocumentBackend,
-    InputFormat.HTML: HTMLDocumentBackend,
-    InputFormat.DOCX: MsWordDocumentBackend,
-    InputFormat.PPTX: MsPowerpointDocumentBackend,
-    InputFormat.IMAGE: None,
-}
-

 class InputDocument(BaseModel):
    file: PurePath = None
@@ -110,14 +102,12 @@ class InputDocument(BaseModel):

        try:
            if isinstance(path_or_stream, Path):
-
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-
                    self._init_doc(backend, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
@@ -128,12 +118,11 @@ class InputDocument(BaseModel):
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
-
                    self._init_doc(backend, path_or_stream)

            # For paginated backends, check if the maximum page count is exceeded.
            if self.valid and self._backend.is_valid():
-                if self._backend.is_paginated():
+                if self._backend.supports_pagination():
                    self.page_count = self._backend.page_count()
                    if not self.page_count <= self.limits.max_num_pages:
                        self.valid = False
@@ -156,12 +145,10 @@ class InputDocument(BaseModel):
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
        if backend is None:
-            backend = _input_format_default_backends.get(self.format)
-            if backend is None:
-                self.valid = False
-                raise RuntimeError(
-                    f"Could not find suitable backend for file: {self.file}"
-                )
+            raise RuntimeError(
+                f"No backend configuration provided for file {self.file} with format {self.format}. "
+                f"Please check your format configuration on DocumentConverter."
+            )

        self._backend = backend(
            path_or_stream=path_or_stream, document_hash=self.document_hash
@@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
    ) -> Iterable[InputDocument]:

        for obj in self._path_or_stream_iterator:
+            format = self._guess_format(obj)
+            if format not in format_options.keys():
+                _log.debug(
+                    f"Skipping input document {obj.name} because its format is not in the whitelist."
+                )
+                continue
+            else:
+                backend = format_options.get(format).backend
+
            if isinstance(obj, Path):
-
-                mime = filetype.guess_mime(str(obj))
-                if mime is None:
-                    if obj.suffix == ".html":
-                        mime = "text/html"
-
-                format = MimeTypeToFormat.get(mime)
-                if format not in format_options.keys():
-                    continue
-                else:
-                    backend = format_options.get(format).backend
-
                yield InputDocument(
                    path_or_stream=obj,
                    format=format,
+                    filename=obj.name,
                    limits=self.limits,
                    backend=backend,
                )
            elif isinstance(obj, DocumentStream):
-                mime = filetype.guess_mime(obj.stream.read(8192))
-                obj.stream.seek(0)
-
-                if mime is None:
-                    if obj.suffix == ".html":
-                        mime = "text/html"
-
-                format = MimeTypeToFormat.get(mime)
-                if format not in format_options.keys():
-                    continue
-                else:
-                    backend = format_options.get(format).backend
-
                yield InputDocument(
                    path_or_stream=obj.stream,
                    format=format,
-                    filename=obj.filename,
+                    filename=obj.name,
                    limits=self.limits,
                    backend=backend,
                )

+    def _guess_format(self, obj):
+        """Return the format mapped from the guessed MIME type of a Path or DocumentStream (None if unmapped)."""
+        if isinstance(obj, Path):
+            mime = filetype.guess_mime(str(obj))
+        elif isinstance(obj, DocumentStream):
+            mime = filetype.guess_mime(obj.stream.read(8192))
+            obj.stream.seek(0)  # rewind: the sniffed bytes must still reach the backend
+        else:
+            raise TypeError(f"Cannot guess format of {type(obj).__name__} input.")
+        if mime is None and Path(obj.name).suffix == ".html":  # a stream has a name, no suffix
+            mime = "text/html"
+        return MimeTypeToFormat.get(mime)
+
    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        paths = [Path(p) for p in paths]
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,7 +1,7 @@
 import warnings
 from enum import Enum, auto
 from pathlib import Path
-from typing import Annotated, Optional, Self, Union
+from typing import Annotated, Optional, Union

 from pydantic import BaseModel, Field, model_validator

@@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
+    def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -5,10 +5,21 @@ from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Type

 import requests
-from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
+from pydantic import (
+    AnyHttpUrl,
+    BaseModel,
+    ConfigDict,
+    TypeAdapter,
+    ValidationError,
+    field_validator,
+    model_validator,
+)

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import (
    ConversionResult,
@@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
 class FormatOption(BaseModel):
    pipeline_cls: Type[BaseModelPipeline]
    pipeline_options: Optional[PipelineOptions] = None
-    backend: Optional[Type[AbstractDocumentBackend]] = None
+    backend: Type[AbstractDocumentBackend]

    model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(
-        self,
-        pipeline_cls: Type[BaseModelPipeline],
-        pipeline_options: Optional[PipelineOptions] = None,
-        backend: Optional[Type[AbstractDocumentBackend]] = None,
-    ):
-        if pipeline_options is None:
-            pipeline_options = pipeline_cls.get_default_options()
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self

-        super().__init__(
-            pipeline_cls=pipeline_cls,
-            pipeline_options=pipeline_options,
-            backend=backend,
-        )
+
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimpleModelPipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
+
+
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimpleModelPipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
+
+
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimpleModelPipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


 class PdfFormatOption(FormatOption):
-    def __init__(
-        self,
-        pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
-        pipeline_options: Optional[PipelineOptions] = None,
-        backend: Optional[Type[AbstractDocumentBackend]] = None,
-    ):
-        if pipeline_cls is None:
-            pipeline_cls = StandardPdfModelPipeline
-        if backend is None:
-            backend = DoclingParseDocumentBackend
-        super().__init__(
-            pipeline_cls=pipeline_cls,
-            pipeline_options=pipeline_options,
-            backend=backend,
-        )
+    pipeline_cls: Type = StandardPdfModelPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend


 _format_to_default_options = {
-    InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
-    InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
-    InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
-    InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
-    InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
+    ),
 }


--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
        _log.info(f"Processing document {in_doc.file.name}")

        if not isinstance(in_doc._backend, PdfDocumentBackend):
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
+            raise RuntimeError(
+                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"Can not convert this with a PDF pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res

        for i in range(0, in_doc.page_count):
            conv_res.pages.append(Page(page_no=i))
--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
            return conv_res

        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
+            raise RuntimeError(
+                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"Can not convert this with simple pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res

        # Instead of running a page-level pipeline to build up the document structure,
        # the backend is expected to be of type DeclarativeDocumentBackend, which can output