Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-15 16:18:22 +00:00 · 2024-12-04 14:21:09 +01:00
parent 11c7c43bad 78fad801fe
commit e97688cd3d
27 changed files with 1581 additions and 835 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union

 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
    Size,
    TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict

@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
    FAILURE = auto()
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()
+    SKIPPED = auto()


 class InputFormat(str, Enum):
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()
+    USER_INPUT = auto()


 class ErrorItem(BaseModel):
@@ -214,10 +218,3 @@ class Page(BaseModel):
    @property
    def image(self) -> Optional[Image]:
        return self.get_image(scale=self._default_image_scale)
-
-
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    name: str
-    stream: BytesIO
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union

 import filetype
 from docling_core.types.doc import (
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -166,12 +166,6 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
-
        self._backend = backend(self, path_or_stream=path_or_stream)
        if not self._backend.is_valid():
            self.valid = False
@@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
        return ds_doc


+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def is_valid(self) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return super().unload()
+
+
 class _DocumentConversionInput(BaseModel):

    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
        for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
            format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                )
-                continue
+                backend = _DummyBackend
            else:
                backend = format_options[format].backend

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field


 class TableFormerMode(str, Enum):
+    """Modes for the TableFormer model."""
+
    FAST = "fast"
    ACCURATE = "accurate"


 class TableStructureOptions(BaseModel):
+    """Options for the table structure."""
+
    do_cell_matching: bool = (
        True
        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):


 class OcrOptions(BaseModel):
+    """OCR options."""
+
    kind: str
    lang: List[str]
    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):


 class RapidOcrOptions(OcrOptions):
+    """Options for the RapidOCR engine."""
+
    kind: Literal["rapidocr"] = "rapidocr"

    # English and chinese are the most commly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):


 class EasyOcrOptions(OcrOptions):
+    """Options for the EasyOCR engine."""
+
    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):


 class TesseractCliOcrOptions(OcrOptions):
+    """Options for the TesseractCli engine."""
+
    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):


 class TesseractOcrOptions(OcrOptions):
+    """Options for the Tesseract engine."""
+
    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):


 class OcrMacOptions(OcrOptions):
+    """Options for the Mac OCR engine."""
+
    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    """Base pipeline options."""
+
    create_legacy_output: bool = (
        True  # This defautl will be set to False on a future version of docling
    )


 class PdfPipelineOptions(PipelineOptions):
+    """Options for the PDF pipeline."""
+
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text