feat(experimental): Add experimental TableCropsLayoutModel (#2669)

* feat: Scaffolding for layout and table model plugin factory

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add missing files

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add base options classes for layout and table

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* feat(experimental): Add experimental TableCropsLayoutModel

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add example

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-11-25 05:14:51 +01:00
parent b75c6461f4
commit 134436245a
8 changed files with 184 additions and 6 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -71,6 +71,7 @@ from docling.datamodel.pipeline_options import (
    PipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
+    TableStructureOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    VlmPipelineOptions,
@@ -645,10 +646,13 @@ def convert(  # noqa: C901
                do_picture_classification=enrich_picture_classes,
                document_timeout=document_timeout,
            )
-            pipeline_options.table_structure_options.do_cell_matching = (
-                True  # do_cell_matching
-            )
-            pipeline_options.table_structure_options.mode = table_mode
+            if isinstance(
+                pipeline_options.table_structure_options, TableStructureOptions
+            ):
+                pipeline_options.table_structure_options.do_cell_matching = (
+                    True  # do_cell_matching
+                )
+                pipeline_options.table_structure_options.mode = table_mode

            if image_export_mode != ImageRefMode.PLACEHOLDER:
                pipeline_options.generate_page_images = True
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -354,9 +354,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    )
    # If True, text from backend will be used instead of generated text

-    table_structure_options: TableStructureOptions = TableStructureOptions()
+    table_structure_options: BaseTableStructureOptions = TableStructureOptions()
    ocr_options: OcrOptions = OcrAutoOptions()
-    layout_options: LayoutOptions = LayoutOptions()
+    layout_options: BaseLayoutOptions = LayoutOptions()

    images_scale: float = 1.0
    generate_page_images: bool = False
--- a/docling/experimental/datamodel/table_crops_layout_options.py
+++ b/docling/experimental/datamodel/table_crops_layout_options.py
@@ -0,0 +1,13 @@
+"""Internal options for the experimental TableCrops layout model."""
+
+from typing import ClassVar
+
+from docling.datamodel.pipeline_options import BaseLayoutOptions
+
+__all__ = ["TableCropsLayoutOptions"]
+
+
+class TableCropsLayoutOptions(BaseLayoutOptions):
+    """Options for TableCropsLayoutModel (internal-only)."""
+
+    kind: ClassVar[str] = "docling_experimental_table_crops_layout"
--- a/docling/experimental/models/__init__.py
+++ b/docling/experimental/models/__init__.py
@@ -0,0 +1,3 @@
+"""Experimental models for Docling."""
+
+__all__: list[str] = []  # nothing is re-exported at package level
--- a/docling/experimental/models/table_crops_layout_model.py
+++ b/docling/experimental/models/table_crops_layout_model.py
@@ -0,0 +1,114 @@
+"""Internal TableCrops layout model that marks full pages as table clusters."""
+
+from __future__ import annotations
+
+import warnings
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+from docling_core.types.doc import DocItemLabel
+
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
+from docling.datamodel.document import ConversionResult
+from docling.experimental.datamodel.table_crops_layout_options import (
+    TableCropsLayoutOptions,
+)
+from docling.models.base_layout_model import BaseLayoutModel
+
+__all__ = ["TableCropsLayoutModel"]
+
+
+class TableCropsLayoutModel(BaseLayoutModel):
+    """Experimental layout model that treats the full page as a table cluster.
+    This is useful in cases where a Docling pipeline is applied to images of table crops only.
+
+    This model is internal and not part of the stable public interface.
+    """
+
+    def __init__(
+        self,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        options: TableCropsLayoutOptions,
+    ):
+        self.options = options
+        self.artifacts_path = artifacts_path
+        self.accelerator_options = accelerator_options
+
+    @classmethod
+    def get_options_type(cls) -> type[TableCropsLayoutOptions]:
+        return TableCropsLayoutOptions
+
+    def predict_layout(
+        self,
+        conv_res: ConversionResult,
+        pages: Sequence[Page],
+    ) -> Sequence[LayoutPrediction]:
+        layout_predictions: list[LayoutPrediction] = []
+
+        for page in pages:
+            if page._backend is None or not page._backend.is_valid():
+                existing_prediction = page.predictions.layout or LayoutPrediction()
+                layout_predictions.append(existing_prediction)
+                continue
+
+            clusters = self._build_page_clusters(page)
+            prediction = LayoutPrediction(clusters=clusters)
+
+            self._update_confidence(conv_res, page, clusters)
+
+            layout_predictions.append(prediction)
+
+        return layout_predictions
+
+    def _build_page_clusters(self, page: Page) -> list[Cluster]:
+        page_size = page.size
+        if page_size is None:
+            return []
+
+        bbox = BoundingBox(
+            l=0.0,
+            t=0.0,
+            r=page_size.width,
+            b=page_size.height,
+        )
+
+        cluster = Cluster(
+            id=0,
+            label=DocItemLabel.TABLE,
+            bbox=bbox,
+            confidence=1.0,
+            cells=[],
+        )
+
+        clusters = [cluster]
+
+        if not self.options.skip_cell_assignment:
+            page_cells = list(page.cells)
+            cluster.cells = page_cells
+
+            if not page_cells and not self.options.keep_empty_clusters:
+                clusters = []
+
+        return clusters
+
+    def _update_confidence(
+        self, conv_res: ConversionResult, page: Page, clusters: list[Cluster]
+    ) -> None:
+        """Populate layout and OCR confidence scores for the page."""
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                "Mean of empty slice|invalid value encountered in scalar divide",
+                RuntimeWarning,
+                "numpy",
+            )
+
+            conv_res.confidence.pages[page.page_no].layout_score = 1.0
+
+            ocr_cells = [cell for cell in page.cells if cell.from_ocr]
+            ocr_confidence = float(np.mean([cell.confidence for cell in ocr_cells]))
+            conv_res.confidence.pages[page.page_no].ocr_score = ocr_confidence
--- a/docling/models/plugins/defaults.py
+++ b/docling/models/plugins/defaults.py
@@ -31,11 +31,15 @@ def picture_description():


 def layout_engines():  # returns the layout model classes under the "layout_engines" key
+    from docling.experimental.models.table_crops_layout_model import (  # imported at call time, not at module import
+        TableCropsLayoutModel,
+    )
    from docling.models.layout_model import LayoutModel

    return {
        "layout_engines": [
            LayoutModel,
+            TableCropsLayoutModel,  # experimental: treats the full page as a table cluster
        ]
    }

--- a/docs/examples/experimental/process_table_crops.py
+++ b/docs/examples/experimental/process_table_crops.py
@@ -0,0 +1,40 @@
+"""Run Docling on an image using the experimental TableCrops layout model."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import docling
+from docling.datamodel.document import InputFormat
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+from docling.document_converter import DocumentConverter, ImageFormatOption
+from docling.experimental.datamodel.table_crops_layout_options import (
+    TableCropsLayoutOptions,
+)
+from docling.experimental.models.table_crops_layout_model import TableCropsLayoutModel
+from docling.models.factories import get_layout_factory
+
+
+def main() -> None:
+    sample_image = "tests/data/2305.03393v1-table_crop.png"
+
+    pipeline_options = ThreadedPdfPipelineOptions(
+        layout_options=TableCropsLayoutOptions(),
+        do_table_structure=True,
+        generate_page_images=True,
+    )
+
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.IMAGE],
+        format_options={
+            InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options)
+        },
+    )
+
+    conv_res = converter.convert(sample_image)
+
+    print(conv_res.document.tables[0].export_to_markdown())
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/data/2305.03393v1-table_crop.png
+++ b/tests/data/2305.03393v1-table_crop.png