feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)

* Add DoclingParseV3 backend implementation
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use docling-core with docling-parse types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes and test updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test cases
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test units
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back DoclingParse v1 backend, pipeline options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update locks
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: update docling-core to 2.22.0
  Update dependency library docling-core to latest release 2.22.0
  Fix regression tests and ground truth files
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* Ground-truth files updated
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update tests, use TextCell.from_ocr property
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Text fixes, new test data
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename docling backend to v4
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Test all backends, fixes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset all tests to use docling-parse v1 for now
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for DPv4 backend init, better test coverage
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* test_input_doc use default backend
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-11 06:08:09 +00:00 · 2025-03-18 10:38:19 +01:00
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -2,9 +2,9 @@ import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

-from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page

 _log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
        if page.size is None:
            return cells
        for cell in page.cells:
-            new_bbox = cell.bbox.to_top_left_origin(
-                page_height=page.size.height
-            ).normalized(page_size=page.size)
-            is_ocr = isinstance(cell, OcrCell)
-            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
+            new_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page.size.height)
+                .normalized(page_size=page.size)
+            )
+            is_ocr = cell.from_ocr
+            ocr_confidence = cell.confidence
            cells.append(
                {
                    "text": cell.text,
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -5,9 +5,10 @@ from collections import defaultdict
 from typing import Dict, List, Set, Tuple

 from docling_core.types.doc import DocItemLabel, Size
+from docling_core.types.doc.page import TextCell
 from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
+from docling.datamodel.base_models import BoundingBox, Cluster

 _log = logging.getLogger(__name__)

@@ -198,7 +199,7 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
+    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
        """Initialize processor with cells and clusters."""
        """Initialize processor with cells and spatial indices."""
        self.cells = cells
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

-    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
+    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
            next_id = max((c.id for c in self.all_clusters), default=0) + 1
            orphan_clusters = []
            for i, cell in enumerate(unassigned):
-                conf = 1.0
-                if isinstance(cell, OcrCell):
-                    conf = cell.confidence
+                conf = cell.confidence

                orphan_clusters.append(
                    Cluster(
                        id=next_id + i,
                        label=DocItemLabel.TEXT,
-                        bbox=cell.bbox,
+                        bbox=cell.to_bounding_box(),
                        confidence=conf,
                        cells=[cell],
                    )
@@ -557,13 +556,13 @@ class LayoutPostprocessor:

        return current_best if current_best else clusters[0]

-    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
        """Ensure each cell appears only once, maintaining order of first appearance."""
        seen_ids = set()
        unique_cells = []
        for cell in cells:
-            if cell.id not in seen_ids:
-                seen_ids.add(cell.id)
+            if cell.index not in seen_ids:
+                seen_ids.add(cell.index)
                unique_cells.append(cell)
        return unique_cells

@@ -582,11 +581,13 @@ class LayoutPostprocessor:
            best_cluster = None

            for cluster in clusters:
-                if cell.bbox.area() <= 0:
+                if cell.rect.to_bounding_box().area() <= 0:
                    continue

-                overlap = cell.bbox.intersection_area_with(cluster.bbox)
-                overlap_ratio = overlap / cell.bbox.area()
+                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                    cluster.bbox
+                )
+                overlap_ratio = overlap / cell.rect.to_bounding_box().area()

                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
@@ -601,11 +602,13 @@ class LayoutPostprocessor:

        return clusters

-    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
+    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
        """Find cells not assigned to any cluster."""
-        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
+        assigned = {cell.index for cluster in clusters for cell in cluster.cells}
        return [
-            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
+            cell
+            for cell in self.cells
+            if cell.index not in assigned and cell.text.strip()
        ]

    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
                continue

            cells_bbox = BoundingBox(
-                l=min(cell.bbox.l for cell in cluster.cells),
-                t=min(cell.bbox.t for cell in cluster.cells),
-                r=max(cell.bbox.r for cell in cluster.cells),
-                b=max(cell.bbox.b for cell in cluster.cells),
+                l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
+                t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
+                r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
+                b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
            )

            if cluster.label == DocItemLabel.TABLE:
@@ -634,9 +637,9 @@ class LayoutPostprocessor:

        return clusters

-    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
        """Sort cells in native reading order."""
-        return sorted(cells, key=lambda c: (c.id))
+        return sorted(cells, key=lambda c: (c.index))

    def _sort_clusters(
        self, clusters: List[Cluster], mode: str = "id"
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
                clusters,
                key=lambda cluster: (
                    (
-                        min(cell.id for cell in cluster.cells)
+                        min(cell.index for cell in cluster.cells)
                        if cluster.cells
                        else sys.maxsize
                    ),
--- a/docling/utils/visualization.py
+++ b/docling/utils/visualization.py
@@ -25,7 +25,7 @@ def draw_clusters(
            # Draw cells first (underneath)
            cell_color = (0, 0, 0, 40)  # Transparent black for cells
            for tc in c.cells:
-                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                cy0 *= scale_x