feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s

* Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use docling-core with docling-parse types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes and test updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update test units

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back DoclingParse v1 backend, pipeline options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update locks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update docling-core to 2.22.0

Update dependency library docling-core to latest release 2.22.0
Fix regression tests and ground truth files

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Ground-truth files updated

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests, use TextCell.from_ocr property

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Text fixes, new test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename docling backend to v4

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Test all backends, fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset all tests to use docling-parse v1 for now

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Make test_input_doc use the default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2025-03-18 10:38:19 +01:00
committed by GitHub
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions

View File

@@ -2,9 +2,9 @@ import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
if page.size is None:
return cells
for cell in page.cells:
new_bbox = cell.bbox.to_top_left_origin(
page_height=page.size.height
).normalized(page_size=page.size)
is_ocr = isinstance(cell, OcrCell)
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
new_bbox = (
cell.rect.to_bounding_box()
.to_top_left_origin(page_height=page.size.height)
.normalized(page_size=page.size)
)
is_ocr = cell.from_ocr
ocr_confidence = cell.confidence
cells.append(
{
"text": cell.text,

View File

@@ -5,9 +5,10 @@ from collections import defaultdict
from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
from docling.datamodel.base_models import BoundingBox, Cluster
_log = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
"""Initialize processor with cells and clusters."""
"""Initialize processor with cells and spatial indices."""
self.cells = cells
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
)
def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
"""Main processing pipeline."""
self.regular_clusters = self._process_regular_clusters()
self.special_clusters = self._process_special_clusters()
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
next_id = max((c.id for c in self.all_clusters), default=0) + 1
orphan_clusters = []
for i, cell in enumerate(unassigned):
conf = 1.0
if isinstance(cell, OcrCell):
conf = cell.confidence
conf = cell.confidence
orphan_clusters.append(
Cluster(
id=next_id + i,
label=DocItemLabel.TEXT,
bbox=cell.bbox,
bbox=cell.to_bounding_box(),
confidence=conf,
cells=[cell],
)
@@ -557,13 +556,13 @@ class LayoutPostprocessor:
return current_best if current_best else clusters[0]
def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
"""Ensure each cell appears only once, maintaining order of first appearance."""
seen_ids = set()
unique_cells = []
for cell in cells:
if cell.id not in seen_ids:
seen_ids.add(cell.id)
if cell.index not in seen_ids:
seen_ids.add(cell.index)
unique_cells.append(cell)
return unique_cells
@@ -582,11 +581,13 @@ class LayoutPostprocessor:
best_cluster = None
for cluster in clusters:
if cell.bbox.area() <= 0:
if cell.rect.to_bounding_box().area() <= 0:
continue
overlap = cell.bbox.intersection_area_with(cluster.bbox)
overlap_ratio = overlap / cell.bbox.area()
overlap = cell.rect.to_bounding_box().intersection_area_with(
cluster.bbox
)
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
if overlap_ratio > best_overlap:
best_overlap = overlap_ratio
@@ -601,11 +602,13 @@ class LayoutPostprocessor:
return clusters
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
"""Find cells not assigned to any cluster."""
assigned = {cell.id for cluster in clusters for cell in cluster.cells}
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
return [
cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
cell
for cell in self.cells
if cell.index not in assigned and cell.text.strip()
]
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
continue
cells_bbox = BoundingBox(
l=min(cell.bbox.l for cell in cluster.cells),
t=min(cell.bbox.t for cell in cluster.cells),
r=max(cell.bbox.r for cell in cluster.cells),
b=max(cell.bbox.b for cell in cluster.cells),
l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
)
if cluster.label == DocItemLabel.TABLE:
@@ -634,9 +637,9 @@ class LayoutPostprocessor:
return clusters
def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
"""Sort cells in native reading order."""
return sorted(cells, key=lambda c: (c.id))
return sorted(cells, key=lambda c: (c.index))
def _sort_clusters(
self, clusters: List[Cluster], mode: str = "id"
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
clusters,
key=lambda cluster: (
(
min(cell.id for cell in cluster.cells)
min(cell.index for cell in cluster.cells)
if cluster.cells
else sys.maxsize
),

View File

@@ -25,7 +25,7 @@ def draw_clusters(
# Draw cells first (underneath)
cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
cx0 *= scale_x
cx1 *= scale_x
cy0 *= scale_x