mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 06:08:09 +00:00
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
* Add DoclingParseV3 backend implementation
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use docling-core with docling-parse types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes and test updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test cases
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test units
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back DoclingParse v1 backend, pipeline options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update locks
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: update docling-core to 2.22.0
  Update dependency library docling-core to latest release 2.22.0
  Fix regression tests and ground truth files
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* Ground-truth files updated
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update tests, use TextCell.from_ocr property
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Text fixes, new test data
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename docling backend to v4
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Test all backends, fixes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset all tests to use docling-parse v1 for now
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for DPv4 backend init, better test coverage
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* test_input_doc use default backend
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -2,9 +2,9 @@ import logging
|
||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||
|
||||
from docling.datamodel.base_models import OcrCell
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
|
||||
if page.size is None:
|
||||
return cells
|
||||
for cell in page.cells:
|
||||
new_bbox = cell.bbox.to_top_left_origin(
|
||||
page_height=page.size.height
|
||||
).normalized(page_size=page.size)
|
||||
is_ocr = isinstance(cell, OcrCell)
|
||||
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
||||
new_bbox = (
|
||||
cell.rect.to_bounding_box()
|
||||
.to_top_left_origin(page_height=page.size.height)
|
||||
.normalized(page_size=page.size)
|
||||
)
|
||||
is_ocr = cell.from_ocr
|
||||
ocr_confidence = cell.confidence
|
||||
cells.append(
|
||||
{
|
||||
"text": cell.text,
|
||||
|
||||
@@ -5,9 +5,10 @@ from collections import defaultdict
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
|
||||
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
||||
"""Initialize processor with cells and clusters."""
|
||||
"""Initialize processor with cells and spatial indices."""
|
||||
self.cells = cells
|
||||
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
|
||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||
)
|
||||
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||
"""Main processing pipeline."""
|
||||
self.regular_clusters = self._process_regular_clusters()
|
||||
self.special_clusters = self._process_special_clusters()
|
||||
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
|
||||
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
||||
orphan_clusters = []
|
||||
for i, cell in enumerate(unassigned):
|
||||
conf = 1.0
|
||||
if isinstance(cell, OcrCell):
|
||||
conf = cell.confidence
|
||||
conf = cell.confidence
|
||||
|
||||
orphan_clusters.append(
|
||||
Cluster(
|
||||
id=next_id + i,
|
||||
label=DocItemLabel.TEXT,
|
||||
bbox=cell.bbox,
|
||||
bbox=cell.to_bounding_box(),
|
||||
confidence=conf,
|
||||
cells=[cell],
|
||||
)
|
||||
@@ -557,13 +556,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return current_best if current_best else clusters[0]
|
||||
|
||||
def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
|
||||
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
||||
seen_ids = set()
|
||||
unique_cells = []
|
||||
for cell in cells:
|
||||
if cell.id not in seen_ids:
|
||||
seen_ids.add(cell.id)
|
||||
if cell.index not in seen_ids:
|
||||
seen_ids.add(cell.index)
|
||||
unique_cells.append(cell)
|
||||
return unique_cells
|
||||
|
||||
@@ -582,11 +581,13 @@ class LayoutPostprocessor:
|
||||
best_cluster = None
|
||||
|
||||
for cluster in clusters:
|
||||
if cell.bbox.area() <= 0:
|
||||
if cell.rect.to_bounding_box().area() <= 0:
|
||||
continue
|
||||
|
||||
overlap = cell.bbox.intersection_area_with(cluster.bbox)
|
||||
overlap_ratio = overlap / cell.bbox.area()
|
||||
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
||||
cluster.bbox
|
||||
)
|
||||
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
||||
|
||||
if overlap_ratio > best_overlap:
|
||||
best_overlap = overlap_ratio
|
||||
@@ -601,11 +602,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
||||
"""Find cells not assigned to any cluster."""
|
||||
assigned = {cell.id for cluster in clusters for cell in cluster.cells}
|
||||
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
||||
return [
|
||||
cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
|
||||
cell
|
||||
for cell in self.cells
|
||||
if cell.index not in assigned and cell.text.strip()
|
||||
]
|
||||
|
||||
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
||||
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
|
||||
continue
|
||||
|
||||
cells_bbox = BoundingBox(
|
||||
l=min(cell.bbox.l for cell in cluster.cells),
|
||||
t=min(cell.bbox.t for cell in cluster.cells),
|
||||
r=max(cell.bbox.r for cell in cluster.cells),
|
||||
b=max(cell.bbox.b for cell in cluster.cells),
|
||||
l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
|
||||
t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
|
||||
r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
|
||||
b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
|
||||
)
|
||||
|
||||
if cluster.label == DocItemLabel.TABLE:
|
||||
@@ -634,9 +637,9 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
|
||||
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
"""Sort cells in native reading order."""
|
||||
return sorted(cells, key=lambda c: (c.id))
|
||||
return sorted(cells, key=lambda c: (c.index))
|
||||
|
||||
def _sort_clusters(
|
||||
self, clusters: List[Cluster], mode: str = "id"
|
||||
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
|
||||
clusters,
|
||||
key=lambda cluster: (
|
||||
(
|
||||
min(cell.id for cell in cluster.cells)
|
||||
min(cell.index for cell in cluster.cells)
|
||||
if cluster.cells
|
||||
else sys.maxsize
|
||||
),
|
||||
|
||||
@@ -25,7 +25,7 @@ def draw_clusters(
|
||||
# Draw cells first (underneath)
|
||||
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
||||
for tc in c.cells:
|
||||
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
||||
cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
|
||||
cx0 *= scale_x
|
||||
cx1 *= scale_x
|
||||
cy0 *= scale_x
|
||||
|
||||
Reference in New Issue
Block a user