feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)

* Add DoclingParseV3 backend implementation
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use docling-core with docling-parse types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes and test updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test cases
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test units
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back DoclingParse v1 backend, pipeline options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update locks
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: update docling-core to 2.22.0
  Update dependency library docling-core to latest release 2.22.0
  Fix regression tests and ground truth files
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* Ground-truth files updated
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update tests, use TextCell.from_ocr property
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Text fixes, new test data
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename docling backend to v4
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Test all backends, fixes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset all tests to use docling-parse v1 for now
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for DPv4 backend init, better test coverage
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* test_input_doc use default backend
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-16 16:48:21 +00:00 · 2025-03-18 10:38:19 +01:00
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -6,11 +6,12 @@ from typing import Iterable, List

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
-            idx.insert(i, cell.bbox.as_tuple())
+            idx.insert(i, cell.rect.to_bounding_box().as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Query the R-tree to get overlapping rectangles
-            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
+            possible_matches_index = list(
+                idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
+            )

            return (
                len(possible_matches_index) > 0
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
        """
        if self.options.force_full_page_ocr:
            # If a full page OCR is forced, use only the OCR cells
-            cells = [
-                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
-                for c_ocr in ocr_cells
-            ]
+            cells = ocr_cells
            return cells

        ## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):

        # Draw OCR and programmatic cells
        for tc in page.cells:
-            x0, y0, x1, y1 = tc.bbox.as_tuple()
+            x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
            y0 *= scale_x
            y1 *= scale_y
            x0 *= scale_x
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
            if y1 <= y0:
                y1, y0 = y0, y1

-            color = "gray"
-            if isinstance(tc, OcrCell):
-                color = "magenta"
+            color = "magenta" if tc.from_ocr else "gray"
+
            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

        if show:
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
@@ -148,18 +149,22 @@ class EasyOcrModel(BaseOcrModel):
                        del im

                        cells = [
-                            OcrCell(
-                                id=ix,
+                            TextCell(
+                                index=ix,
                                text=line[1],
+                                orig=line[1],
+                                from_ocr=True,
                                confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                ),
                            )
                            for ix, line in enumerate(result)
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -3,8 +3,9 @@ import tempfile
 from typing import Iterable, Optional, Tuple

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrMacOptions
 from docling.datamodel.settings import settings
@@ -94,13 +95,17 @@ class OcrMacModel(BaseOcrModel):
                            bottom = y2 / self.scale

                            cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
                                    confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                    ),
                                )
                            )
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder

 class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
+    create_parsed_page: bool


 class PagePreprocessingModel(BasePageModel):
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):

        page.cells = list(page._backend.get_text_cells())

+        if self.options.create_parsed_page:
+            page.parsed_page = page._backend.get_segmented_page()
+
        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
            draw = ImageDraw.Draw(image)
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -3,8 +3,9 @@ from typing import Iterable

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
@@ -100,18 +101,26 @@ class RapidOcrModel(BaseOcrModel):

                        if result is not None:
                            cells = [
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=line[1],
+                                    orig=line[1],
                                    confidence=line[2],
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(
-                                            (line[0][0][0] / self.scale) + ocr_rect.l,
-                                            (line[0][0][1] / self.scale) + ocr_rect.t,
-                                            (line[0][2][0] / self.scale) + ocr_rect.l,
-                                            (line[0][2][1] / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    from_ocr=True,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(
+                                                (line[0][0][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][0][1] / self.scale)
+                                                + ocr_rect.t,
+                                                (line[0][2][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][2][1] / self.scale)
+                                                + ocr_rect.t,
+                                            ),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                    ),
                                )
                                for ix, line in enumerate(result)
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
+from docling_core.types.doc.page import BoundingRectangle
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")

            for cell in table_element.cluster.cells:
-                x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
                x0 *= scale_x
                x1 *= scale_x
                y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
                                # Only allow non-empty strings (spaces) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
+                                    new_cell.rect = BoundingRectangle.from_bounding_box(
+                                        new_cell.rect.to_bounding_box().scaled(
+                                            scale=self.scale
+                                        )
                                    )

-                                    tokens.append(new_cell.model_dump())
+                                    tokens.append(
+                                        {
+                                            "id": new_cell.index,
+                                            "text": new_cell.text,
+                                            "bbox": new_cell.rect.to_bounding_box().model_dump(),
+                                        }
+                                    )
                            page_input["tokens"] = tokens

                            tf_output = self.tf_predictor.multi_table_predict(
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -228,18 +229,22 @@ class TesseractOcrCliModel(BaseOcrModel):
                            t = b + h
                            r = l + w

-                            cell = OcrCell(
-                                id=ix,
+                            cell = TextCell(
+                                index=ix,
                                text=text,
+                                orig=text,
+                                from_ocr=True,
                                confidence=conf / 100.0,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (l / self.scale) + ocr_rect.l,
-                                        (b / self.scale) + ocr_rect.t,
-                                        (r / self.scale) + ocr_rect.l,
-                                        (t / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (l / self.scale) + ocr_rect.l,
+                                            (b / self.scale) + ocr_rect.t,
+                                            (r / self.scale) + ocr_rect.l,
+                                            (t / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                ),
                            )
                            all_ocr_cells.append(cell)
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -2,8 +2,9 @@ import logging
 from typing import Iterable

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -173,13 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
                            top = (box["y"] + box["h"]) / self.scale

                            cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
                                    confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        ),
                                    ),
                                )
                            )