feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)

* Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-08 20:58:11 +00:00 · 2025-03-18 10:38:19 +01:00
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
 from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):

        return text_piece

-    def get_text_cells(self) -> Iterable[Cell]:
-        cells: List[Cell] = []
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        cells: List[TextCell] = []
        cell_counter = 0

        if not self.valid:
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):

            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
            cells.append(
-                Cell(
-                    id=cell_counter,
+                TextCell(
+                    index=cell_counter,
                    text=text_piece,
-                    bbox=BoundingBox(
-                        # l=x0, b=y0, r=x1, t=y1,
-                        l=x0 * page_size.width / parser_width,
-                        b=y0 * page_size.height / parser_height,
-                        r=x1 * page_size.width / parser_width,
-                        t=y1 * page_size.height / parser_height,
-                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            # l=x0, b=y0, r=x1, t=y1,
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
+
            cell_counter += 1

        def draw_clusters_and_cells():
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
+                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size
+from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock

 if TYPE_CHECKING:
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):

        return text_piece

-    def get_text_cells(self) -> Iterable[Cell]:
-        cells: List[Cell] = []
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        cells: List[TextCell] = []
        cell_counter = 0

        if not self.valid:
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):

            text_piece = cell_data[cells_header.index("text")]
            cells.append(
-                Cell(
-                    id=cell_counter,
+                TextCell(
+                    index=cell_counter,
                    text=text_piece,
-                    bbox=BoundingBox(
-                        # l=x0, b=y0, r=x1, t=y1,
-                        l=x0 * page_size.width / parser_width,
-                        b=y0 * page_size.height / parser_height,
-                        r=x1 * page_size.width / parser_width,
-                        t=y1 * page_size.height / parser_height,
-                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            # l=x0, b=y0, r=x1, t=y1,
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -0,0 +1,185 @@
+import logging
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+
+import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
+from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Size
+from docling.utils.locks import pypdfium2_lock
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class DoclingParseV4PageBackend(PdfPageBackend):
+    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
+        self._ppage = page_obj
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for i, cell in enumerate(self._dpage.textline_cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return self._dpage
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        page_size = self.get_size()
+
+        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
+
+        # for cell in self._dpage.textline_cells:
+        #     rect = cell.rect
+        #
+        #     assert (
+        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
+        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
+        #     assert (
+        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
+        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
+
+        return self._dpage.textline_cells
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+
+        page_size = self.get_size()
+
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+
+        return image
+
+    def get_size(self) -> Size:
+        return Size(
+            width=self._dpage.dimension.width,
+            height=self._dpage.dimension.height,
+        )
+
+    def unload(self):
+        self._ppage = None
+        self._dpage = None
+
+
+class DoclingParseV4DocumentBackend(PdfDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        self.parser = DoclingPdfParser(loglevel="fatal")
+        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
+        success = self.dp_doc is not None
+
+        if not success:
+            raise RuntimeError(
+                f"docling-parse v4 could not load document {self.document_hash}."
+            )
+
+    def page_count(self) -> int:
+        # return len(self._pdoc)  # To be replaced with docling-parse API
+
+        len_1 = len(self._pdoc)
+        len_2 = self.dp_doc.number_of_pages()
+
+        if len_1 != len_2:
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
+
+        return len_2
+
+    def load_page(
+        self, page_no: int, create_words: bool = True, create_textlines: bool = True
+    ) -> DoclingParseV4PageBackend:
+        with pypdfium2_lock:
+            return DoclingParseV4PageBackend(
+                self.dp_doc.get_page(
+                    page_no + 1,
+                    create_words=create_words,
+                    create_textlines=create_textlines,
+                ),
+                self._pdoc[page_no],
+            )
+
+    def is_valid(self) -> bool:
+        return self.page_count() > 0
+
+    def unload(self):
+        super().unload()
+        self.dp_doc.unload()
+        with pypdfium2_lock:
+            self._pdoc.close()
+        self._pdoc = None
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -4,10 +4,11 @@ from pathlib import Path
 from typing import Iterable, Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from PIL import Image

 from docling.backend.abstract_backend import PaginatedDocumentBackend
-from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument


@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
        pass

    @abstractmethod
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        pass
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable[TextCell]:
        pass

    @abstractmethod
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
 from docling.utils.locks import pypdfium2_lock

 if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):

        return text_piece

-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+
+    def get_text_cells(self) -> Iterable[TextCell]:
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
                text_piece = self.text_page.get_text_bounded(*rect)
                x0, y0, x1, y1 = rect
                cells.append(
-                    Cell(
-                        id=cell_counter,
+                    TextCell(
+                        index=cell_counter,
                        text=text_piece,
-                        bbox=BoundingBox(
-                            l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
+                        orig=text_piece,
+                        from_ocr=False,
+                        rect=BoundingRectangle.from_bounding_box(
+                            BoundingBox(
+                                l=x0,
+                                b=y0,
+                                r=x1,
+                                t=y1,
+                                coord_origin=CoordOrigin.BOTTOMLEFT,
+                            )
                        ).to_top_left_origin(page_size.height),
                    )
                )
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
        # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
        # The cell merging code below is to clean this up.
        def merge_horizontal_cells(
-            cells: List[Cell],
+            cells: List[TextCell],
            horizontal_threshold_factor: float = 1.0,
            vertical_threshold_factor: float = 0.5,
-        ) -> List[Cell]:
+        ) -> List[TextCell]:
            if not cells:
                return []

-            def group_rows(cells: List[Cell]) -> List[List[Cell]]:
+            def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
                rows = []
                current_row = [cells[0]]
-                row_top = cells[0].bbox.t
-                row_bottom = cells[0].bbox.b
-                row_height = cells[0].bbox.height
+                row_top = cells[0].rect.to_bounding_box().t
+                row_bottom = cells[0].rect.to_bounding_box().b
+                row_height = cells[0].rect.to_bounding_box().height

                for cell in cells[1:]:
                    vertical_threshold = row_height * vertical_threshold_factor
                    if (
-                        abs(cell.bbox.t - row_top) <= vertical_threshold
-                        and abs(cell.bbox.b - row_bottom) <= vertical_threshold
+                        abs(cell.rect.to_bounding_box().t - row_top)
+                        <= vertical_threshold
+                        and abs(cell.rect.to_bounding_box().b - row_bottom)
+                        <= vertical_threshold
                    ):
                        current_row.append(cell)
-                        row_top = min(row_top, cell.bbox.t)
-                        row_bottom = max(row_bottom, cell.bbox.b)
+                        row_top = min(row_top, cell.rect.to_bounding_box().t)
+                        row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
                        row_height = row_bottom - row_top
                    else:
                        rows.append(current_row)
                        current_row = [cell]
-                        row_top = cell.bbox.t
-                        row_bottom = cell.bbox.b
-                        row_height = cell.bbox.height
+                        row_top = cell.rect.to_bounding_box().t
+                        row_bottom = cell.rect.to_bounding_box().b
+                        row_height = cell.rect.to_bounding_box().height

                if current_row:
                    rows.append(current_row)

                return rows

-            def merge_row(row: List[Cell]) -> List[Cell]:
+            def merge_row(row: List[TextCell]) -> List[TextCell]:
                merged = []
                current_group = [row[0]]

                for cell in row[1:]:
                    prev_cell = current_group[-1]
-                    avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
+                    avg_height = (
+                        prev_cell.rect.height + cell.rect.to_bounding_box().height
+                    ) / 2
                    if (
-                        cell.bbox.l - prev_cell.bbox.r
+                        cell.rect.to_bounding_box().l
+                        - prev_cell.rect.to_bounding_box().r
                        <= avg_height * horizontal_threshold_factor
                    ):
                        current_group.append(cell)
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):

                return merged

-            def merge_group(group: List[Cell]) -> Cell:
+            def merge_group(group: List[TextCell]) -> TextCell:
                if len(group) == 1:
                    return group[0]

                merged_text = "".join(cell.text for cell in group)
                merged_bbox = BoundingBox(
-                    l=min(cell.bbox.l for cell in group),
-                    t=min(cell.bbox.t for cell in group),
-                    r=max(cell.bbox.r for cell in group),
-                    b=max(cell.bbox.b for cell in group),
+                    l=min(cell.rect.to_bounding_box().l for cell in group),
+                    t=min(cell.rect.to_bounding_box().t for cell in group),
+                    r=max(cell.rect.to_bounding_box().r for cell in group),
+                    b=max(cell.rect.to_bounding_box().b for cell in group),
+                )
+                return TextCell(
+                    index=group[0].index,
+                    text=merged_text,
+                    orig=merged_text,
+                    rect=BoundingRectangle.from_bounding_box(merged_bbox),
+                    from_ocr=False,
                )
-                return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)

            rows = group_rows(cells)
            merged_cells = [cell for row in rows for cell in merge_row(row)]

            for i, cell in enumerate(merged_cells, 1):
-                cell.id = i
+                cell.index = i

            return merged_cells

@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
+                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -16,6 +16,7 @@ from pydantic import TypeAdapter

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
@@ -412,12 +413,15 @@ def convert(
        if artifacts_path is not None:
            pipeline_options.artifacts_path = artifacts_path

+        backend: Type[PdfDocumentBackend]
        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+            backend = DoclingParseDocumentBackend
        elif pdf_backend == PdfBackend.DLPARSE_V2:
            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V4:
+            backend = DoclingParseV4DocumentBackend  # type: ignore
        elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend
+            backend = PyPdfiumDocumentBackend  # type: ignore
        else:
            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
    Size,
    TableCell,
 )
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
    DocumentStream,
 )
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
    error_message: str


-class Cell(BaseModel):
-    id: int
-    text: str
-    bbox: BoundingBox
-
-
-class OcrCell(Cell):
-    confidence: float
+# class Cell(BaseModel):
+#    id: int
+#    text: str
+#    bbox: BoundingBox


 class Cluster(BaseModel):
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
    children: List["Cluster"] = []  # Add child cluster support


@@ -226,7 +223,8 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
+    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -301,6 +301,7 @@ class PdfBackend(str, Enum):
    PYPDFIUM2 = "pypdfium2"
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
+    DLPARSE_V4 = "dlparse_v4"


 # Define an enum for the ocr engines
@@ -381,3 +382,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )
+
+    generate_parsed_pages: bool = False
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.csv_backend import CsvDocumentBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):

 class ImageFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


 class PdfFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


 def _get_default_option(format: InputFormat) -> FormatOption:
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
        ),
        InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
        ),
        InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
        ),
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -6,11 +6,12 @@ from typing import Iterable, List

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
-            idx.insert(i, cell.bbox.as_tuple())
+            idx.insert(i, cell.rect.to_bounding_box().as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Query the R-tree to get overlapping rectangles
-            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
+            possible_matches_index = list(
+                idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
+            )

            return (
                len(possible_matches_index) > 0
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
        """
        if self.options.force_full_page_ocr:
            # If a full page OCR is forced, use only the OCR cells
-            cells = [
-                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
-                for c_ocr in ocr_cells
-            ]
+            cells = ocr_cells
            return cells

        ## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):

        # Draw OCR and programmatic cells
        for tc in page.cells:
-            x0, y0, x1, y1 = tc.bbox.as_tuple()
+            x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
            y0 *= scale_x
            y1 *= scale_y
            x0 *= scale_x
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
            if y1 <= y0:
                y1, y0 = y0, y1

-            color = "gray"
-            if isinstance(tc, OcrCell):
-                color = "magenta"
+            color = "magenta" if tc.from_ocr else "gray"
+
            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

        if show:
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
@@ -148,18 +149,22 @@ class EasyOcrModel(BaseOcrModel):
                        del im

                        cells = [
-                            OcrCell(
-                                id=ix,
+                            TextCell(
+                                index=ix,
                                text=line[1],
+                                orig=line[1],
+                                from_ocr=True,
                                confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                ),
                            )
                            for ix, line in enumerate(result)
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -3,8 +3,9 @@ import tempfile
 from typing import Iterable, Optional, Tuple

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrMacOptions
 from docling.datamodel.settings import settings
@@ -94,13 +95,17 @@ class OcrMacModel(BaseOcrModel):
                            bottom = y2 / self.scale

                            cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
                                    confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                    ),
                                )
                            )
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder

 class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
+    create_parsed_page: bool


 class PagePreprocessingModel(BasePageModel):
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):

        page.cells = list(page._backend.get_text_cells())

+        if self.options.create_parsed_page:
+            page.parsed_page = page._backend.get_segmented_page()
+
        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
            draw = ImageDraw.Draw(image)
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -3,8 +3,9 @@ from typing import Iterable

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
@@ -100,18 +101,26 @@ class RapidOcrModel(BaseOcrModel):

                        if result is not None:
                            cells = [
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=line[1],
+                                    orig=line[1],
                                    confidence=line[2],
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(
-                                            (line[0][0][0] / self.scale) + ocr_rect.l,
-                                            (line[0][0][1] / self.scale) + ocr_rect.t,
-                                            (line[0][2][0] / self.scale) + ocr_rect.l,
-                                            (line[0][2][1] / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    from_ocr=True,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(
+                                                (line[0][0][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][0][1] / self.scale)
+                                                + ocr_rect.t,
+                                                (line[0][2][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][2][1] / self.scale)
+                                                + ocr_rect.t,
+                                            ),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                    ),
                                )
                                for ix, line in enumerate(result)
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
+from docling_core.types.doc.page import BoundingRectangle
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")

            for cell in table_element.cluster.cells:
-                x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
                x0 *= scale_x
                x1 *= scale_x
                y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
                                # Only allow non empty stings (spaces) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
+                                    new_cell.rect = BoundingRectangle.from_bounding_box(
+                                        new_cell.rect.to_bounding_box().scaled(
+                                            scale=self.scale
+                                        )
                                    )

-                                    tokens.append(new_cell.model_dump())
+                                    tokens.append(
+                                        {
+                                            "id": new_cell.index,
+                                            "text": new_cell.text,
+                                            "bbox": new_cell.rect.to_bounding_box().model_dump(),
+                                        }
+                                    )
                            page_input["tokens"] = tokens

                            tf_output = self.tf_predictor.multi_table_predict(
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -228,18 +229,22 @@ class TesseractOcrCliModel(BaseOcrModel):
                            t = b + h
                            r = l + w

-                            cell = OcrCell(
-                                id=ix,
+                            cell = TextCell(
+                                index=ix,
                                text=text,
+                                orig=text,
+                                from_ocr=True,
                                confidence=conf / 100.0,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (l / self.scale) + ocr_rect.l,
-                                        (b / self.scale) + ocr_rect.t,
-                                        (r / self.scale) + ocr_rect.l,
-                                        (t / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (l / self.scale) + ocr_rect.l,
+                                            (b / self.scale) + ocr_rect.t,
+                                            (r / self.scale) + ocr_rect.l,
+                                            (t / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                ),
                            )
                            all_ocr_cells.append(cell)
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -2,8 +2,9 @@ import logging
 from typing import Iterable

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell

-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -173,13 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
                            top = (box["y"] + box["h"]) / self.scale

                            cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
                                    confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        ),
                                    ),
                                )
                            )
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -87,7 +87,8 @@ class StandardPdfPipeline(PaginatedPipeline):
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
-                    images_scale=pipeline_options.images_scale
+                    images_scale=pipeline_options.images_scale,
+                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -2,9 +2,9 @@ import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

-from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page

 _log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
        if page.size is None:
            return cells
        for cell in page.cells:
-            new_bbox = cell.bbox.to_top_left_origin(
-                page_height=page.size.height
-            ).normalized(page_size=page.size)
-            is_ocr = isinstance(cell, OcrCell)
-            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
+            new_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page.size.height)
+                .normalized(page_size=page.size)
+            )
+            is_ocr = cell.from_ocr
+            ocr_confidence = cell.confidence
            cells.append(
                {
                    "text": cell.text,
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -5,9 +5,10 @@ from collections import defaultdict
 from typing import Dict, List, Set, Tuple

 from docling_core.types.doc import DocItemLabel, Size
+from docling_core.types.doc.page import TextCell
 from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
+from docling.datamodel.base_models import BoundingBox, Cluster

 _log = logging.getLogger(__name__)

@@ -198,7 +199,7 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
+    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
        """Initialize processor with cells and clusters."""
        """Initialize processor with cells and spatial indices."""
        self.cells = cells
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

-    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
+    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
            next_id = max((c.id for c in self.all_clusters), default=0) + 1
            orphan_clusters = []
            for i, cell in enumerate(unassigned):
-                conf = 1.0
-                if isinstance(cell, OcrCell):
-                    conf = cell.confidence
+                conf = cell.confidence

                orphan_clusters.append(
                    Cluster(
                        id=next_id + i,
                        label=DocItemLabel.TEXT,
-                        bbox=cell.bbox,
+                        bbox=cell.to_bounding_box(),
                        confidence=conf,
                        cells=[cell],
                    )
@@ -557,13 +556,13 @@ class LayoutPostprocessor:

        return current_best if current_best else clusters[0]

-    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
        """Ensure each cell appears only once, maintaining order of first appearance."""
        seen_ids = set()
        unique_cells = []
        for cell in cells:
-            if cell.id not in seen_ids:
-                seen_ids.add(cell.id)
+            if cell.index not in seen_ids:
+                seen_ids.add(cell.index)
                unique_cells.append(cell)
        return unique_cells

@@ -582,11 +581,13 @@ class LayoutPostprocessor:
            best_cluster = None

            for cluster in clusters:
-                if cell.bbox.area() <= 0:
+                if cell.rect.to_bounding_box().area() <= 0:
                    continue

-                overlap = cell.bbox.intersection_area_with(cluster.bbox)
-                overlap_ratio = overlap / cell.bbox.area()
+                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                    cluster.bbox
+                )
+                overlap_ratio = overlap / cell.rect.to_bounding_box().area()

                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
@@ -601,11 +602,13 @@ class LayoutPostprocessor:

        return clusters

-    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
+    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
        """Find cells not assigned to any cluster."""
-        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
+        assigned = {cell.index for cluster in clusters for cell in cluster.cells}
        return [
-            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
+            cell
+            for cell in self.cells
+            if cell.index not in assigned and cell.text.strip()
        ]

    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
                continue

            cells_bbox = BoundingBox(
-                l=min(cell.bbox.l for cell in cluster.cells),
-                t=min(cell.bbox.t for cell in cluster.cells),
-                r=max(cell.bbox.r for cell in cluster.cells),
-                b=max(cell.bbox.b for cell in cluster.cells),
+                l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
+                t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
+                r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
+                b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
            )

            if cluster.label == DocItemLabel.TABLE:
@@ -634,9 +637,9 @@ class LayoutPostprocessor:

        return clusters

-    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
        """Sort cells in native reading order."""
-        return sorted(cells, key=lambda c: (c.id))
+        return sorted(cells, key=lambda c: (c.index))

    def _sort_clusters(
        self, clusters: List[Cluster], mode: str = "id"
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
                clusters,
                key=lambda cluster: (
                    (
-                        min(cell.id for cell in cluster.cells)
+                        min(cell.index for cell in cluster.cells)
                        if cluster.cells
                        else sys.maxsize
                    ),
--- a/docling/utils/visualization.py
+++ b/docling/utils/visualization.py
@@ -25,7 +25,7 @@ def draw_clusters(
            # Draw cells first (underneath)
            cell_color = (0, 0, 0, 40)  # Transparent black for cells
            for tc in c.cells:
-                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                cy0 *= scale_x