From f4117725698f952ccb8e4b29ca17a676958165c7 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 11 Mar 2025 16:06:28 +0100
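Subject: [PATCH] Fixes and test updates

Remove the docling-parse v1 backend and its test, replace the local
Cell/OcrCell models with docling_core's TextCell in the PDF backends and
OCR models, add PdfPageBackend.get_segmented_page(), and expose the v3
backend as PdfBackend.DLPARSE_V3 in place of DLPARSE_V1.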

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/backend/docling_parse_backend.py    | 227 --------------------
 docling/backend/docling_parse_v2_backend.py |  31 +--
 docling/backend/docling_parse_v3_backend.py |  47 +---
 docling/backend/pdf_backend.py              |   9 +-
 docling/backend/pypdfium2_backend.py        |  80 ++++---
 docling/cli/main.py                         |  10 +-
 docling/datamodel/base_models.py            |  18 +-
 docling/datamodel/pipeline_options.py       |   2 +-
 docling/models/base_ocr_model.py            |  23 +-
 docling/models/easyocr_model.py             |  26 ++-
 docling/models/ocr_mac_model.py             |  16 +-
 docling/models/page_preprocessing_model.py  |   1 +
 docling/models/rapid_ocr_model.py           |  30 ++-
 docling/models/table_structure_model.py     |  17 +-
 docling/models/tesseract_ocr_cli_model.py   |  26 ++-
 docling/models/tesseract_ocr_model.py       |  16 +-
 docling/utils/export.py                     |  14 +-
 docling/utils/layout_postprocessor.py       |  47 ++--
 docling/utils/visualization.py              |   2 +-
 poetry.lock                                 |   2 +-
 tests/test_backend_docling_parse.py         |  77 -------
 tests/test_backend_docling_parse_v3.py      |   2 +-
 tests/test_code_formula.py                  |   1 -
 tests/test_e2e_conversion.py                |   4 +-
 tests/test_e2e_ocr_conversion.py            |   4 +-
 tests/test_interfaces.py                    |   4 +-
 tests/test_options.py                       |   4 +-
 tests/verify_utils.py                       |   4 +-
 28 files changed, 239 insertions(+), 505 deletions(-)
 delete mode 100644 docling/backend/docling_parse_backend.py
 delete mode 100644 tests/test_backend_docling_parse.py

diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
deleted file mode 100644
index 6d22127b..00000000
--- a/docling/backend/docling_parse_backend.py
+++ /dev/null
@@ -1,227 +0,0 @@
-import logging
-import random
-from io import BytesIO
-from pathlib import Path
-from typing import Iterable, List, Optional, Union
-
-import pypdfium2 as pdfium
-from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.pdf_parsers import pdf_parser_v1
-from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage
-
-from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
-from docling.datamodel.document import InputDocument
-
-_log = logging.getLogger(__name__)
-
-
-class DoclingParsePageBackend(PdfPageBackend):
-    def __init__(
-        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
-    ):
-        self._ppage = page_obj
-        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
-
-        self.valid = "pages" in parsed_page
-        if self.valid:
-            self._dpage = parsed_page["pages"][0]
-        else:
-            _log.info(
-                f"An error occurred when loading page {page_no} of document {document_hash}."
-            )
-
-    def is_valid(self) -> bool:
-        return self.valid
-
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if not self.valid:
-            return ""
-        # Find intersecting cells on the page
-        text_piece = ""
-        page_size = self.get_size()
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        scale = (
-            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
-        )
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-            cell_bbox = BoundingBox(
-                l=x0 * scale * page_size.width / parser_width,
-                b=y0 * scale * page_size.height / parser_height,
-                r=x1 * scale * page_size.width / parser_width,
-                t=y1 * scale * page_size.height / parser_height,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            ).to_top_left_origin(page_height=page_size.height * scale)
-
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
-
-            if overlap_frac > 0.5:
-                if len(text_piece) > 0:
-                    text_piece += " "
-                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
-
-        return text_piece
-
-    def get_text_cells(self) -> Iterable[Cell]:
-        cells: List[Cell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                Cell(
-                    id=cell_counter,
-                    text=text_piece,
-                    bbox=BoundingBox(
-                        # l=x0, b=y0, r=x1, t=y1,
-                        l=x0 * page_size.width / parser_width,
-                        b=y0 * page_size.height / parser_height,
-                        r=x1 * page_size.width / parser_width,
-                        t=y1 * page_size.height / parser_height,
-                        coord_origin=CoordOrigin.BOTTOMLEFT,
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
-
-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-
-        for i in range(len(self._dpage["images"])):
-            bitmap = self._dpage["images"][i]
-            cropbox = BoundingBox.from_tuple(
-                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
-            ).to_top_left_origin(self.get_size().height)
-
-            if cropbox.area() > AREA_THRESHOLD:
-                cropbox = cropbox.scaled(scale=scale)
-
-                yield cropbox
-
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
-    ) -> Image.Image:
-
-        page_size = self.get_size()
-
-        if not cropbox:
-            cropbox = BoundingBox(
-                l=0,
-                r=page_size.width,
-                t=0,
-                b=page_size.height,
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
-            padbox = BoundingBox(
-                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
-            )
-        else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
-            padbox.r = page_size.width - padbox.r
-            padbox.t = page_size.height - padbox.t
-
-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
-
-        return image
-
-    def get_size(self) -> Size:
-        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
-
-    def unload(self):
-        self._ppage = None
-        self._dpage = None
-
-
-class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
-
-        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-        self.parser = pdf_parser_v1()
-
-        success = False
-        if isinstance(self.path_or_stream, BytesIO):
-            success = self.parser.load_document_from_bytesio(
-                self.document_hash, self.path_or_stream
-            )
-        elif isinstance(self.path_or_stream, Path):
-            success = self.parser.load_document(
-                self.document_hash, str(self.path_or_stream)
-            )
-
-        if not success:
-            raise RuntimeError(
-                f"docling-parse could not load document with hash {self.document_hash}."
-            )
-
-    def page_count(self) -> int:
-        return len(self._pdoc)  # To be replaced with docling-parse API
-
-    def load_page(self, page_no: int) -> DoclingParsePageBackend:
-        return DoclingParsePageBackend(
-            self.parser, self.document_hash, page_no, self._pdoc[page_no]
-        )
-
-    def is_valid(self) -> bool:
-        return self.page_count() > 0
-
-    def unload(self):
-        super().unload()
-        self.parser.unload_document(self.document_hash)
-        self._pdoc.close()
-        self._pdoc = None
diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
index 9178883f..96525ba3 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size
+from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock
 
 if TYPE_CHECKING:
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
 
         return text_piece
 
-    def get_text_cells(self) -> Iterable[Cell]:
-        cells: List[Cell] = []
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        cells: List[TextCell] = []
         cell_counter = 0
 
         if not self.valid:
@@ -106,16 +110,19 @@ class DoclingParseV2PageBackend(PdfPageBackend):
 
             text_piece = cell_data[cells_header.index("text")]
             cells.append(
-                Cell(
-                    id=cell_counter,
+                TextCell(
+                    index=cell_counter,
                     text=text_piece,
-                    bbox=BoundingBox(
-                        # l=x0, b=y0, r=x1, t=y1,
-                        l=x0 * page_size.width / parser_width,
-                        b=y0 * page_size.height / parser_height,
-                        r=x1 * page_size.width / parser_width,
-                        t=y1 * page_size.height / parser_height,
-                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    orig=text_piece,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            # l=x0, b=y0, r=x1, t=y1,
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
                     ).to_top_left_origin(page_size.height),
                 )
             )
diff --git a/docling/backend/docling_parse_v3_backend.py b/docling/backend/docling_parse_v3_backend.py
index 7ff451ee..0ebeafd3 100644
--- a/docling/backend/docling_parse_v3_backend.py
+++ b/docling/backend/docling_parse_v3_backend.py
@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import SegmentedPdfPage
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size
+from docling.datamodel.base_models import Size
 
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -54,48 +54,15 @@ class DoclingParseV3PageBackend(PdfPageBackend):
 
         return text_piece
 
-    def get_text_cells(self) -> Iterable[Cell]:
-        cells: List[Cell] = []
-        cell_counter = 0
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return self._dpage
 
+    def get_text_cells(self) -> Iterable[TextCell]:
         page_size = self.get_size()
 
-        for i, cell in enumerate(self._dpage.textline_cells):
-            cell_bbox = cell.rect.to_bounding_box()
+        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
 
-            if cell_bbox.r < cell_bbox.l:
-                cell_bbox.r, cell_bbox.l = cell_bbox.l, cell_bbox.r
-            if cell_bbox.b > cell_bbox.t:
-                cell_bbox.b, cell_bbox.t = cell_bbox.t, cell_bbox.b
-
-            text_piece = cell.text
-            cells.append(
-                Cell(
-                    id=cell_counter,
-                    text=text_piece,
-                    bbox=cell_bbox.to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._dpage.textline_cells
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py
index 35c83b8c..cfecc7e6 100644
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -4,10 +4,11 @@ from pathlib import Path
 from typing import Iterable, Optional, Set, Union
 
 from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from PIL import Image
 
 from docling.backend.abstract_backend import PaginatedDocumentBackend
-from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 
 
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
         pass
 
     @abstractmethod
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        pass
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable[TextCell]:
         pass
 
     @abstractmethod
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index b585e2d5..5a5903de 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
 from docling.utils.locks import pypdfium2_lock
 
 if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
 
         return text_piece
 
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+
+    def get_text_cells(self) -> Iterable[TextCell]:
         with pypdfium2_lock:
             if not self.text_page:
                 self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 text_piece = self.text_page.get_text_bounded(*rect)
                 x0, y0, x1, y1 = rect
                 cells.append(
-                    Cell(
-                        id=cell_counter,
+                    TextCell(
+                        index=cell_counter,
                         text=text_piece,
-                        bbox=BoundingBox(
-                            l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
+                        orig=text_piece,
+                        rect=BoundingRectangle.from_bounding_box(
+                            BoundingBox(
+                                l=x0,
+                                b=y0,
+                                r=x1,
+                                t=y1,
+                                coord_origin=CoordOrigin.BOTTOMLEFT,
+                            )
                         ).to_top_left_origin(page_size.height),
                     )
                 )
@@ -97,51 +107,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
         # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
         # The cell merging code below is to clean this up.
         def merge_horizontal_cells(
-            cells: List[Cell],
+            cells: List[TextCell],
             horizontal_threshold_factor: float = 1.0,
             vertical_threshold_factor: float = 0.5,
-        ) -> List[Cell]:
+        ) -> List[TextCell]:
             if not cells:
                 return []
 
-            def group_rows(cells: List[Cell]) -> List[List[Cell]]:
+            def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
                 rows = []
                 current_row = [cells[0]]
-                row_top = cells[0].bbox.t
-                row_bottom = cells[0].bbox.b
-                row_height = cells[0].bbox.height
+                row_top = cells[0].rect.to_bounding_box().t
+                row_bottom = cells[0].rect.to_bounding_box().b
+                row_height = cells[0].rect.to_bounding_box().height
 
                 for cell in cells[1:]:
                     vertical_threshold = row_height * vertical_threshold_factor
                     if (
-                        abs(cell.bbox.t - row_top) <= vertical_threshold
-                        and abs(cell.bbox.b - row_bottom) <= vertical_threshold
+                        abs(cell.rect.to_bounding_box().t - row_top)
+                        <= vertical_threshold
+                        and abs(cell.rect.to_bounding_box().b - row_bottom)
+                        <= vertical_threshold
                     ):
                         current_row.append(cell)
-                        row_top = min(row_top, cell.bbox.t)
-                        row_bottom = max(row_bottom, cell.bbox.b)
+                        row_top = min(row_top, cell.rect.to_bounding_box().t)
+                        row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
                         row_height = row_bottom - row_top
                     else:
                         rows.append(current_row)
                         current_row = [cell]
-                        row_top = cell.bbox.t
-                        row_bottom = cell.bbox.b
-                        row_height = cell.bbox.height
+                        row_top = cell.rect.to_bounding_box().t
+                        row_bottom = cell.rect.to_bounding_box().b
+                        row_height = cell.rect.to_bounding_box().height
 
                 if current_row:
                     rows.append(current_row)
 
                 return rows
 
-            def merge_row(row: List[Cell]) -> List[Cell]:
+            def merge_row(row: List[TextCell]) -> List[TextCell]:
                 merged = []
                 current_group = [row[0]]
 
                 for cell in row[1:]:
                     prev_cell = current_group[-1]
-                    avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
+                    avg_height = (
+                        prev_cell.rect.height + cell.rect.to_bounding_box().height
+                    ) / 2
                     if (
-                        cell.bbox.l - prev_cell.bbox.r
+                        cell.rect.to_bounding_box().l
+                        - prev_cell.rect.to_bounding_box().r
                         <= avg_height * horizontal_threshold_factor
                     ):
                         current_group.append(cell)
@@ -154,24 +169,29 @@ class PyPdfiumPageBackend(PdfPageBackend):
 
                 return merged
 
-            def merge_group(group: List[Cell]) -> Cell:
+            def merge_group(group: List[TextCell]) -> TextCell:
                 if len(group) == 1:
                     return group[0]
 
                 merged_text = "".join(cell.text for cell in group)
                 merged_bbox = BoundingBox(
-                    l=min(cell.bbox.l for cell in group),
-                    t=min(cell.bbox.t for cell in group),
-                    r=max(cell.bbox.r for cell in group),
-                    b=max(cell.bbox.b for cell in group),
+                    l=min(cell.rect.to_bounding_box().l for cell in group),
+                    t=min(cell.rect.to_bounding_box().t for cell in group),
+                    r=max(cell.rect.to_bounding_box().r for cell in group),
+                    b=max(cell.rect.to_bounding_box().b for cell in group),
+                )
+                return TextCell(
+                    index=group[0].index,
+                    text=merged_text,
+                    orig=merged_text,
+                    rect=BoundingRectangle.from_bounding_box(merged_bbox),
                 )
-                return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
 
             rows = group_rows(cells)
             merged_cells = [cell for row in rows for cell in merge_row(row)]
 
             for i, cell in enumerate(merged_cells, 1):
-                cell.id = i
+                cell.index = i
 
             return merged_cells
 
@@ -181,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
             )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
+                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                 cell_color = (
                     random.randint(30, 140),
                     random.randint(30, 140),
diff --git a/docling/cli/main.py b/docling/cli/main.py
index a2c28fd7..fc3e1a9b 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -14,8 +14,8 @@ from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
@@ -412,12 +412,12 @@ def convert(
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
 
-        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V2:
+        if pdf_backend == PdfBackend.DLPARSE_V2:
             backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V3:
+            backend = DoclingParseV3DocumentBackend  # type: ignore
         elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend
+            backend = PyPdfiumDocumentBackend  # type: ignore
         else:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
 
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 3297c9a5..76827a1b 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
     DocumentStream,
 )
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
     error_message: str
 
 
-class Cell(BaseModel):
-    id: int
-    text: str
-    bbox: BoundingBox
-
-
-class OcrCell(Cell):
-    confidence: float
+# class Cell(BaseModel):
+#    id: int
+#    text: str
+#    bbox: BoundingBox
 
 
 class Cluster(BaseModel):
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
     label: DocItemLabel
     bbox: BoundingBox
     confidence: float = 1.0
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
     children: List["Cluster"] = []  # Add child cluster support
 
 
@@ -226,7 +223,8 @@ class Page(BaseModel):
     page_no: int
     # page_hash: Optional[str] = None
     size: Optional[Size] = None
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
+    parsed_page: Optional[SegmentedPdfPage] = None
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 3a55ecfc..43fa6c7e 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -299,8 +299,8 @@ class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
 
     PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
     DLPARSE_V2 = "dlparse_v2"
+    DLPARSE_V3 = "dlparse_v3"
 
 
 # Define an enum for the ocr engines
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 9afb7dde..4b153ff6 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -6,11 +6,12 @@ from typing import Iterable, List
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
 
-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
         p.dimension = 2
         idx = index.Index(properties=p)
         for i, cell in enumerate(programmatic_cells):
-            idx.insert(i, cell.bbox.as_tuple())
+            idx.insert(i, cell.rect.to_bounding_box().as_tuple())
 
         def is_overlapping_with_existing_cells(ocr_cell):
             # Query the R-tree to get overlapping rectangles
-            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
+            possible_matches_index = list(
+                idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
+            )
 
             return (
                 len(possible_matches_index) > 0
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
         """
         if self.options.force_full_page_ocr:
             # If a full page OCR is forced, use only the OCR cells
-            cells = [
-                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
-                for c_ocr in ocr_cells
-            ]
+            cells = ocr_cells
             return cells
 
         ## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
 
         # Draw OCR and programmatic cells
         for tc in page.cells:
-            x0, y0, x1, y1 = tc.bbox.as_tuple()
+            x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
             y0 *= scale_x
             y1 *= scale_y
             x0 *= scale_x
@@ -165,9 +165,10 @@ class BaseOcrModel(BasePageModel):
             if y1 <= y0:
                 y1, y0 = y0, y1
 
-            color = "gray"
-            if isinstance(tc, OcrCell):
-                color = "magenta"
+            color = "magenta"
+            if isinstance(tc, PdfTextCell):
+                color = "gray"
+
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)
 
         if show:
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 0eccb988..59b9f2ba 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
 
-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
@@ -148,18 +149,21 @@ class EasyOcrModel(BaseOcrModel):
                         del im
 
                         cells = [
-                            OcrCell(
-                                id=ix,
+                            TextCell(
+                                index=ix,
                                 text=line[1],
+                                orig=line[1],
                                 confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                 ),
                             )
                             for ix, line in enumerate(result)
diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
index 38bcf1ca..74b25a73 100644
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -3,8 +3,9 @@ import tempfile
 from typing import Iterable, Optional, Tuple
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrMacOptions
 from docling.datamodel.settings import settings
@@ -94,13 +95,16 @@ class OcrMacModel(BaseOcrModel):
                             bottom = y2 / self.scale
 
                             cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                     text=text,
+                                    orig=text,
                                     confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                     ),
                                 )
                             )
diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
index 63f1a4f6..2ac52be1 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -54,6 +54,7 @@ class PagePreprocessingModel(BasePageModel):
         assert page._backend is not None
 
         page.cells = list(page._backend.get_text_cells())
+        page.parsed_page = page._backend.get_segmented_page()
 
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):
diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py
index fa3fbedf..f13fd6cc 100644
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -3,8 +3,9 @@ from typing import Iterable
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
@@ -100,18 +101,25 @@ class RapidOcrModel(BaseOcrModel):
 
                         if result is not None:
                             cells = [
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                     text=line[1],
+                                    orig=line[1],
                                     confidence=line[2],
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(
-                                            (line[0][0][0] / self.scale) + ocr_rect.l,
-                                            (line[0][0][1] / self.scale) + ocr_rect.t,
-                                            (line[0][2][0] / self.scale) + ocr_rect.l,
-                                            (line[0][2][1] / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(
+                                                (line[0][0][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][0][1] / self.scale)
+                                                + ocr_rect.t,
+                                                (line[0][2][0] / self.scale)
+                                                + ocr_rect.l,
+                                                (line[0][2][1] / self.scale)
+                                                + ocr_rect.t,
+                                            ),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
                                     ),
                                 )
                                 for ix, line in enumerate(result)
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 64979157..c0225a0d 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
 
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
+from docling_core.types.doc.page import BoundingRectangle
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
 
             for cell in table_element.cluster.cells:
-                x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
                 x0 *= scale_x
                 x1 *= scale_x
                 y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
                                 # Only allow non empty stings (spaces) into the cells of a table
                                 if len(c.text.strip()) > 0:
                                     new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
+                                    new_cell.rect = BoundingRectangle.from_bounding_box(
+                                        new_cell.rect.to_bounding_box().scaled(
+                                            scale=self.scale
+                                        )
                                     )
 
-                                    tokens.append(new_cell.model_dump())
+                                    tokens.append(
+                                        {
+                                            "id": new_cell.index,
+                                            "text": new_cell.text,
+                                            "bbox": new_cell.rect.to_bounding_box().model_dump(),
+                                        }
+                                    )
                             page_input["tokens"] = tokens
 
                             tf_output = self.tf_predictor.multi_table_predict(
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index ac8dd51f..a4744c52 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple
 
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
 
-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -228,18 +229,21 @@ class TesseractOcrCliModel(BaseOcrModel):
                             t = b + h
                             r = l + w
 
-                            cell = OcrCell(
-                                id=ix,
+                            cell = TextCell(
+                                index=ix,
                                 text=text,
+                                orig=text,
                                 confidence=conf / 100.0,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (l / self.scale) + ocr_rect.l,
-                                        (b / self.scale) + ocr_rect.t,
-                                        (r / self.scale) + ocr_rect.l,
-                                        (t / self.scale) + ocr_rect.t,
-                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (l / self.scale) + ocr_rect.l,
+                                            (b / self.scale) + ocr_rect.t,
+                                            (r / self.scale) + ocr_rect.l,
+                                            (t / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
                                 ),
                             )
                             all_ocr_cells.append(cell)
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index c41806f5..7ee6d377 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -2,8 +2,9 @@ import logging
 from typing import Iterable
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
 
-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -173,13 +174,16 @@ class TesseractOcrModel(BaseOcrModel):
                             top = (box["y"] + box["h"]) / self.scale
 
                             cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                     text=text,
+                                    orig=text,
                                     confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        ),
                                     ),
                                 )
                             )
diff --git a/docling/utils/export.py b/docling/utils/export.py
index 5b022f4a..d480c664 100644
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -2,9 +2,9 @@ import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import PdfTextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
 
-from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page
 
 _log = logging.getLogger(__name__)
@@ -86,11 +86,15 @@ def generate_multimodal_pages(
         if page.size is None:
             return cells
         for cell in page.cells:
-            new_bbox = cell.bbox.to_top_left_origin(
-                page_height=page.size.height
-            ).normalized(page_size=page.size)
-            is_ocr = isinstance(cell, OcrCell)
-            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
+            new_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page.size.height)
+                .normalized(page_size=page.size)
+            )
+            # Programmatic PDF cells are PdfTextCell instances; any other cell
+            # is treated as OCR output (same check as the OCR debug drawing).
+            is_ocr = not isinstance(cell, PdfTextCell)
+            ocr_confidence = cell.confidence if is_ocr else 1.0
             cells.append(
                 {
                     "text": cell.text,
diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py
index e2b950f4..771b4207 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -5,9 +5,10 @@ from collections import defaultdict
 from typing import Dict, List, Set, Tuple
 
 from docling_core.types.doc import DocItemLabel, Size
+from docling_core.types.doc.page import TextCell
 from rtree import index
 
-from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
+from docling.datamodel.base_models import BoundingBox, Cluster
 
 _log = logging.getLogger(__name__)
 
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
+    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
         """Initialize processor with cells and clusters."""
         """Initialize processor with cells and spatial indices."""
         self.cells = cells
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
             [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
         )
 
-    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
+    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
         """Main processing pipeline."""
         self.regular_clusters = self._process_regular_clusters()
         self.special_clusters = self._process_special_clusters()
@@ -272,14 +273,14 @@ class LayoutPostprocessor:
             orphan_clusters = []
             for i, cell in enumerate(unassigned):
                 conf = 1.0
-                if isinstance(cell, OcrCell):
+                if isinstance(cell, TextCell):
                     conf = cell.confidence
 
                 orphan_clusters.append(
                     Cluster(
                         id=next_id + i,
                         label=DocItemLabel.TEXT,
-                        bbox=cell.bbox,
+                        bbox=cell.rect.to_bounding_box(),
                         confidence=conf,
                         cells=[cell],
                     )
@@ -557,13 +558,13 @@ class LayoutPostprocessor:
 
         return current_best if current_best else clusters[0]
 
-    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
         """Ensure each cell appears only once, maintaining order of first appearance."""
         seen_ids = set()
         unique_cells = []
         for cell in cells:
-            if cell.id not in seen_ids:
-                seen_ids.add(cell.id)
+            if cell.index not in seen_ids:
+                seen_ids.add(cell.index)
                 unique_cells.append(cell)
         return unique_cells
 
@@ -582,11 +583,13 @@ class LayoutPostprocessor:
             best_cluster = None
 
             for cluster in clusters:
-                if cell.bbox.area() <= 0:
+                if cell.rect.to_bounding_box().area() <= 0:
                     continue
 
-                overlap = cell.bbox.intersection_area_with(cluster.bbox)
-                overlap_ratio = overlap / cell.bbox.area()
+                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                    cluster.bbox
+                )
+                overlap_ratio = overlap / cell.rect.to_bounding_box().area()
 
                 if overlap_ratio > best_overlap:
                     best_overlap = overlap_ratio
@@ -601,11 +604,13 @@ class LayoutPostprocessor:
 
         return clusters
 
-    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
+    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
         """Find cells not assigned to any cluster."""
-        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
+        assigned = {cell.index for cluster in clusters for cell in cluster.cells}
         return [
-            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
+            cell
+            for cell in self.cells
+            if cell.index not in assigned and cell.text.strip()
         ]
 
     def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
@@ -615,10 +620,10 @@ class LayoutPostprocessor:
                 continue
 
             cells_bbox = BoundingBox(
-                l=min(cell.bbox.l for cell in cluster.cells),
-                t=min(cell.bbox.t for cell in cluster.cells),
-                r=max(cell.bbox.r for cell in cluster.cells),
-                b=max(cell.bbox.b for cell in cluster.cells),
+                l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
+                t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
+                r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
+                b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
             )
 
             if cluster.label == DocItemLabel.TABLE:
@@ -634,9 +639,9 @@ class LayoutPostprocessor:
 
         return clusters
 
-    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
+    def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
         """Sort cells in native reading order."""
-        return sorted(cells, key=lambda c: (c.id))
+        return sorted(cells, key=lambda c: (c.index))
 
     def _sort_clusters(
         self, clusters: List[Cluster], mode: str = "id"
@@ -647,7 +652,7 @@ class LayoutPostprocessor:
                 clusters,
                 key=lambda cluster: (
                     (
-                        min(cell.id for cell in cluster.cells)
+                        min(cell.index for cell in cluster.cells)
                         if cluster.cells
                         else sys.maxsize
                     ),
diff --git a/docling/utils/visualization.py b/docling/utils/visualization.py
index e7ea24a5..6c4815fa 100644
--- a/docling/utils/visualization.py
+++ b/docling/utils/visualization.py
@@ -25,7 +25,7 @@ def draw_clusters(
             # Draw cells first (underneath)
             cell_color = (0, 0, 0, 40)  # Transparent black for cells
             for tc in c.cells:
-                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
                 cx0 *= scale_x
                 cx1 *= scale_x
                 cy0 *= scale_x
diff --git a/poetry.lock b/poetry.lock
index 9c62c494..cccabac7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
 type = "git"
 url = "https://github.com/DS4SD/docling-core"
 reference = "cau/docling-parse-types"
-resolved_reference = "31db5b0225a4baa8be5f26cc50050cf4bc845204"
+resolved_reference = "5f404c0270408ba794c18f8d6923cfa9f2980d73"
 
 [[package]]
 name = "docling-ibm-models"
diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py
deleted file mode 100644
index 3c214791..00000000
--- a/tests/test_backend_docling_parse.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from pathlib import Path
-
-import pytest
-from docling_core.types.doc import BoundingBox
-
-from docling.backend.docling_parse_backend import (
-    DoclingParseDocumentBackend,
-    DoclingParsePageBackend,
-)
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
-
-
-@pytest.fixture
-def test_doc_path():
-    return Path("./tests/data/pdf/2206.01062.pdf")
-
-
-def _get_backend(pdf_doc):
-    in_doc = InputDocument(
-        path_or_stream=pdf_doc,
-        format=InputFormat.PDF,
-        backend=DoclingParseDocumentBackend,
-    )
-
-    doc_backend = in_doc._backend
-    return doc_backend
-
-
-def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
-
-    doc_backend = _get_backend(pdf_doc)
-
-    for page_index in range(0, doc_backend.page_count()):
-        last_cell_count = None
-        for i in range(10):
-            page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
-            cells = list(page_backend.get_text_cells())
-
-            if last_cell_count is None:
-                last_cell_count = len(cells)
-
-            if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
-            last_cell_count = len(cells)
-
-
-def test_get_text_from_rect(test_doc_path):
-    doc_backend = _get_backend(test_doc_path)
-    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
-
-    # Get the title text of the DocLayNet paper
-    textpiece = page_backend.get_text_in_rect(
-        bbox=BoundingBox(l=102, t=77, r=511, b=124)
-    )
-    ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
-
-    assert textpiece.strip() == ref
-
-
-def test_crop_page_image(test_doc_path):
-    doc_backend = _get_backend(test_doc_path)
-    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
-
-    # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
-        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
-    )
-    # im.show()
-
-
-def test_num_pages(test_doc_path):
-    doc_backend = _get_backend(test_doc_path)
-    doc_backend.page_count() == 9
diff --git a/tests/test_backend_docling_parse_v3.py b/tests/test_backend_docling_parse_v3.py
index 8230e223..17a951c0 100644
--- a/tests/test_backend_docling_parse_v3.py
+++ b/tests/test_backend_docling_parse_v3.py
@@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
 
 @pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/2206.01062.pdf")
+    return Path("./tests/data/pdf/2206.01062.pdf")
 
 
 def _get_backend(pdf_doc):
diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py
index ac7a1587..a607c09d 100644
--- a/tests/test_code_formula.py
+++ b/tests/test_code_formula.py
@@ -3,7 +3,6 @@ from pathlib import Path
 from docling_core.types.doc import CodeItem, TextItem
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py
index d2215d61..427c0c84 100644
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -33,7 +33,7 @@ def get_converter():
     converter = DocumentConverter(
         format_options={
             InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
+                pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
             )
         }
     )
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 62e4c855..8c75b1b3 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -2,7 +2,7 @@ import sys
 from pathlib import Path
 from typing import List
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -44,7 +44,7 @@ def get_converter(ocr_options: OcrOptions):
         format_options={
             InputFormat.PDF: PdfFormatOption(
                 pipeline_options=pipeline_options,
-                backend=DoclingParseDocumentBackend,
+                backend=DoclingParseV3DocumentBackend,
             )
         }
     )
diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py
index 1978bc74..dcc6e510 100644
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pytest
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -30,7 +30,7 @@ def converter():
     converter = DocumentConverter(
         format_options={
             InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
+                pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
             )
         }
     )
diff --git a/tests/test_options.py b/tests/test_options.py
index c8701a1b..ffb114a9 100644
--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pytest
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -33,7 +33,7 @@ def get_converters_with_table_options():
                 format_options={
                     InputFormat.PDF: PdfFormatOption(
                         pipeline_options=pipeline_options,
-                        backend=DoclingParseDocumentBackend,
+                        backend=DoclingParseV3DocumentBackend,
                     )
                 }
             )
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 45152e0a..02861a8b 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -79,8 +79,8 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
             pred_text = cell_pred_item.text
             assert true_text == pred_text, f"{true_text}!={pred_text}"
 
-            true_bbox = cell_true_item.bbox.as_tuple()
-            pred_bbox = cell_pred_item.bbox.as_tuple()
+            true_bbox = cell_true_item.rect.to_bounding_box().as_tuple()
+            pred_bbox = cell_pred_item.rect.to_bounding_box().as_tuple()
             assert (
                 true_bbox == pred_bbox
             ), f"bbox is not the same: {true_bbox} != {pred_bbox}"