Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-06-10 19:55:49 +02:00 · 2025-06-10 19:55:49 +02:00 · d73c9a2995
commit d73c9a2995
parent e310c5cff3
58 changed files with 349497 additions and 331004 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -7,7 +7,13 @@ from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
    BoundingRectangle,
    PdfPageBoundaryType,
    PdfPageGeometry,
    SegmentedPdfPage,
    TextCell,
 )
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@ -36,6 +42,51 @@ class DoclingParsePageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        """Return the ``valid`` flag stored on this page backend."""
        return self.valid
    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse data.

        Every entry of ``self._dpage["cells"]`` becomes one ``TextCell``. The
        entry's device box is rescaled from the parser's page dimensions
        (``self._dpage["width"]`` / ``self._dpage["height"]``) to the size
        returned by ``self.get_size()`` and converted to a top-left origin.

        Returns:
            The text cells in parser order, with ``index`` counting up from 0.
            An empty list if the page is not valid.
        """
        if not self.valid:
            return []

        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        cells: List[TextCell] = []
        for index, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]

            # The corners may arrive in either order; make sure x0 <= x1 and
            # y0 <= y1 before they are used as left/bottom and right/top.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell["content"]["rnormalized"]
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            # Rescale parser coordinates to the page size.
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
        return cells
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
@ -70,75 +121,45 @@ class DoclingParsePageBackend(PdfPageBackend):
        return text_piece
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return None
    def get_text_cells(self) -> Iterable[TextCell]:
        cells: List[TextCell] = []
        cell_counter = 0
        if not self.valid:
-            return cells
+            return None
        page_size = self.get_size()
        text_cells = self._compute_text_cells()
-        parser_width = self._dpage["width"]
+        # Create page geometry
-        parser_height = self._dpage["height"]
+        crop_bbox = BoundingBox(
            l=0,
            r=page_size.width,
            t=0,
            b=page_size.height,
            coord_origin=CoordOrigin.TOPLEFT,
        ).to_bottom_left_origin(page_size.height)
-        for i in range(len(self._dpage["cells"])):
+        dimension = PdfPageGeometry(
-            rect = self._dpage["cells"][i]["box"]["device"]
+            angle=0.0,
-            x0, y0, x1, y1 = rect
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=crop_bbox,
            bleed_bbox=crop_bbox,
            crop_bbox=crop_bbox,
            media_bbox=crop_bbox,
            trim_bbox=crop_bbox,
        )
-            if x1 < x0:
+        # Create SegmentedPdfPage
-                x0, x1 = x1, x0
+        return SegmentedPdfPage(
-            if y1 < y0:
+            dimension=dimension,
-                y0, y1 = y1, y0
+            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_lines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+    def get_text_cells(self) -> Iterable[TextCell]:
-            cells.append(
+        return self._compute_text_cells()
                TextCell(
                    index=cell_counter,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            # l=x0, b=y0, r=x1, t=y1,
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1
        def draw_clusters_and_cells():
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()
        # before merge:
        # draw_clusters_and_cells()
        # cells = merge_horizontal_cells(cells)
        # after merge:
        # draw_clusters_and_cells()
        return cells
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@ -7,7 +7,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
    BoundingRectangle,
    PdfPageBoundaryType,
    PdfPageGeometry,
    SegmentedPdfPage,
    TextCell,
 )
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@ -40,6 +46,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        """Return the ``valid`` flag stored on this page backend."""
        return self.valid
    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse v2 data.

        The cells are read from ``self._dpage["sanitized"]["cells"]``, a table
        made of a ``header`` (column names) and ``data`` (one row per cell).
        Each row's box is rescaled from the parser's page dimensions
        (``self._dpage["sanitized"]["dimension"]``) to the size returned by
        ``self.get_size()`` and converted to a top-left origin.

        Returns:
            The text cells in row order, with ``index`` counting up from 0.
            An empty list if the page is not valid or has no cell rows.

        Raises:
            ValueError: If there are cell rows but the header lacks one of the
                ``x0``, ``y0``, ``x1``, ``y1`` or ``text`` columns.
        """
        if not self.valid:
            return []

        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]
        if not cells_data:
            # No rows to convert, so the header columns are never consulted.
            return []

        # Resolve the column positions once rather than searching the header
        # again for every field of every row.
        x0_col = cells_header.index("x0")
        y0_col = cells_header.index("y0")
        x1_col = cells_header.index("x1")
        y1_col = cells_header.index("y1")
        text_col = cells_header.index("text")

        cells: List[TextCell] = []
        for index, cell_data in enumerate(cells_data):
            x0 = cell_data[x0_col]
            y0 = cell_data[y0_col]
            x1 = cell_data[x1_col]
            y1 = cell_data[y1_col]

            # The corners may arrive in either order; make sure x0 <= x1 and
            # y0 <= y1 before they are used as left/bottom and right/top.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell_data[text_col]
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            # Rescale parser coordinates to the page size.
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
        return cells
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
@ -81,73 +136,45 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return text_piece
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return None
    def get_text_cells(self) -> Iterable[TextCell]:
        cells: List[TextCell] = []
        cell_counter = 0
        if not self.valid:
-            return cells
+            return None
        page_size = self.get_size()
        text_cells = self._compute_text_cells()
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        # Create page geometry
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+        crop_bbox = BoundingBox(
            l=0,
            r=page_size.width,
            t=0,
            b=page_size.height,
            coord_origin=CoordOrigin.TOPLEFT,
        ).to_bottom_left_origin(page_size.height)
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        dimension = PdfPageGeometry(
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
+            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(crop_bbox),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=crop_bbox,
            bleed_bbox=crop_bbox,
            crop_bbox=crop_bbox,
            media_bbox=crop_bbox,
            trim_bbox=crop_bbox,
        )
-        for i, cell_data in enumerate(cells_data):
+        # Create SegmentedPdfPage
-            x0 = cell_data[cells_header.index("x0")]
+        return SegmentedPdfPage(
-            y0 = cell_data[cells_header.index("y0")]
+            dimension=dimension,
-            x1 = cell_data[cells_header.index("x1")]
+            textline_cells=text_cells,
-            y1 = cell_data[cells_header.index("y1")]
+            char_cells=[],
            word_cells=[],
            has_lines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )
-            if x1 < x0:
+    def get_text_cells(self) -> Iterable[TextCell]:
-                x0, x1 = x1, x0
+        return self._compute_text_cells()
            if y1 < y0:
                y0, y1 = y1, y0
            text_piece = cell_data[cells_header.index("text")]
            cells.append(
                TextCell(
                    index=cell_counter,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            # l=x0, b=y0, r=x1, t=y1,
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1
        def draw_clusters_and_cells():
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()
        # draw_clusters_and_cells()
        return cells
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
    BoundingRectangle,
    PdfPageBoundaryType,
    PdfPageGeometry,
    SegmentedPdfPage,
    TextCell,
 )
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
@ -41,38 +47,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        """Return the ``valid`` flag stored on this page backend."""
        return self.valid
-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+    def _compute_text_cells(self) -> List[TextCell]:
-        AREA_THRESHOLD = 0  # 32 * 32
+        """Compute text cells from pypdfium."""
        page_size = self.get_size()
        with pypdfium2_lock:
            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                pos = obj.get_pos()
                cropbox = BoundingBox.from_tuple(
                    pos, origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height=page_size.height)
                if cropbox.area() > AREA_THRESHOLD:
                    cropbox = cropbox.scaled(scale=scale)
                    yield cropbox
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)
        with pypdfium2_lock:
            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
        return text_piece
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return None
    def get_text_cells(self) -> Iterable[TextCell]:
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
@ -203,30 +179,76 @@ class PyPdfiumPageBackend(PdfPageBackend):
            return merged_cells
-        def draw_clusters_and_cells():
+        return merge_horizontal_cells(cells)
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()
-        # before merge:
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        # draw_clusters_and_cells()
+        AREA_THRESHOLD = 0  # 32 * 32
        page_size = self.get_size()
        with pypdfium2_lock:
            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                pos = obj.get_pos()
                cropbox = BoundingBox.from_tuple(
                    pos, origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height=page_size.height)
-        cells = merge_horizontal_cells(cells)
+                if cropbox.area() > AREA_THRESHOLD:
                    cropbox = cropbox.scaled(scale=scale)
-        # after merge:
+                    yield cropbox
        # draw_clusters_and_cells()
-        return cells
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)
        with pypdfium2_lock:
            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
        return text_piece
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Build a ``SegmentedPdfPage`` holding this page's text-line cells.

        The page geometry is synthesized from ``self.get_size()``: a single
        box spanning the whole page is used for the art, bleed, crop, media
        and trim boxes alike. Only text-line cells are filled in; the char
        and word cell lists are left empty.

        Returns:
            The segmented page, or ``None`` if the page is not valid.
        """
        if not self.valid:
            return None

        size = self.get_size()
        line_cells = self._compute_text_cells()

        # One full-page box, converted to a bottom-left origin, stands in for
        # every page boundary below.
        full_page_box = BoundingBox(
            l=0,
            r=size.width,
            t=0,
            b=size.height,
            coord_origin=CoordOrigin.TOPLEFT,
        ).to_bottom_left_origin(size.height)

        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(full_page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=full_page_box,
            bleed_bbox=full_page_box,
            crop_bbox=full_page_box,
            media_bbox=full_page_box,
            trim_bbox=full_page_box,
        )

        return SegmentedPdfPage(
            dimension=geometry,
            textline_cells=line_cells,
            char_cells=[],
            word_cells=[],
            has_lines=bool(line_cells),
            has_words=False,
            has_chars=False,
        )
    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells produced by ``self._compute_text_cells()``."""
        return self._compute_text_cells()
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -232,7 +232,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@ -245,6 +244,14 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
    @property
    def cells(self) -> List[TextCell]:
        """Text cells of the page, read from ``parsed_page.textline_cells``.

        Returns the list held by ``parsed_page`` itself (not a copy), or a
        new empty list while ``parsed_page`` is ``None``.
        """
        parsed = self.parsed_page
        if parsed is None:
            return []
        return parsed.textline_cells
    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )
-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
        True  # Always True since parsed_page is now mandatory
    )
 class PdfPipeline(str, Enum):
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -133,20 +133,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
    def post_process_cells(self, ocr_cells, page):
        r"""
        Post-process the OCR cells and update the page object.
-        Treats page.parsed_page as authoritative when available, with page.cells for compatibility.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
-        # Get existing cells (prefer parsed_page, fallback to page.cells)
+        # Get existing cells from the read-only property
-        existing_cells = self._get_existing_cells(page)
+        existing_cells = page.cells
        # Combine existing and OCR cells with overlap filtering
        final_cells = self._combine_cells(existing_cells, ocr_cells)
-        # Update both structures efficiently
+        assert page.parsed_page is not None
        self._update_page_structures(page, final_cells)
-    def _get_existing_cells(self, page):
+        # Update parsed_page.textline_cells directly
-        """Get existing cells, preferring parsed_page when available."""
+        page.parsed_page.textline_cells = final_cells
-        return page.parsed_page.textline_cells if page.parsed_page else page.cells
+        page.parsed_page.has_lines = bool(final_cells)
    def _combine_cells(self, existing_cells, ocr_cells):
        """Combine existing and OCR cells with filtering and re-indexing."""
@ -162,18 +161,6 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        return combined
    def _update_page_structures(self, page, final_cells):
        """Store ``final_cells`` on the page.

        When ``page.parsed_page`` is truthy, its ``textline_cells`` and
        ``has_lines`` are refreshed first; ``page.cells`` is assigned in
        every case.
        """
        parsed = page.parsed_page
        if parsed:
            # parsed_page is the primary holder of the text-line cells.
            parsed.textline_cells = final_cells
            parsed.has_lines = bool(final_cells)

        # Always mirrored onto page.cells, with or without a parsed page.
        page.cells = final_cells
    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
        scale_x = image.width / page.size.width
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                            )
                        )
                    # page.cells is already updated by LayoutPostprocessor
                    page.predictions.layout = LayoutPrediction(
                        clusters=processed_clusters
                    )
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -2,7 +2,7 @@ import re
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional
 import numpy as np
 from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
 class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
    create_parsed_page: bool
 class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None
-        page.cells = list(page._backend.get_text_cells())
+        page.parsed_page = page._backend.get_segmented_page()
-
+        assert page.parsed_page is not None
        if self.options.create_parsed_page:
            page.parsed_page = page._backend.get_segmented_page()
        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                            all_ocr_cells.extend(cells)
                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page.cells)
                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@ -196,8 +196,7 @@ class LayoutPostprocessor:
    def __init__(self, page, clusters: List[Cluster]):
        """Initialize processor with page and clusters."""
-        # Get cells from best available source (prefer parsed_page)
+        self.cells = page.cells
        self.cells = self._get_page_cells(page)
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
@ -215,24 +214,6 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )
    def _get_page_cells(self, page):
        """Return the page's text cells, preferring ``parsed_page``.

        ``page.parsed_page.textline_cells`` is returned when ``parsed_page``
        is not ``None``; otherwise ``page.cells`` is returned.
        """
        parsed = page.parsed_page
        if parsed is None:
            return page.cells
        return parsed.textline_cells
    def _update_page_structures(self, final_cells):
        """Store ``final_cells`` on ``self.page``.

        When ``self.page.parsed_page`` is not ``None``, its
        ``textline_cells`` and ``has_lines`` are refreshed first;
        ``self.page.cells`` is assigned in every case.
        """
        page = self.page
        parsed = page.parsed_page
        if parsed is not None:
            # parsed_page is the primary holder of the text-line cells.
            parsed.textline_cells = final_cells
            parsed.has_lines = len(final_cells) > 0

        # Assigned unconditionally, with or without a parsed page.
        page.cells = final_cells
    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
@ -259,8 +240,9 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)
-        # Update page structures with processed cells
+        assert self.page.parsed_page is not None
-        self._update_page_structures(self.cells)
+        self.page.parsed_page.textline_cells = self.cells
        self.page.parsed_page.has_lines = len(self.cells) > 0
        return final_clusters, self.cells
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v1/multi_page.pages.json
+++ b/tests/data/groundtruth/docling_v1/multi_page.pages.json
--- a/tests/data/groundtruth/docling_v1/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v2/multi_page.pages.json
+++ b/tests/data/groundtruth/docling_v2/multi_page.pages.json
--- a/tests/data/groundtruth/docling_v2/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v2/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            69.6796630536824,
+            72.33333333333333,
-            689.0124221922704,
+            691.5883585611979,
-            504.8720051760782,
+            503.3333333333333,
-            764.9216921155637
+            763.9216918945312
          ],
          "page": 1,
          "span": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
@ -1,4 +1,4 @@
 <document>
-<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
+<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
 <paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json
@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            441.2561096985719,
+            444.6666666666667,
-            131.89488404865142,
+            131.58835856119788,
-            522.0347860494834,
+            521.6666666666666,
-            151.87873262042876
+            150.25502522786462
          ],
          "page": 1,
          "span": [
@ -67,10 +67,10 @@
      "prov": [
        {
          "bbox": [
-            89.23887497045128,
+            92.0,
-            77.02339852098021,
+            77.92169189453125,
-            523.208764293368,
+            523.0,
-            124.75312428291147
+            123.25502522786462
          ],
          "page": 1,
          "span": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
@ -1,3 +1,3 @@
 <document>
-<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
+<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json
@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            690.2441821046808,
+            691.6666666666666,
-            442.39487414368364,
+            444.53450520833337,
-            709.8255852011977,
+            710.3333333333334,
-            523.076601235155
+            521.5345052083334
          ],
          "page": 1,
          "span": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
@ -1,3 +1,3 @@
 <document>
-<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
+<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            131.21306574279092,
+            131.66666666666666,
-            74.12495603322407,
+            73.53450520833337,
-            152.19606490864376,
+            150.33333333333334,
-            154.19400205373182
+            150.53450520833331
          ],
          "page": 1,
          "span": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt
@ -1,2 +1,2 @@
-<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
+<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@ -42,10 +42,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 69.6796630536824,
+            "l": 72.33333333333333,
-            "t": 764.9216921155637,
+            "t": 763.9216918945312,
-            "r": 504.8720051760782,
+            "r": 503.3333333333333,
-            "b": 689.0124221922704,
+            "b": 691.5883585611979,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt
@ -1,3 +1,3 @@
-<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
+<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
-<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
+<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
@ -45,10 +45,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 441.2561096985719,
+            "l": 444.6666666666667,
-            "t": 151.87873262042876,
+            "t": 150.25502522786462,
-            "r": 522.0347860494834,
+            "r": 521.6666666666666,
-            "b": 131.89488404865142,
+            "b": 131.58835856119788,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
@ -74,10 +74,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 89.23887497045128,
+            "l": 92.0,
-            "t": 124.75312428291147,
+            "t": 123.25502522786462,
-            "r": 523.208764293368,
+            "r": 523.0,
-            "b": 77.02339852098021,
+            "b": 77.92169189453125,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt
@ -1,3 +1,3 @@
-<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_410><loc_61><loc_422><loc_128>package</text>
+<text><loc_411><loc_62><loc_422><loc_127>package</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
@ -45,10 +45,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 717.168585936602,
+            "l": 718.6666666666666,
-            "t": 524.2990550512769,
+            "t": 522.8678385416666,
-            "r": 764.8982839673505,
+            "r": 764.0,
-            "b": 90.3291657283603,
+            "b": 91.86783854166669,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
@ -74,10 +74,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 690.2441821046808,
+            "l": 691.6666666666666,
-            "t": 523.076601235155,
+            "t": 521.5345052083334,
-            "r": 709.8255852011977,
+            "r": 710.3333333333334,
-            "b": 442.39487414368364,
+            "b": 444.53450520833337,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt
@ -1,3 +1,3 @@
-<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_78><loc_370><loc_90><loc_438>package</text>
+<text><loc_78><loc_374><loc_89><loc_438>package</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
@ -45,10 +45,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 77.10171545548258,
+            "l": 78.0,
-            "t": 506.0744964609271,
+            "t": 503.201171875,
-            "r": 126.08064862014129,
+            "r": 123.33333333333333,
-            "b": 71.87755635676046,
+            "b": 72.201171875,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
@ -74,10 +74,10 @@
        {
          "page_no": 1,
          "bbox": {
-            "l": 131.21306574279092,
+            "l": 131.66666666666666,
-            "t": 154.19400205373182,
+            "t": 150.53450520833331,
-            "r": 152.19606490864376,
+            "r": 150.33333333333334,
-            "b": 74.12495603322407,
+            "b": 73.53450520833337,
            "coord_origin": "BOTTOMLEFT"
          },
          "charspan": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
`@ -1,2 +1,2 @@`
	`<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>`	`<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>`
	`</doctag>`	`</doctag>`