Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-26 20:14:47 +00:00)

Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

This commit is contained in: parent e310c5cff3, commit d73c9a2995
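At a glance, the commit replaces the free-standing Page.cells list with a read-only view over page.parsed_page.textline_cells, so the SegmentedPdfPage produced by the PDF backends becomes the single source of truth for text cells. The sketch below is schematic only (stub classes stand in for the real docling models); the property body mirrors the base_models.py hunk further down.

# Schematic sketch of the new contract; not the real docling classes.
from typing import List, Optional


class ParsedPageStub:
    """Stand-in for docling_core's SegmentedPdfPage."""

    def __init__(self, textline_cells: Optional[List[str]] = None):
        self.textline_cells = textline_cells or []
        self.has_lines = bool(self.textline_cells)


class PageStub:
    """Stand-in for docling.datamodel.base_models.Page after this commit."""

    def __init__(self, parsed_page: Optional[ParsedPageStub] = None):
        self.parsed_page = parsed_page

    @property
    def cells(self) -> List[str]:
        # Read-only view: parsed_page is the only source of truth.
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
        return []


page = PageStub(ParsedPageStub(["cell A", "cell B"]))
print(page.cells)                                  # ['cell A', 'cell B']
page.parsed_page.textline_cells.append("cell C")
print(page.cells)                                  # updates flow through parsed_page only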
@@ -7,7 +7,13 @@ from typing import List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
@@ -36,6 +42,51 @@ class DoclingParsePageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+
+            cell_counter += 1
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
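The cell boxes come back from docling-parse in the parser's own page coordinates with a bottom-left origin; _compute_text_cells rescales them to the reported page size and flips them to a top-left origin. A small worked example of that conversion, using only the BoundingBox helpers imported above (the numbers are made up for illustration):

from docling_core.types.doc import BoundingBox, CoordOrigin

# Illustrative values: parser reports a 612x792 pt page, same as the rendered page size.
page_width, page_height = 612.0, 792.0
parser_width, parser_height = 612.0, 792.0

x0, y0, x1, y1 = 100.0, 700.0, 200.0, 720.0  # bottom-left origin, parser units

bbox = BoundingBox(
    l=x0 * page_width / parser_width,
    b=y0 * page_height / parser_height,
    r=x1 * page_width / parser_width,
    t=y1 * page_height / parser_height,
    coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_height)

# In bottom-left coordinates the cell sits near the top of the page (y in 700..720);
# after the flip, t and b become 792 - 720 and 792 - 700.
print(bbox.t, bbox.b)  # expected: 72.0 92.0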
@@ -70,75 +121,45 @@ class DoclingParsePageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
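With get_segmented_page now returning a populated SegmentedPdfPage instead of None, downstream code can ask the backend for the segmented page once and read line cells from it. A hedged usage sketch; the converter and backend wiring is elided, and dump_text_lines is a hypothetical helper, not part of docling:

from docling_core.types.doc.page import SegmentedPdfPage


def dump_text_lines(seg_page: SegmentedPdfPage) -> None:
    """Print every text line cell of a segmented page (illustrative only)."""
    if not seg_page.has_lines:
        return
    for cell in seg_page.textline_cells:
        # Each TextCell carries the normalized text plus its rectangle in
        # top-left-origin page coordinates (see _compute_text_cells above).
        print(cell.index, repr(cell.text), cell.rect)


# Typical call site (hypothetical): seg = backend.get_segmented_page(); dump_text_lines(seg)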
@@ -7,7 +7,13 @@ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
@@ -40,6 +46,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse v2 data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell_data[cells_header.index("text")]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
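The v2 parser returns the sanitized cells as a columnar table: a header list of column names and a data list of rows, so each field is fetched by cells_header.index(...). A tiny sketch of that lookup pattern with made-up data; resolving the column positions once outside the loop is an optional optimization over the per-row index() calls in the diff:

# Made-up miniature of the docling-parse v2 "sanitized" cell table.
cells_header = ["x0", "y0", "x1", "y1", "text"]
cells_data = [
    [72.0, 700.0, 210.5, 715.0, "Docling bundles PDF document conversion"],
    [72.0, 680.0, 160.0, 695.0, "to JSON and Markdown"],
]

# Resolve column positions once instead of calling .index() for every row.
col = {name: cells_header.index(name) for name in cells_header}

for row in cells_data:
    x0, y0, x1, y1 = row[col["x0"]], row[col["y0"]], row[col["x1"]], row[col["y1"]]
    print(f"{row[col['text']]!r} at ({x0}, {y0}) - ({x1}, {y1})")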
@@ -81,73 +136,45 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = cell_data[cells_header.index("text")]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
@@ -41,38 +47,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-        page_size = self.get_size()
-        with pypdfium2_lock:
-            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-                pos = obj.get_pos()
-                cropbox = BoundingBox.from_tuple(
-                    pos, origin=CoordOrigin.BOTTOMLEFT
-                ).to_top_left_origin(page_height=page_size.height)
-
-                if cropbox.area() > AREA_THRESHOLD:
-                    cropbox = cropbox.scaled(scale=scale)
-
-                    yield cropbox
-
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        with pypdfium2_lock:
-            if not self.text_page:
-                self.text_page = self._ppage.get_textpage()
-
-        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
-            bbox = bbox.to_bottom_left_origin(self.get_size().height)
-
-        with pypdfium2_lock:
-            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
-
-        return text_piece
-
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from pypdfium."""
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
@@ -203,30 +179,76 @@ class PyPdfiumPageBackend(PdfPageBackend):

            return merged_cells

-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return merge_horizontal_cells(cells)
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
+
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
+
+                    yield cropbox
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
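The cells pypdfium produces are post-processed by a merge_horizontal_cells helper before being returned; that helper's body is outside this hunk, so the sketch below is a generic illustration of that kind of horizontal merge (fuse cells that share a baseline and nearly touch), not docling's actual implementation:

from typing import List, Tuple

# A cell here is just (x0, y0, x1, y1, text) in top-left-origin coordinates.
Cell = Tuple[float, float, float, float, str]


def merge_horizontal_cells_sketch(cells: List[Cell], max_gap: float = 2.0) -> List[Cell]:
    """Generic illustration: join cells on the same line whose horizontal gap is small."""
    merged: List[Cell] = []
    for cell in sorted(cells, key=lambda c: (round(c[1], 1), c[0])):
        if merged:
            x0, y0, x1, y1, text = merged[-1]
            same_line = abs(cell[1] - y0) < 1.0 and abs(cell[3] - y1) < 1.0
            if same_line and 0 <= cell[0] - x1 <= max_gap:
                merged[-1] = (x0, y0, max(x1, cell[2]), y1, text + cell[4])
                continue
        merged.append(cell)
    return merged


print(merge_horizontal_cells_sketch([(10, 5, 30, 15, "Doc"), (31, 5, 55, 15, "ling")]))
# -> [(10, 5, 55, 15, 'Docling')]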
@@ -232,7 +232,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@@ -245,6 +244,14 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []
+
    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
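Because cells is now a property without a setter, any code that previously assigned page.cells = ... has to write through parsed_page instead, which is exactly what the OCR and layout hunks below do. Schematic before/after, assuming a populated page.parsed_page; apply_final_cells is a hypothetical helper, not part of docling:

# Before this commit (no longer possible, the property has no setter):
# page.cells = final_cells


def apply_final_cells(page, final_cells):
    """Schematic mirror of the OCR/layout write-back pattern in this diff."""
    assert page.parsed_page is not None
    page.parsed_page.textline_cells = final_cells
    page.parsed_page.has_lines = len(final_cells) > 0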
@@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )

-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )


class PdfPipeline(str, Enum):
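Typing generate_parsed_pages as Literal[True] turns the old boolean option into a constant that pydantic refuses to override, which is how the pipeline guarantees parsed_page always exists. A minimal sketch of the pattern on a standalone pydantic model (not the real PdfPipelineOptions):

from typing import Literal

from pydantic import BaseModel, ValidationError


class OptionsSketch(BaseModel):
    # Mirrors the pattern in the diff: the field only admits the value True.
    generate_parsed_pages: Literal[True] = True


OptionsSketch()                                 # fine, defaults to True
OptionsSketch(generate_parsed_pages=True)       # fine
try:
    OptionsSketch(generate_parsed_pages=False)  # rejected by literal validation
except ValidationError:
    print("rejected as expected")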
@@ -133,20 +133,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
    def post_process_cells(self, ocr_cells, page):
        r"""
        Post-process the OCR cells and update the page object.
-        Treats page.parsed_page as authoritative when available, with page.cells for compatibility.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
-        # Get existing cells (prefer parsed_page, fallback to page.cells)
-        existing_cells = self._get_existing_cells(page)
+        # Get existing cells from the read-only property
+        existing_cells = page.cells

        # Combine existing and OCR cells with overlap filtering
        final_cells = self._combine_cells(existing_cells, ocr_cells)

-        # Update both structures efficiently
-        self._update_page_structures(page, final_cells)
+        assert page.parsed_page is not None

-    def _get_existing_cells(self, page):
-        """Get existing cells, preferring parsed_page when available."""
-        return page.parsed_page.textline_cells if page.parsed_page else page.cells
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = bool(final_cells)

    def _combine_cells(self, existing_cells, ocr_cells):
        """Combine existing and OCR cells with filtering and re-indexing."""
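post_process_cells now reads the existing programmatic cells through the read-only page.cells view, merges in the OCR cells, and writes the result straight back into parsed_page. A schematic version of that flow follows; the real overlap filtering lives in _combine_cells, which is unchanged and not shown in this hunk, so a naive stand-in is used here:

from typing import List


def combine_cells_stub(existing_cells: List, ocr_cells: List) -> List:
    """Stand-in for BaseOcrModel._combine_cells: naive concatenation for illustration."""
    return list(existing_cells) + list(ocr_cells)


def post_process_cells_sketch(ocr_cells: List, page) -> None:
    """Schematic mirror of the new BaseOcrModel.post_process_cells flow."""
    existing_cells = page.cells                          # read-only view over parsed_page
    final_cells = combine_cells_stub(existing_cells, ocr_cells)

    assert page.parsed_page is not None                  # parsed pages are now mandatory
    page.parsed_page.textline_cells = final_cells
    page.parsed_page.has_lines = bool(final_cells)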
@@ -162,18 +161,6 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):

        return combined

-    def _update_page_structures(self, page, final_cells):
-        """Update both page structures efficiently."""
-        if page.parsed_page:
-            # Update parsed_page as primary source
-            page.parsed_page.textline_cells = final_cells
-            page.parsed_page.has_lines = bool(final_cells)
-            # Sync to page.cells for compatibility
-            page.cells = final_cells
-        else:
-            # Legacy fallback: only page.cells available
-            page.cells = final_cells
-
    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
        scale_x = image.width / page.size.width
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                    )
                )

-                # page.cells is already updated by LayoutPostprocessor
                page.predictions.layout = LayoutPrediction(
                    clusters=processed_clusters
                )
@@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional

import numpy as np
from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder

class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
-    create_parsed_page: bool


class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
-
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
@@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page.cells)

                # DEBUG code:
                if settings.debug.visualize_ocr:
@@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR
@@ -196,8 +196,7 @@ class LayoutPostprocessor:

    def __init__(self, page, clusters: List[Cluster]):
        """Initialize processor with page and clusters."""
-        # Get cells from best available source (prefer parsed_page)
-        self.cells = self._get_page_cells(page)
+        self.cells = page.cells
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
@@ -215,24 +214,6 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

-    def _get_page_cells(self, page):
-        """Get cells from best available source (prefer parsed_page)."""
-        return (
-            page.parsed_page.textline_cells
-            if page.parsed_page is not None
-            else page.cells
-        )
-
-    def _update_page_structures(self, final_cells):
-        """Update both page structures efficiently."""
-        if self.page.parsed_page is not None:
-            # Update parsed_page as primary source
-            self.page.parsed_page.textline_cells = final_cells
-            self.page.parsed_page.has_lines = len(final_cells) > 0
-
-        # Legacy fallback: only page.cells available
-        self.page.cells = final_cells
-
    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
@@ -259,8 +240,9 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

-        # Update page structures with processed cells
-        self._update_page_structures(self.cells)
+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0

        return final_clusters, self.cells
Changed ground-truth test data (file diffs suppressed because they are too large):

101474  tests/data/groundtruth/docling_v1/2203.01017v2.pages.json (vendored)
 89985  tests/data/groundtruth/docling_v1/2206.01062.pages.json (vendored)
 56232  tests/data/groundtruth/docling_v1/2305.03393v1.pages.json (vendored)
  9633  tests/data/groundtruth/docling_v1/multi_page.pages.json (vendored)
101474  tests/data/groundtruth/docling_v2/2203.01017v2.pages.json (vendored)
 89985  tests/data/groundtruth/docling_v2/2206.01062.pages.json (vendored)
 56232  tests/data/groundtruth/docling_v2/2305.03393v1.pages.json (vendored)
  9633  tests/data/groundtruth/docling_v2/multi_page.pages.json (vendored)

Several further ground-truth file diffs are likewise suppressed because they are too large; the smaller ground-truth updates are shown below.
@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            69.6796630536824,
-            689.0124221922704,
-            504.8720051760782,
-            764.9216921155637
+            72.33333333333333,
+            691.5883585611979,
+            503.3333333333333,
+            763.9216918945312
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,4 +1,4 @@
<document>
-<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
+<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            441.2561096985719,
-            131.89488404865142,
-            522.0347860494834,
-            151.87873262042876
+            444.6666666666667,
+            131.58835856119788,
+            521.6666666666666,
+            150.25502522786462
          ],
          "page": 1,
          "span": [
@@ -67,10 +67,10 @@
      "prov": [
        {
          "bbox": [
-            89.23887497045128,
-            77.02339852098021,
-            523.208764293368,
-            124.75312428291147
+            92.0,
+            77.92169189453125,
+            523.0,
+            123.25502522786462
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
<document>
-<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
+<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            690.2441821046808,
-            442.39487414368364,
-            709.8255852011977,
-            523.076601235155
+            691.6666666666666,
+            444.53450520833337,
+            710.3333333333334,
+            521.5345052083334
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
<document>
-<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
+<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            131.21306574279092,
-            74.12495603322407,
-            152.19606490864376,
-            154.19400205373182
+            131.66666666666666,
+            73.53450520833337,
+            150.33333333333334,
+            150.53450520833331
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,2 +1,2 @@
-<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
+<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

@@ -42,10 +42,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 69.6796630536824,
-        "t": 764.9216921155637,
-        "r": 504.8720051760782,
-        "b": 689.0124221922704,
+        "l": 72.33333333333333,
+        "t": 763.9216918945312,
+        "r": 503.3333333333333,
+        "b": 691.5883585611979,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
-<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
+<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
+<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 441.2561096985719,
-        "t": 151.87873262042876,
-        "r": 522.0347860494834,
-        "b": 131.89488404865142,
+        "l": 444.6666666666667,
+        "t": 150.25502522786462,
+        "r": 521.6666666666666,
+        "b": 131.58835856119788,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 89.23887497045128,
-        "t": 124.75312428291147,
-        "r": 523.208764293368,
-        "b": 77.02339852098021,
+        "l": 92.0,
+        "t": 123.25502522786462,
+        "r": 523.0,
+        "b": 77.92169189453125,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_410><loc_61><loc_422><loc_128>package</text>
+<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<text><loc_411><loc_62><loc_422><loc_127>package</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 717.168585936602,
-        "t": 524.2990550512769,
-        "r": 764.8982839673505,
-        "b": 90.3291657283603,
+        "l": 718.6666666666666,
+        "t": 522.8678385416666,
+        "r": 764.0,
+        "b": 91.86783854166669,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 690.2441821046808,
-        "t": 523.076601235155,
-        "r": 709.8255852011977,
-        "b": 442.39487414368364,
+        "l": 691.6666666666666,
+        "t": 521.5345052083334,
+        "r": 710.3333333333334,
+        "b": 444.53450520833337,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_78><loc_370><loc_90><loc_438>package</text>
+<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<text><loc_78><loc_374><loc_89><loc_438>package</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 77.10171545548258,
-        "t": 506.0744964609271,
-        "r": 126.08064862014129,
-        "b": 71.87755635676046,
+        "l": 78.0,
+        "t": 503.201171875,
+        "r": 123.33333333333333,
+        "b": 72.201171875,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 131.21306574279092,
-        "t": 154.19400205373182,
-        "r": 152.19606490864376,
-        "b": 74.12495603322407,
+        "l": 131.66666666666666,
+        "t": 150.53450520833331,
+        "r": 150.33333333333334,
+        "b": 73.53450520833337,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.