feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Make page.parsed_page the only source of truth for text cells Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Small fix Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Correctly compute PDF boxes from pymupdf Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use different OCR engine order Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add type hints and fix mypy Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * One more test fix Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove with pypdfium2_lock from caller sites Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix typing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-12 06:38:10 +00:00 · 2025-06-13 19:01:55 +02:00
parent 0432a31b2f
commit 7d3302cb48
50 changed files with 339091 additions and 330047 deletions
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
 from docling_core.types.doc.page import TextCell
 from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cluster
+from docling.datamodel.base_models import BoundingBox, Cluster, Page

 _log = logging.getLogger(__name__)

@@ -194,11 +194,11 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
-        """Initialize processor with cells and clusters."""
-        """Initialize processor with cells and spatial indices."""
-        self.cells = cells
-        self.page_size = page_size
+    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+        """Initialize processor with page and clusters."""
+        self.cells = page.cells
+        self.page = page
+        self.page_size = page.size
        self.all_clusters = clusters
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0
+
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
+        assert self.page_size is not None
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures