Experimental, WIP: add optional word-level cells to page preprocessing and the table structure model

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-07-30 22:14:37 +00:00 · 2024-11-13 09:45:23 +01:00 · 2024-11-13 09:45:23 +01:00 · e4383cee56
commit e4383cee56
parent fb8ba861e2
5 changed files with 45 additions and 19 deletions
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -2,7 +2,7 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):

        return text_piece

-    def get_text_cells(self) -> Iterable[Cell]:
+    def make_cells(self, page_size, cells_data, cells_header):
        cells: List[Cell] = []
        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
        for i, cell_data in enumerate(cells_data):
            x0 = cell_data[cells_header.index("x0")]
            y0 = cell_data[cells_header.index("y0")]
@@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                )
            )
            cell_counter += 1
+        return cells
+
+    def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
+        cells: List[Cell] = []
+        word_cells: List[Cell] = []
+
+        if not self.valid:
+            return cells, word_cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+
+        word_cells_data = self._dpage["original"]["cells"]["data"]
+        word_cells_header = self._dpage["original"]["cells"]["header"]
+
+        cells = self.make_cells(page_size, cells_data, cells_header)
+        word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)

        def draw_clusters_and_cells():
            image = (
@@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):

        # draw_clusters_and_cells()

-        return cells
+        return cells, word_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 32 * 32
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Iterable, Optional, Set, Tuple, Union

 from docling_core.types.doc import BoundingBox, Size
 from PIL import Image
@@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
        pass

    @abstractmethod
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
        pass

    @abstractmethod
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -180,6 +180,7 @@ class Page(BaseModel):
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[Cell] = []
+    word_cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
+        # page.cells = list(page._backend.get_text_cells())
+        cells_result = page._backend.get_text_cells()
+        if type(cells_result) is not tuple:
+            # Backend supports just PDF cells
+            page.cells = list(cells_result)
+        else:
+            # Backend also supports word cells
+            pdf_cells, word_cells = cells_result
+            page.cells = list(pdf_cells)
+            # preserve word_cells for usage in tables
+            page.word_cells = list(word_cells)

        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
                        continue

                    tokens = []
-                    for c in page.cells:
+
+                    page_cells = page.cells
+                    if hasattr(page, "word_cells"):
+                        page_cells = page.word_cells
+
+                    for c in page_cells:
                        for cluster, _ in in_tables:
                            if c.bbox.area() > 0:
                                if (