Experimental (WIP): add optional word-level cells to page preprocessing and the table structure model

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-08-02 15:32:30 +00:00 · 2024-11-13 09:45:23 +01:00 · 2024-11-13 09:45:23 +01:00 · e4383cee56
commit e4383cee56
parent fb8ba861e2
5 changed files with 45 additions and 19 deletions
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@ -2,7 +2,7 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return text_piece
-    def get_text_cells(self) -> Iterable[Cell]:
+    def make_cells(self, page_size, cells_data, cells_header):
        cells: List[Cell] = []
        cell_counter = 0
        if not self.valid:
            return cells
        page_size = self.get_size()
        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]
        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]
        for i, cell_data in enumerate(cells_data):
            x0 = cell_data[cells_header.index("x0")]
            y0 = cell_data[cells_header.index("y0")]
@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                )
            )
            cell_counter += 1
        return cells
    def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
        cells: List[Cell] = []
        word_cells: List[Cell] = []
        if not self.valid:
            return cells, word_cells
        page_size = self.get_size()
        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]
        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]
        word_cells_data = self._dpage["original"]["cells"]["data"]
        word_cells_header = self._dpage["original"]["cells"]["header"]
        cells = self.make_cells(page_size, cells_data, cells_header)
        word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)
        def draw_clusters_and_cells():
            image = (
@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        # draw_clusters_and_cells()
-        return cells
+        return cells, word_cells
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 32 * 32
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Iterable, Optional, Set, Tuple, Union
 from docling_core.types.doc import BoundingBox, Size
 from PIL import Image
@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
        pass
    @abstractmethod
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
        pass
    @abstractmethod
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -180,6 +180,7 @@ class Page(BaseModel):
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[Cell] = []
    word_cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None
-        page.cells = list(page._backend.get_text_cells())
+        # page.cells = list(page._backend.get_text_cells())
        cells_result = page._backend.get_text_cells()
        if type(cells_result) is not tuple:
            # Backend supports just PDF cells
            page.cells = list(cells_result)
        else:
            # Backend also supports word cells
            pdf_cells, word_cells = cells_result
            page.cells = list(pdf_cells)
            # Keep the word-level cells on the page so the table structure model can use them
            page.word_cells = list(word_cells)
        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
                        continue
                    tokens = []
-                    for c in page.cells:
+
                    page_cells = page.cells
                    if hasattr(page, "word_cells"):
                        page_cells = page.word_cells
                    for c in page_cells:
                        for cluster, _ in in_tables:
                            if c.bbox.area() > 0:
                                if (