Use word-level PDF cells for table structure prediction

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-07-29 21:44:32 +00:00 · 2025-03-25 13:36:58 +01:00 · 2025-03-25 13:36:58 +01:00 · 787c6d8ace
commit 787c6d8ace
parent 82694b2136
1 changed file with 29 additions and 4 deletions
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
-from docling_core.types.doc.page import BoundingRectangle
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    SegmentedPdfPage,
+    TextCellUnit,
+)
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

@@ -218,9 +222,30 @@ class TableStructureModel(BasePageModel):

                    if len(table_bboxes):
                        for table_cluster, tbl_box in in_tables:
-
+                            # Check if word-level cells are available from backend:
+                            sp = page._backend.get_segmented_page()
+                            if sp is not None:
+                                tcells = sp.get_cells_in_bbox(
+                                    cell_unit=TextCellUnit.WORD,
+                                    bbox=table_cluster.bbox.to_bottom_left_origin(
+                                        page.size.height
+                                    ),
+                                )
+                                # Transform origin of returned cells:
+                                tcells2 = []
+                                for tcell in tcells:
+                                    # Deep-copy each cell so that cells shared by two (or more) table regions are not modified
+                                    tcell = tcell.model_copy(deep=True)
+                                    tcell.rect = tcell.rect.to_top_left_origin(
+                                        page.size.height
+                                    )
+                                    tcells2.append(tcell)
+                                tcells = tcells2
+                            else:
+                                print("Otherwise - we use normal (line/phrase) cells")
+                                tcells = table_cluster.cells
                            tokens = []
-                            for c in table_cluster.cells:
+                            for c in tcells:
                                # Only allow non empty stings (spaces) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
@@ -229,7 +254,6 @@ class TableStructureModel(BasePageModel):
                                            scale=self.scale
                                        )
                                    )
-
                                    tokens.append(
                                        {
                                            "id": new_cell.index,
@@ -244,6 +268,7 @@ class TableStructureModel(BasePageModel):
                            )
                            table_out = tf_output[0]
                            table_cells = []
+                            # print("len(tf_responses)={}".format(len(table_out["tf_responses"])))
                            for element in table_out["tf_responses"]:

                                if not self.do_cell_matching: