word-level pdf cells for tables

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-07-30 14:04:27 +00:00 · 2025-03-25 13:36:58 +01:00 · 2025-03-25 13:36:58 +01:00 · 787c6d8ace
commit 787c6d8ace
parent 82694b2136
1 changed files with 29 additions and 4 deletions
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
-from docling_core.types.doc.page import BoundingRectangle
+from docling_core.types.doc.page import (
    BoundingRectangle,
    SegmentedPdfPage,
    TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
@ -218,9 +222,30 @@ class TableStructureModel(BasePageModel):
                    if len(table_bboxes):
                        for table_cluster, tbl_box in in_tables:
-
+                            # Check if word-level cells are available from backend:
                            sp = page._backend.get_segmented_page()
                            if sp is not None:
                                tcells = sp.get_cells_in_bbox(
                                    cell_unit=TextCellUnit.WORD,
                                    bbox=table_cluster.bbox.to_bottom_left_origin(
                                        page.size.height
                                    ),
                                )
                                # Transform origin of returned cells:
                                tcells2 = []
                                for tcell in tcells:
                                    # Deep-copy each cell so the origin transform below does not mutate cells shared by two (or more) table regions
                                    tcell = tcell.model_copy(deep=True)
                                    tcell.rect = tcell.rect.to_top_left_origin(
                                        page.size.height
                                    )
                                    tcells2.append(tcell)
                                tcells = tcells2
                            else:
                                print("Otherwise - we use normal (line/phrase) cells")
                                tcells = table_cluster.cells
                            tokens = []
-                            for c in table_cluster.cells:
+                            for c in tcells:
                                # Only allow non-empty strings (not just whitespace) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
@ -229,7 +254,6 @@ class TableStructureModel(BasePageModel):
                                            scale=self.scale
                                        )
                                    )
                                    tokens.append(
                                        {
                                            "id": new_cell.index,
@ -244,6 +268,7 @@ class TableStructureModel(BasePageModel):
                            )
                            table_out = tf_output[0]
                            table_cells = []
                            # print("len(tf_responses)={}".format(len(table_out["tf_responses"])))
                            for element in table_out["tf_responses"]:
                                if not self.do_cell_matching: