diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index db7fa259..d7f8b6a5 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell -from docling_core.types.doc.page import BoundingRectangle +from docling_core.types.doc.page import ( + BoundingRectangle, + SegmentedPdfPage, + TextCellUnit, +) from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw @@ -218,9 +222,30 @@ class TableStructureModel(BasePageModel): if len(table_bboxes): for table_cluster, tbl_box in in_tables: - + # Check if word-level cells are available from backend: + sp = page._backend.get_segmented_page() + if sp is not None: + tcells = sp.get_cells_in_bbox( + cell_unit=TextCellUnit.WORD, + bbox=table_cluster.bbox.to_bottom_left_origin( + page.size.height + ), + ) + # Transform origin of returned cells: + tcells2 = [] + for tcell in tcells: + # Do the copy to not affect cells that are in two (or more) table regions + tcell = tcell.model_copy(deep=True) + tcell.rect = tcell.rect.to_top_left_origin( + page.size.height + ) + tcells2.append(tcell) + tcells = tcells2 + else: + print("Otherwise - we use normal (line/phrase) cells") + tcells = table_cluster.cells tokens = [] - for c in table_cluster.cells: + for c in tcells: # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: new_cell = copy.deepcopy(c) @@ -229,7 +254,6 @@ class TableStructureModel(BasePageModel): scale=self.scale ) ) - tokens.append( { "id": new_cell.index, @@ -244,6 +268,7 @@ class TableStructureModel(BasePageModel): ) table_out = tf_output[0] table_cells = [] + # print("len(tf_responses)={}".format(len(table_out["tf_responses"]))) for element in table_out["tf_responses"]: if not self.do_cell_matching: