mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
word-level pdf cells for tables
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
82694b2136
commit
787c6d8ace
@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
|
|||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||||
from docling_core.types.doc.page import BoundingRectangle
|
from docling_core.types.doc.page import (
|
||||||
|
BoundingRectangle,
|
||||||
|
SegmentedPdfPage,
|
||||||
|
TextCellUnit,
|
||||||
|
)
|
||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
@ -218,9 +222,30 @@ class TableStructureModel(BasePageModel):
|
|||||||
|
|
||||||
if len(table_bboxes):
|
if len(table_bboxes):
|
||||||
for table_cluster, tbl_box in in_tables:
|
for table_cluster, tbl_box in in_tables:
|
||||||
|
# Check if word-level cells are available from backend:
|
||||||
|
sp = page._backend.get_segmented_page()
|
||||||
|
if sp is not None:
|
||||||
|
tcells = sp.get_cells_in_bbox(
|
||||||
|
cell_unit=TextCellUnit.WORD,
|
||||||
|
bbox=table_cluster.bbox.to_bottom_left_origin(
|
||||||
|
page.size.height
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# Transform origin of returned cells:
|
||||||
|
tcells2 = []
|
||||||
|
for tcell in tcells:
|
||||||
|
# Do the copy to not affect cells that are in two (or more) table regions
|
||||||
|
tcell = tcell.model_copy(deep=True)
|
||||||
|
tcell.rect = tcell.rect.to_top_left_origin(
|
||||||
|
page.size.height
|
||||||
|
)
|
||||||
|
tcells2.append(tcell)
|
||||||
|
tcells = tcells2
|
||||||
|
else:
|
||||||
|
print("Otherwise - we use normal (line/phrase) cells")
|
||||||
|
tcells = table_cluster.cells
|
||||||
tokens = []
|
tokens = []
|
||||||
for c in table_cluster.cells:
|
for c in tcells:
|
||||||
# Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
|
# Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
|
||||||
if len(c.text.strip()) > 0:
|
if len(c.text.strip()) > 0:
|
||||||
new_cell = copy.deepcopy(c)
|
new_cell = copy.deepcopy(c)
|
||||||
@ -229,7 +254,6 @@ class TableStructureModel(BasePageModel):
|
|||||||
scale=self.scale
|
scale=self.scale
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
tokens.append(
|
tokens.append(
|
||||||
{
|
{
|
||||||
"id": new_cell.index,
|
"id": new_cell.index,
|
||||||
@ -244,6 +268,7 @@ class TableStructureModel(BasePageModel):
|
|||||||
)
|
)
|
||||||
table_out = tf_output[0]
|
table_out = tf_output[0]
|
||||||
table_cells = []
|
table_cells = []
|
||||||
|
# print("len(tf_responses)={}".format(len(table_out["tf_responses"])))
|
||||||
for element in table_out["tf_responses"]:
|
for element in table_out["tf_responses"]:
|
||||||
|
|
||||||
if not self.do_cell_matching:
|
if not self.do_cell_matching:
|
||||||
|
Loading…
Reference in New Issue
Block a user