fix: Correct text extraction for table cells (#21)

* - Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False
- Corrected examples/convert.py with appropriate parameter, for good quality example conversion

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2024-07-30 14:51:47 +02:00
committed by GitHub
parent b07c4a7a4a
commit f4bf3d25b9
2 changed files with 12 additions and 3 deletions

View File

@@ -114,12 +114,15 @@ class TableStructureModel:
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(element["bbox"])
the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
tc.bbox = tc.bbox.scaled(1 / self.scale)
if self.do_cell_matching:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
# Retrieving cols/rows, after post processing: