mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
fix: Correct text extraction for table cells (#21)
* - Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False
  - Corrected examples/convert.py with appropriate parameter, for good quality example conversion

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
@@ -114,12 +114,15 @@ class TableStructureModel:
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
if self.do_cell_matching:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
# Retrieving cols/rows, after post processing:
|
||||
|
||||
Reference in New Issue
Block a user