From cfdfda362980cd46d248496c06323520e433c4b6 Mon Sep 17 00:00:00 2001 From: Maxim Lysak Date: Tue, 30 Jul 2024 13:43:50 +0200 Subject: [PATCH] - Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False - Corrected examples/convert.py with appropriate parameter, for good quality example conversion Signed-off-by: Maxim Lysak --- docling/models/table_structure_model.py | 5 +++-- examples/convert.py | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 132b141c..6b18eb04 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -114,12 +114,13 @@ class TableStructureModel: for element in table_out["tf_responses"]: if not self.do_cell_matching: - the_bbox = BoundingBox.model_validate(element["bbox"]) + the_bbox = BoundingBox.model_validate(element["bbox"]).scaled(1 / self.scale) text_piece = page._backend.get_text_in_rect(the_bbox) element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) - tc.bbox = tc.bbox.scaled(1 / self.scale) + if self.do_cell_matching: + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: diff --git a/examples/convert.py b/examples/convert.py index 26a38c51..5380c412 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -53,7 +53,14 @@ def main(): artifacts_path = DocumentConverter.download_models_hf() - doc_converter = DocumentConverter(artifacts_path=artifacts_path) + pipeline_options = PipelineOptions(do_table_structure=True) + # use text cells predicted from table structure model, instead of matching with pdf cells + pipeline_options.table_structure_options.do_cell_matching = False + + doc_converter = DocumentConverter( + artifacts_path=artifacts_path, + pipeline_options=pipeline_options + ) input = DocumentConversionInput.from_paths(input_doc_paths)