mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
- Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False
- Corrected examples/convert.py to pass the appropriate parameter, for a good-quality example conversion

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
b07c4a7a4a
commit
cfdfda3629
@@ -114,12 +114,13 @@ class TableStructureModel:
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||
the_bbox = BoundingBox.model_validate(element["bbox"]).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
if self.do_cell_matching:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
# Retrieving cols/rows, after post processing:
|
||||
|
@@ -53,7 +53,14 @@ def main():
|
||||
|
||||
artifacts_path = DocumentConverter.download_models_hf()
|
||||
|
||||
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
# use text cells predicted from table structure model, instead of matching with pdf cells
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path,
|
||||
pipeline_options=pipeline_options
|
||||
)
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user