fix: Correct text extraction for table cells (#21)

* - Fixes for the scaling transformation of table cell bounding boxes when using do_cell_matching = False
  - Corrected examples/convert.py with the appropriate parameter, for a good-quality example conversion

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-07-30 14:51:47 +02:00
parent b07c4a7a4a
commit f4bf3d25b9
2 changed files with 12 additions and 3 deletions
--- a/examples/convert.py
+++ b/examples/convert.py
@@ -53,7 +53,13 @@ def main():

    artifacts_path = DocumentConverter.download_models_hf()

-    doc_converter = DocumentConverter(artifacts_path=artifacts_path)
+    pipeline_options = PipelineOptions(do_table_structure=True)
+    # use text cells predicted from table structure model, instead of matching with pdf cells
+    pipeline_options.table_structure_options.do_cell_matching = False
+
+    doc_converter = DocumentConverter(
+        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+    )

    input = DocumentConversionInput.from_paths(input_doc_paths)