mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix: Correct text extraction for table cells (#21)
* - Fixes for the scaling transformation of table cell bounding boxes when using do_cell_matching = False
  - Corrected examples/convert.py with the appropriate parameter, for a good-quality example conversion

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
@@ -53,7 +53,13 @@ def main():
|
||||
|
||||
artifacts_path = DocumentConverter.download_models_hf()
|
||||
|
||||
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
# use text cells predicted from table structure model, instead of matching with pdf cells
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
)
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user