From 6ea6f2951697e00363b4921023e537b844159ed0 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Thu, 19 Sep 2024 16:59:16 +0200 Subject: [PATCH] Update docs Signed-off-by: Christoph Auer --- README.md | 16 ++++++++++++++++ examples/custom_convert.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0cb0c95c..fa20456d 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,8 @@ This can improve output quality if you find that multiple columns in extracted t ```python +from docling.datamodel.pipeline_options import PipelineOptions + pipeline_options = PipelineOptions(do_table_structure=True) pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model @@ -119,6 +121,20 @@ doc_converter = DocumentConverter( ) ``` +Since docling 1.14.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (default) or `TableFormerMode.ACCURATE` (slower, but produces better results on difficult table structures). 
+ +```python +from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode + +pipeline_options = PipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model + +doc_converter = DocumentConverter( + artifacts_path=artifacts_path, + pipeline_options=pipeline_options, +) +``` + ### Impose limits on the document size You can limit the file size and number of pages which should be allowed to process per document: diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 2c0fac7a..b561d9d2 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -83,9 +83,9 @@ def main(): # PyPdfium with OCR # ----------------- # pipeline_options = PipelineOptions() - # pipeline_options.do_ocr=False + # pipeline_options.do_ocr=True # pipeline_options.do_table_structure=True - # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.table_structure_options.do_cell_matching = False # doc_converter = DocumentConverter( # pipeline_options=pipeline_options,