Update docs

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-19 16:59:16 +02:00
parent d8163b0865
commit 6ea6f29516
2 changed files with 18 additions and 2 deletions

View File

@ -110,6 +110,8 @@ This can improve output quality if you find that multiple columns in extracted t
```python ```python
from docling.datamodel.pipeline_options import PipelineOptions
pipeline_options = PipelineOptions(do_table_structure=True) pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
@ -119,6 +121,20 @@ doc_converter = DocumentConverter(
) )
``` ```
Since docling 1.14.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (the default) or `TableFormerMode.ACCURATE`, which is slower but gives better results on difficult table structures.
```python
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=pipeline_options,
)
```
### Impose limits on the document size ### Impose limits on the document size
You can limit the file size and the number of pages allowed per document: You can limit the file size and the number of pages allowed per document:

View File

@ -83,9 +83,9 @@ def main():
# PyPdfium with OCR # PyPdfium with OCR
# ----------------- # -----------------
# pipeline_options = PipelineOptions() # pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False # pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True # pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.table_structure_options.do_cell_matching = False
# doc_converter = DocumentConverter( # doc_converter = DocumentConverter(
# pipeline_options=pipeline_options, # pipeline_options=pipeline_options,