reorder sections in custom_convert

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-08 09:53:52 +02:00
parent 8ec8c38de8
commit 471daee277

View File

@ -72,7 +72,7 @@ def main():
# and PDF Backends for various configurations.
# Uncomment one section at a time to see the differences in the output.
# PyPdfium without OCR
# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
@ -84,7 +84,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend,
# )
# PyPdfium with OCR
# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
@ -96,7 +96,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend,
# )
# Docling Parse without OCR
# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
@ -108,7 +108,7 @@ def main():
pdf_backend=DoclingParseDocumentBackend,
)
# Docling Parse with OCR
# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
@ -120,7 +120,15 @@ def main():
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract OCR
# Docling Parse with Tesseract
# ----------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesserOcrOptions()
# Docling Parse with Tesseract CLI
# ----------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
@ -128,14 +136,6 @@ def main():
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions()
# Docling Parse with TesserOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesserOcrOptions()
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,