reorder sections in custom_convert

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-08 09:53:52 +02:00
parent 8ec8c38de8
commit 471daee277

View File

@ -72,7 +72,7 @@ def main():
# and PDF Backends for various configurations. # and PDF Backends for various configurations.
# Uncomment one section at a time to see the differences in the output. # Uncomment one section at a time to see the differences in the output.
# PyPdfium without OCR # PyPdfium without EasyOCR
# -------------------- # --------------------
# pipeline_options = PipelineOptions() # pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False # pipeline_options.do_ocr=False
@ -84,7 +84,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend, # pdf_backend=PyPdfiumDocumentBackend,
# ) # )
# PyPdfium with OCR # PyPdfium with EasyOCR
# ----------------- # -----------------
# pipeline_options = PipelineOptions() # pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True # pipeline_options.do_ocr=True
@ -96,7 +96,7 @@ def main():
# pdf_backend=PyPdfiumDocumentBackend, # pdf_backend=PyPdfiumDocumentBackend,
# ) # )
# Docling Parse without OCR # Docling Parse without EasyOCR
# ------------------------- # -------------------------
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
@ -108,7 +108,7 @@ def main():
pdf_backend=DoclingParseDocumentBackend, pdf_backend=DoclingParseDocumentBackend,
) )
# Docling Parse with OCR # Docling Parse with EasyOCR
# ---------------------- # ----------------------
# pipeline_options = PipelineOptions() # pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True # pipeline_options.do_ocr=True
@ -120,7 +120,15 @@ def main():
# pdf_backend=DoclingParseDocumentBackend, # pdf_backend=DoclingParseDocumentBackend,
# ) # )
# Docling Parse with Tesseract OCR # Docling Parse with Tesseract
# ----------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesserOcrOptions()
# Docling Parse with Tesseract CLI
# ---------------------- # ----------------------
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True
@ -128,14 +136,6 @@ def main():
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions() pipeline_options.ocr_options = TesseractOcrOptions()
# Docling Parse with TesserOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesserOcrOptions()
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend, pdf_backend=DoclingParseDocumentBackend,