diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 8e6fb6d8..4971abb2 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -72,7 +72,7 @@ def main(): # and PDF Backends for various configurations. # Uncomment one section at the time to see the differences in the output. - # PyPdfium without OCR + # PyPdfium without EasyOCR # -------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=False @@ -84,7 +84,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # PyPdfium with OCR + # PyPdfium with EasyOCR # ----------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -96,7 +96,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # Docling Parse without OCR + # Docling Parse without EasyOCR # ------------------------- pipeline_options = PipelineOptions() pipeline_options.do_ocr = False @@ -108,7 +108,7 @@ def main(): pdf_backend=DoclingParseDocumentBackend, ) - # Docling Parse with OCR + # Docling Parse with EasyOCR # ---------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -120,7 +120,15 @@ def main(): # pdf_backend=DoclingParseDocumentBackend, # ) - # Docling Parse with Tesseract OCR + # Docling Parse with Tesseract + # ---------------------- + pipeline_options = PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + pipeline_options.ocr_options = TesserOcrOptions() + + # Docling Parse with Tesseract CLI # ---------------------- pipeline_options = PipelineOptions() pipeline_options.do_ocr = True @@ -128,14 +136,6 @@ def main(): pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.ocr_options = TesseractOcrOptions() - # Docling Parse with TesserOCR - # ---------------------- - # pipeline_options = PipelineOptions() - # pipeline_options.do_ocr=True - # pipeline_options.do_table_structure=True - # pipeline_options.table_structure_options.do_cell_matching = True - # pipeline_options.ocr_options = TesserOcrOptions() - doc_converter = DocumentConverter( pipeline_options=pipeline_options, pdf_backend=DoclingParseDocumentBackend,