feat: add options for choosing OCR engines (#118)

--------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-10-08 19:07:08 +02:00
parent d412c363d7
commit f96ea86a00
20 changed files with 699 additions and 32 deletions
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -8,6 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import (
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -71,7 +75,7 @@ def main():
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

-    # PyPdfium without OCR
+    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=False
@@ -83,7 +87,7 @@ def main():
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

-    # PyPdfium with OCR
+    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=True
@@ -95,7 +99,7 @@ def main():
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

-    # Docling Parse without OCR
+    # Docling Parse without EasyOCR
    # -------------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
@@ -107,7 +111,7 @@ def main():
        pdf_backend=DoclingParseDocumentBackend,
    )

-    # Docling Parse with OCR
+    # Docling Parse with EasyOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=True
@@ -119,6 +123,32 @@ def main():
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

+    # Docling Parse with Tesseract
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = TesseractOcrOptions()
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
+    # Docling Parse with Tesseract CLI
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = TesseractCliOcrOptions()
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
    ###########################################################################

    # Define input files