From cdb57e0ba3945d375ab857d58de3872ad6a51b95 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos
Date: Fri, 24 Jan 2025 12:38:03 +0100
Subject: [PATCH] docs: Add example how to use "auto" language with tesseract OCR engines

Signed-off-by: Nikos Livathinos
---
 docs/examples/tesseract_lang_detection.py | 35 +++++++++++++++++++++++
 mkdocs.yml                                |  1 +
 2 files changed, 36 insertions(+)
 create mode 100644 docs/examples/tesseract_lang_detection.py

diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py
new file mode 100644
index 00000000..856d50ce
--- /dev/null
+++ b/docs/examples/tesseract_lang_detection.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    input_doc = Path("./tests/data/2206.01062.pdf")
+
+    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
+    # ocr_options = TesseractOcrOptions(lang=["auto"])
+    ocr_options = TesseractCliOcrOptions(lang=["auto"])
+
+    pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options)
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    doc = converter.convert(input_doc).document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 8f8d86d9..4356ddc7 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -75,6 +75,7 @@ nav:
       - "Table export": examples/export_tables.py
       - "Multimodal export": examples/export_multimodal.py
       - "Force full page OCR": examples/full_page_ocr.py
+      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
       - "Accelerator options": examples/run_with_accelerator.py
       - "Simple translation": examples/translate.py
     - ✂️ Chunking: