From 1963e7145bb1d7751c0b10c846ce7dd1c17303a0 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos
Date: Sun, 10 Nov 2024 16:09:53 +0100
Subject: [PATCH] chore(examples): Add example how to force OCR

Signed-off-by: Nikos Livathinos
---
 docs/examples/full_page_ocr.py | 43 ++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 docs/examples/full_page_ocr.py

diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
new file mode 100644
index 00000000..0a61507e
--- /dev/null
+++ b/docs/examples/full_page_ocr.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    """Convert a sample PDF with forced full-page OCR and print the Markdown."""
+    pdf_path = Path("./tests/data/2206.01062.pdf")
+
+    # PDF pipeline: OCR on, table structure on, with cell matching enabled.
+    pdf_options = PdfPipelineOptions()
+    pdf_options.do_ocr = True
+    pdf_options.do_table_structure = True
+    pdf_options.table_structure_options.do_cell_matching = True
+
+    # force_full_page_ocr=True is set on the OCR options object. Any of
+    # EasyOcrOptions, TesseractOcrOptions or TesseractCliOcrOptions can be used:
+    #   EasyOcrOptions(force_full_page_ocr=True)
+    #   TesseractOcrOptions(force_full_page_ocr=True)
+    pdf_options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
+
+    # Register these options for PDF inputs, with DoclingParseDocumentBackend.
+    pdf_format = PdfFormatOption(
+        pipeline_options=pdf_options,
+        backend=DoclingParseDocumentBackend,
+    )
+    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
+
+    # Convert, then emit the resulting document as Markdown on stdout.
+    conversion = converter.convert(pdf_path)
+    print(conversion.document.export_to_markdown())
+
+
+if __name__ == "__main__":
+    main()