"""Example script: convert a PDF to Markdown with full-page OCR requested."""

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    """Convert the sample PDF and print its Markdown export to stdout.

    The pipeline enables OCR and table-structure recognition (with cell
    matching), and the OCR options object is created with
    ``force_full_page_ocr=True``.
    """
    # Path is relative, so it is resolved against the current working directory.
    source_pdf = Path("./tests/data/2206.01062.pdf")

    options = PdfPipelineOptions()
    options.do_ocr = True
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = True

    # Any of the imported OCR option classes can be substituted here:
    #   EasyOcrOptions(force_full_page_ocr=True)
    #   TesseractOcrOptions(force_full_page_ocr=True)
    options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

    # Register the configured pipeline and parsing backend for PDF inputs only.
    pdf_format = PdfFormatOption(
        pipeline_options=options,
        backend=DoclingParseDocumentBackend,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    result = converter.convert(source_pdf)
    markdown = result.document.export_to_markdown()
    print(markdown)


if __name__ == "__main__":
    main()