From 1963e7145bb1d7751c0b10c846ce7dd1c17303a0 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <nli@zurich.ibm.com>
Date: Sun, 10 Nov 2024 16:09:53 +0100
Subject: [PATCH] chore(examples): Add example of how to force full-page OCR

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---
 docs/examples/full_page_ocr.py | 43 ++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 docs/examples/full_page_ocr.py

diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
new file mode 100644
index 00000000..0a61507e
--- /dev/null
+++ b/docs/examples/full_page_ocr.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    """Convert a sample PDF with full-page OCR forced and print the Markdown."""
+    input_doc = Path("./tests/data/2206.01062.pdf")  # relative to the CWD
+
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    # To use another OCR engine, swap in EasyOcrOptions(force_full_page_ocr=True)
+    # or TesseractOcrOptions(force_full_page_ocr=True) for the line below.
+    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
+    pipeline_options.ocr_options = ocr_options
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=DoclingParseDocumentBackend,
+            )
+        }
+    )
+
+    doc = converter.convert(input_doc).document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+    main()