feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces full-page OCR scanning (#290)

- When OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr command-line parameter in the docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2024-11-12 09:46:14 +01:00
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

-GENERATE = False
-
-
-# Debug
-def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
-    r""" """
-    import json
-    import os
-
-    parent = pdf_path.parent
-    eng = "" if engine is None else f".{engine}"
-
-    dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
-    with open(dict_fn, "w") as fd:
-        json.dump(doc_result.legacy_document.export_to_dict(), fd)
-
-    pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
-    pages = [p.model_dump() for p in doc_result.pages]
-    with open(pages_fn, "w") as fd:
-        json.dump(pages, fd)
-
-    doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
-    with open(doctags_fn, "w") as fd:
-        fd.write(doc_result.legacy_document.export_to_doctags())
-
-    md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
-    with open(md_fn, "w") as fd:
-        fd.write(doc_result.legacy_document.export_to_markdown())
+GENERATE_V1 = False
+GENERATE_V2 = False


 def get_pdf_paths():
@@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):


 def test_e2e_conversions():
-
    pdf_paths = get_pdf_paths()

    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
+        EasyOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True),
+        TesseractCliOcrOptions(force_full_page_ocr=True),
    ]

    for ocr_options in engines:
@@ -91,20 +67,16 @@ def test_e2e_conversions():

            doc_result: ConversionResult = converter.convert(pdf_path)

-            # Save conversions
-            # save_output(pdf_path, doc_result, None)
-
-            # Debug
            verify_conversion_result_v1(
                input_path=pdf_path,
                doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V1,
                fuzzy=True,
            )

            verify_conversion_result_v2(
                input_path=pdf_path,
                doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V2,
                fuzzy=True,
            )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
+        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

+        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))

+        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)

+        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
+        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

+        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))

+        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)

+        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test