Merge from main, update OCR model and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer
2024-10-09 16:04:19 +02:00
20 changed files with 814 additions and 129 deletions


@@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
</document>


@@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}


@@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,104 @@
from pathlib import Path
from typing import List

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PdfPipelineOptions,
    PipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from .verify_utils import verify_conversion_result

GENERATE = True


# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    r""" """
    import json
    import os

    parent = pdf_path.parent
    eng = "" if engine is None else f".{engine}"

    dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
    with open(dict_fn, "w") as fd:
        json.dump(doc_result.render_as_dict(), fd)

    pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
    pages = [p.model_dump() for p in doc_result.pages]
    with open(pages_fn, "w") as fd:
        json.dump(pages, fd)

    doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
    with open(doctags_fn, "w") as fd:
        fd.write(doc_result.render_as_doctags())

    md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
    with open(md_fn, "w") as fd:
        fd.write(doc_result.render_as_markdown())


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data_scanned")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter(ocr_options: OcrOptions):
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=DoclingParseDocumentBackend,
            )
        }
    )
    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()

    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]

    for ocr_options in engines:
        print(f"Converting with ocr_engine: {ocr_options.kind}")
        converter = get_converter(ocr_options=ocr_options)
        for pdf_path in pdf_paths:
            print(f"converting {pdf_path}")
            doc_result: ConversionResult = converter.convert_single(pdf_path)

            # Save conversions
            # save_output(pdf_path, doc_result, None)

            # Debug
            verify_conversion_result(
                input_path=pdf_path,
                doc_result=doc_result,
                generate=GENERATE,
                skip_cells=True,
            )
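For reference, the same pipeline configuration exercised by this test can also be used directly outside pytest. The sketch below reuses only classes and calls that already appear in the test above; the input path is illustrative, and EasyOCR is picked as one of the three engines:

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Same options as get_converter() above, with EasyOCR as the engine.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = EasyOcrOptions()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseDocumentBackend,
        )
    }
)

# Path is illustrative; it matches the scanned test data added in this commit.
result = converter.convert_single(Path("tests/data_scanned/ocr_test.pdf"))
print(result.render_as_markdown())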


@@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
 def verify_conversion_result(
-    input_path: Path, doc_result: ConversionResult, generate=False
+    input_path: Path,
+    doc_result: ConversionResult,
+    generate: bool = False,
+    ocr_engine: str = None,
+    skip_cells: bool = False,
 ):

     PageList = TypeAdapter(List[Page])
@@ -143,10 +147,11 @@ def verify_conversion_result(
     doc_pred_md = doc_result.render_as_markdown()
     doc_pred_dt = doc_result.render_as_doctags()

-    pages_path = input_path.with_suffix(".pages.json")
-    json_path = input_path.with_suffix(".json")
-    md_path = input_path.with_suffix(".md")
-    dt_path = input_path.with_suffix(".doctags.txt")
+    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
+    pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
+    json_path = input_path.with_suffix(f"{engine_suffix}.json")
+    md_path = input_path.with_suffix(f"{engine_suffix}.md")
+    dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")

     if generate:  # only used when re-generating truth
         with open(pages_path, "w") as fw:
@@ -173,9 +178,10 @@ def verify_conversion_result(
     with open(dt_path, "r") as fr:
         doc_true_dt = fr.read()

-    assert verify_cells(
-        doc_pred_pages, doc_true_pages
-    ), f"Mismatch in PDF cell prediction for {input_path}"
+    if not skip_cells:
+        assert verify_cells(
+            doc_pred_pages, doc_true_pages
+        ), f"Mismatch in PDF cell prediction for {input_path}"

     # assert verify_output(
     #     doc_pred, doc_true
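With the extended signature, per-engine ground truth and OCR-only checks become possible: ocr_engine selects suffixed truth files (for ocr_test.pdf and an engine string "easyocr", the JSON truth would resolve to ocr_test.easyocr.json), and skip_cells=True bypasses the PDF-cell comparison, which is not meaningful for scanned pages. A minimal usage sketch in the test-package context of this commit; the helper name and the engine string are illustrative, not part of the changed code:

from pathlib import Path

from docling.datamodel.document import ConversionResult

from .verify_utils import verify_conversion_result


def verify_with_engine_truth(
    pdf_path: Path, doc_result: ConversionResult, engine_kind: str
):
    # engine_kind would typically be ocr_options.kind from the e2e test above
    # (e.g. "easyocr"); truth files are then looked up with that suffix.
    verify_conversion_result(
        input_path=pdf_path,
        doc_result=doc_result,
        generate=False,
        ocr_engine=engine_kind,
        skip_cells=True,
    )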