test: extend the image/webp conversion test case to cover more OCR engines

Signed-off-by: Elwin <hzywong@gmail.com>
2025-07-27 04:24:45 +00:00 · 2025-04-23 22:04:02 +08:00 · 2025-04-23 22:04:02 +08:00 · e40a6d1e4f
commit e40a6d1e4f
parent a243c80eb8
7 changed files with 46 additions and 25 deletions
--- a/tests/data/webp/20250422-155738.webp
+++ b/tests/data/webp/20250422-155738.webp
--- a/tests/data/webp/groundtruth/docling_v2/test.doctags.txt
+++ b/tests/data/webp/groundtruth/docling_v2/test.doctags.txt
@ -0,0 +1,2 @@
+<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
+</doctag>
--- a/tests/data/webp/groundtruth/docling_v2/test.json
+++ b/tests/data/webp/groundtruth/docling_v2/test.json
@ -0,0 +1 @@
+{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/data/webp/groundtruth/docling_v2/test.md
+++ b/tests/data/webp/groundtruth/docling_v2/test.md
@ -0,0 +1 @@
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
--- a/tests/data/webp/groundtruth/docling_v2/test.pages.json
+++ b/tests/data/webp/groundtruth/docling_v2/test.pages.json
--- a/tests/data/webp/test.webp
+++ b/tests/data/webp/test.webp
--- a/tests/test_backend_webp.py
+++ b/tests/test_backend_webp.py
@ -1,12 +1,16 @@
+import sys
 from pathlib import Path
+from typing import List
+
+from tests.verify_utils import verify_conversion_result_v2

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult, DoclingDocument
-from docling.datamodel.pipeline_options import RapidOcrOptions
+from docling.datamodel.pipeline_options import RapidOcrOptions, OcrOptions, EasyOcrOptions, TesseractOcrOptions, \
+    TesseractCliOcrOptions, OcrMacOptions
 from docling.document_converter import DocumentConverter, ImageFormatOption

 from .test_data_gen_flag import GEN_TEST_DATA
-from .verify_utils import verify_document, verify_export

 GENERATE = GEN_TEST_DATA

@ -20,9 +24,10 @@ def get_webp_paths():
    return webp_files


-def get_converter():
+def get_converter(ocr_options: OcrOptions):
    image_format_option = ImageFormatOption()
-    image_format_option.pipeline_options.ocr_options = RapidOcrOptions()
+    image_format_option.pipeline_options.ocr_options = ocr_options
+
    converter = DocumentConverter(
        format_options={InputFormat.IMAGE: image_format_option},
        allowed_formats=[InputFormat.IMAGE],
@ -33,29 +38,40 @@ def get_converter():

 def test_e2e_webp_conversions():
    webp_paths = get_webp_paths()
-    converter = get_converter()

-    for webp_path in webp_paths:
-        print(f"converting {webp_path}")
+    engines: List[OcrOptions] = [
+        EasyOcrOptions(),
+        TesseractOcrOptions(),
+        TesseractCliOcrOptions(),
+        EasyOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+        TesseractCliOcrOptions(force_full_page_ocr=True),
+        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+    ]

-        gt_path = (
-            webp_path.parent.parent / "groundtruth" / "docling_v2" / webp_path.name
+    # rapidocr is only available for Python >=3.6,<3.13
+    if sys.version_info < (3, 13):
+        engines.append(RapidOcrOptions())
+        engines.append(RapidOcrOptions(force_full_page_ocr=True))
+
+    # the OcrMac engine is only available on macOS
+    if "darwin" == sys.platform:
+        engines.append(OcrMacOptions())
+        engines.append(OcrMacOptions(force_full_page_ocr=True))
+    for ocr_options in engines:
+        print(
+            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        )
+        converter = get_converter(ocr_options=ocr_options)
+        for webp_path in webp_paths:
+            print(f"converting {webp_path}")

-        conv_result: ConversionResult = converter.convert(webp_path)
+            doc_result: ConversionResult = converter.convert(webp_path)

-        doc: DoclingDocument = conv_result.document
-
-        pred_md: str = doc.export_to_markdown()
-        assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
-
-        pred_itxt: str = doc._export_to_indented_text(
-            max_text_len=70, explicit_tables=False
-        )
-        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
-            "export to indented-text"
-        )
-
-        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
-            "document document"
-        )
+            verify_conversion_result_v2(
+                input_path=webp_path,
+                doc_result=doc_result,
+                generate=GENERATE,
+                fuzzy=True,
+            )
				`@ -0,0 +1 @@`
				{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
				`@ -0,0 +1 @@`
				`Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package`