test: extend the image/webp conversion test case to cover more OCR engines

Signed-off-by: Elwin <hzywong@gmail.com>
This commit is contained in:
Elwin 2025-04-23 22:04:02 +08:00
parent a243c80eb8
commit e40a6d1e4f
7 changed files with 46 additions and 25 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 35 KiB

View File

@ -0,0 +1,2 @@
<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

BIN
tests/data/webp/test.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

View File

@ -1,12 +1,16 @@
import sys
from pathlib import Path
from typing import List
from tests.verify_utils import verify_conversion_result_v2
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.pipeline_options import RapidOcrOptions, OcrOptions, EasyOcrOptions, TesseractOcrOptions, \
TesseractCliOcrOptions, OcrMacOptions
from docling.document_converter import DocumentConverter, ImageFormatOption
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
@ -20,9 +24,10 @@ def get_webp_paths():
return webp_files
def get_converter():
def get_converter(ocr_options: OcrOptions):
image_format_option = ImageFormatOption()
image_format_option.pipeline_options.ocr_options = RapidOcrOptions()
image_format_option.pipeline_options.ocr_options = ocr_options
converter = DocumentConverter(
format_options={InputFormat.IMAGE: image_format_option},
allowed_formats=[InputFormat.IMAGE],
@ -33,29 +38,40 @@ def get_converter():
def test_e2e_webp_conversions():
webp_paths = get_webp_paths()
converter = get_converter()
for webp_path in webp_paths:
print(f"converting {webp_path}")
engines: List[OcrOptions] = [
EasyOcrOptions(),
TesseractOcrOptions(),
TesseractCliOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
]
gt_path = (
webp_path.parent.parent / "groundtruth" / "docling_v2" / webp_path.name
# RapidOCR is only available for Python >=3.6,<3.13, so it is only added on interpreters below 3.13
if sys.version_info < (3, 13):
engines.append(RapidOcrOptions())
engines.append(RapidOcrOptions(force_full_page_ocr=True))
# the OcrMac engine only works on macOS (sys.platform == "darwin")
if "darwin" == sys.platform:
engines.append(OcrMacOptions())
engines.append(OcrMacOptions(force_full_page_ocr=True))
for ocr_options in engines:
print(
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
)
converter = get_converter(ocr_options=ocr_options)
for webp_path in webp_paths:
print(f"converting {webp_path}")
conv_result: ConversionResult = converter.convert(webp_path)
doc_result: ConversionResult = converter.convert(webp_path)
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
"document document"
)
verify_conversion_result_v2(
input_path=webp_path,
doc_result=doc_result,
generate=GENERATE,
fuzzy=True,
)