mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
* feat: exploring new version * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: autoformat DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 * feat: enable configurable runtime for rapidocr and handle new result better; Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: fix linter Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * chore: use new server model * chore: change default engine type to onnx * chore: tests update for new rapidocr * fix: rebase from main and fix clashes * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 5815c8f81b0e5ce400332597b6795e5a97ecf775 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 02f9db85f562e5cdfda40c52fee55cfd4030d70a I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a7bcb205faedb881f94a89b3bbd29cb31ccd54f0 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a39482a98cbcff7a825c8321134732af0c65930a I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 63e9d717fa26951566b02761f3fdfc752c31f805 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: ef12a6ec1ea2846a8a8e2e776eeaa59c2a0c4dfe Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 2222d2340387f8d9d66f3ca9d8e21a0945a44e7a I, Georg Heiler 
<georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: bc6a1dc507d7f146ec4797a2d3840414f46ac64d I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 56e0d67da7c57d4b5caf8eaef8dff7056c3efd32 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 871ca21271412006c76acf3c19426140efed3d50 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 7b1b77159da729d483a581a86c7309acba1712a7 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: a792a714a43e19a91b2b782f54621c1c5efda632 Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: d1fed26323ff829b716bc667fe69532839363e45 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 346ec1cad943765f886e5d17fb0a54221124689c I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 4d0bbe5bd6e9f7261b97362ff8823af244267089 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 34a5ad53892a7064a6bf35f890d344d464c78b2f I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 9151959db3ad53535011d1cfdcf9181fdf936bb1 I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit: 8ef5536f2c098826c6c0a05190f8a80614c3f3cb Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * DCO Remediation Commit for Georg Heiler <georg.kf.heiler@gmail.com> I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:7e18637a35I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:63fb8ff599I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:0cb9444fb8I, Georg Heiler 
<georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:38940d9978I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:b6d461ac42I, Georg Heiler <georg.kf.heiler@gmail.com>, hereby add my Signed-off-by to this commit:ee55eb3408Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> --------- Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
import sys
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import ConversionResult, DoclingDocument
|
|
from docling.datamodel.pipeline_options import (
|
|
EasyOcrOptions,
|
|
OcrMacOptions,
|
|
OcrOptions,
|
|
RapidOcrOptions,
|
|
TesseractCliOcrOptions,
|
|
TesseractOcrOptions,
|
|
)
|
|
from docling.document_converter import DocumentConverter, ImageFormatOption
|
|
from tests.verify_utils import verify_conversion_result_v2
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
|
|
GENERATE = GEN_TEST_DATA
|
|
|
|
|
|
def get_webp_paths():
    """Return every ``*.webp`` file below ``./tests/data/webp/`` in sorted order.

    The directory is resolved relative to the current working directory and
    is searched recursively, so files in nested sub-directories are included.
    """
    search_root = Path("./tests/data/webp/")

    # Materialise the recursive glob, then sort in place for a stable order.
    matches = list(search_root.rglob("*.webp"))
    matches.sort()
    return matches
|
|
|
|
|
|
def get_converter(ocr_options: OcrOptions):
    """Create a ``DocumentConverter`` for image input using ``ocr_options``.

    A default ``ImageFormatOption`` is created and its pipeline options are
    pointed at the supplied OCR options. The returned converter is configured
    with ``InputFormat.IMAGE`` as its only allowed format.
    """
    image_option = ImageFormatOption()
    image_option.pipeline_options.ocr_options = ocr_options

    return DocumentConverter(
        allowed_formats=[InputFormat.IMAGE],
        format_options={InputFormat.IMAGE: image_option},
    )
|
|
|
|
|
|
def test_e2e_webp_conversions():
    """Convert every WEBP test image with each available OCR engine.

    For each OCR configuration a fresh converter is built, every path from
    ``get_webp_paths()`` is converted, and the result is handed to
    ``verify_conversion_result_v2`` with ``generate=GENERATE`` and
    ``fuzzy=True``.
    """
    webp_paths = get_webp_paths()

    # Default configurations first, then the full-page-OCR variants.
    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]
    engines += [
        EasyOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
        TesseractCliOcrOptions(force_full_page_ocr=True),
        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
    ]

    # rapidocr is only available for Python >=3.6,<3.14
    if sys.version_info < (3, 14):
        engines.extend(
            [
                RapidOcrOptions(),
                RapidOcrOptions(force_full_page_ocr=True),
            ]
        )

    # only works on mac
    if sys.platform == "darwin":
        engines.extend(
            [
                OcrMacOptions(),
                OcrMacOptions(force_full_page_ocr=True),
            ]
        )

    for ocr_options in engines:
        banner = f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        print(banner)

        # One converter per engine configuration, reused for all images.
        converter = get_converter(ocr_options=ocr_options)
        for webp_path in webp_paths:
            print(f"converting {webp_path}")

            doc_result: ConversionResult = converter.convert(webp_path)

            verify_conversion_result_v2(
                input_path=webp_path,
                doc_result=doc_result,
                generate=GENERATE,
                fuzzy=True,
            )
|