mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
Merge from main, update OCR model and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||
</document>
|
||||
1
tests/data_scanned/ocr_test.json
Normal file
1
tests/data_scanned/ocr_test.json
Normal file
@@ -0,0 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
1
tests/data_scanned/ocr_test.md
Normal file
1
tests/data_scanned/ocr_test.md
Normal file
@@ -0,0 +1 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||
1
tests/data_scanned/ocr_test.pages.json
Normal file
1
tests/data_scanned/ocr_test.pages.json
Normal file
File diff suppressed because one or more lines are too long
BIN
tests/data_scanned/ocr_test.pdf
Normal file
BIN
tests/data_scanned/ocr_test.pdf
Normal file
Binary file not shown.
104
tests/test_e2e_ocr_conversion.py
Normal file
104
tests/test_e2e_ocr_conversion.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
# Keep this False so the test actually compares fresh conversions against the
# committed reference files. When True, verify_conversion_result() writes the
# reference files instead of asserting against them, so the test can never
# fail and silently rewrites the fixtures. Flip to True only temporarily to
# re-generate the ground truth.
GENERATE = False
|
||||
|
||||
|
||||
# Debug
|
||||
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    """Dump every rendering of ``doc_result`` next to ``pdf_path`` (debug aid).

    Four files are written into the PDF's parent directory, all named
    ``<pdf stem><engine suffix><extension>``:

    * ``.json``        -- ``doc_result.render_as_dict()``
    * ``.pages.json``  -- ``model_dump()`` of every entry in ``doc_result.pages``
    * ``.doctags.txt`` -- ``doc_result.render_as_doctags()``
    * ``.md``          -- ``doc_result.render_as_markdown()``

    The engine suffix is ``.<engine>``, or empty when ``engine`` is ``None``.
    """
    import json

    out_dir = pdf_path.parent

    # Optional ".<engine>" infix so outputs of different engines can coexist.
    if engine is None:
        infix = ""
    else:
        infix = f".{engine}"
    base_name = f"{pdf_path.stem}{infix}"

    # Document-level dictionary export.
    with open(out_dir / f"{base_name}.json", "w") as out:
        json.dump(doc_result.render_as_dict(), out)

    # Per-page dumps; serialised before the target file is opened.
    page_dumps = [page.model_dump() for page in doc_result.pages]
    with open(out_dir / f"{base_name}.pages.json", "w") as out:
        json.dump(page_dumps, out)

    # DocTags rendering.
    with open(out_dir / f"{base_name}.doctags.txt", "w") as out:
        out.write(doc_result.render_as_doctags())

    # Markdown rendering.
    with open(out_dir / f"{base_name}.md", "w") as out:
        out.write(doc_result.render_as_markdown())
|
||||
|
||||
|
||||
def get_pdf_paths():
    """Return the sorted list of all PDFs below ``./tests/data_scanned``.

    The search is recursive and the directory is resolved relative to the
    current working directory.
    """
    scanned_dir = Path("./tests/data_scanned")
    return sorted(scanned_dir.rglob("*.pdf"))
|
||||
|
||||
|
||||
def get_converter(ocr_options: OcrOptions):
    """Build a ``DocumentConverter`` for PDFs that runs OCR with ``ocr_options``.

    The PDF pipeline is configured with OCR enabled, table-structure
    recognition enabled (with cell matching), and the
    ``DoclingParseDocumentBackend`` as the PDF backend.
    """
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = True
    pdf_options.do_table_structure = True
    pdf_options.table_structure_options.do_cell_matching = True
    pdf_options.ocr_options = ocr_options

    pdf_format = PdfFormatOption(
        pipeline_options=pdf_options,
        backend=DoclingParseDocumentBackend,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
|
||||
|
||||
|
||||
def test_e2e_conversions():
    """Convert every scanned test PDF with each OCR engine and verify it.

    One converter is built per engine and applied to all PDFs returned by
    ``get_pdf_paths()``. Each result is handed to
    ``verify_conversion_result`` with cell comparison skipped.
    """
    scanned_pdfs = get_pdf_paths()

    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]

    for engine_options in engines:
        print(f"Converting with ocr_engine: {engine_options.kind}")
        converter = get_converter(ocr_options=engine_options)

        for pdf_path in scanned_pdfs:
            print(f"converting {pdf_path}")

            result: ConversionResult = converter.convert_single(pdf_path)

            # When debugging, call save_output(pdf_path, result, None) here to
            # dump the raw conversions next to the input PDF.

            verify_conversion_result(
                input_path=pdf_path,
                doc_result=result,
                generate=GENERATE,
                skip_cells=True,
            )
|
||||
@@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
|
||||
|
||||
|
||||
def verify_conversion_result(
|
||||
input_path: Path, doc_result: ConversionResult, generate=False
|
||||
input_path: Path,
|
||||
doc_result: ConversionResult,
|
||||
generate: bool = False,
|
||||
ocr_engine: str = None,
|
||||
skip_cells: bool = False,
|
||||
):
|
||||
PageList = TypeAdapter(List[Page])
|
||||
|
||||
@@ -143,10 +147,11 @@ def verify_conversion_result(
|
||||
doc_pred_md = doc_result.render_as_markdown()
|
||||
doc_pred_dt = doc_result.render_as_doctags()
|
||||
|
||||
pages_path = input_path.with_suffix(".pages.json")
|
||||
json_path = input_path.with_suffix(".json")
|
||||
md_path = input_path.with_suffix(".md")
|
||||
dt_path = input_path.with_suffix(".doctags.txt")
|
||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
|
||||
json_path = input_path.with_suffix(f"{engine_suffix}.json")
|
||||
md_path = input_path.with_suffix(f"{engine_suffix}.md")
|
||||
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
with open(pages_path, "w") as fw:
|
||||
@@ -173,9 +178,10 @@ def verify_conversion_result(
|
||||
with open(dt_path, "r") as fr:
|
||||
doc_true_dt = fr.read()
|
||||
|
||||
assert verify_cells(
|
||||
doc_pred_pages, doc_true_pages
|
||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||
if not skip_cells:
|
||||
assert verify_cells(
|
||||
doc_pred_pages, doc_true_pages
|
||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||
|
||||
# assert verify_output(
|
||||
# doc_pred, doc_true
|
||||
|
||||
Reference in New Issue
Block a user