Merge from main, update OCR model and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer
2024-10-09 16:04:19 +02:00
20 changed files with 814 additions and 129 deletions


@@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
</document>


@@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.41791534423828, 690.8074951171875, 509.4447021484375, 767.422119140625], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}


@@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,104 @@
from pathlib import Path
from typing import List

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PdfPipelineOptions,
    PipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from .verify_utils import verify_conversion_result

GENERATE = True


# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    r""" """
    import json
    import os

    parent = pdf_path.parent
    eng = "" if engine is None else f".{engine}"

    dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
    with open(dict_fn, "w") as fd:
        json.dump(doc_result.render_as_dict(), fd)

    pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
    pages = [p.model_dump() for p in doc_result.pages]
    with open(pages_fn, "w") as fd:
        json.dump(pages, fd)

    doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
    with open(doctags_fn, "w") as fd:
        fd.write(doc_result.render_as_doctags())

    md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
    with open(md_fn, "w") as fd:
        fd.write(doc_result.render_as_markdown())


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data_scanned")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter(ocr_options: OcrOptions):
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=DoclingParseDocumentBackend,
            )
        }
    )
    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()

    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]

    for ocr_options in engines:
        print(f"Converting with ocr_engine: {ocr_options.kind}")
        converter = get_converter(ocr_options=ocr_options)
        for pdf_path in pdf_paths:
            print(f"converting {pdf_path}")
            doc_result: ConversionResult = converter.convert_single(pdf_path)

            # Save conversions
            # save_output(pdf_path, doc_result, None)

            # Debug
            verify_conversion_result(
                input_path=pdf_path,
                doc_result=doc_result,
                generate=GENERATE,
                skip_cells=True,
            )
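For reference, the same pipeline configuration exercised by this test can also be used directly outside pytest. The sketch below reuses only classes and calls that already appear in the test above; the input path is illustrative, and EasyOCR is picked as one of the three engines:

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Same options as get_converter() above, with EasyOCR as the engine.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = EasyOcrOptions()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseDocumentBackend,
        )
    }
)

# Path is illustrative; it matches the scanned test data added in this commit.
result = converter.convert_single(Path("tests/data_scanned/ocr_test.pdf"))
print(result.render_as_markdown())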


@@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
 def verify_conversion_result(
-    input_path: Path, doc_result: ConversionResult, generate=False
+    input_path: Path,
+    doc_result: ConversionResult,
+    generate: bool = False,
+    ocr_engine: str = None,
+    skip_cells: bool = False,
 ):

     PageList = TypeAdapter(List[Page])
@@ -143,10 +147,11 @@ def verify_conversion_result(
     doc_pred_md = doc_result.render_as_markdown()
     doc_pred_dt = doc_result.render_as_doctags()

-    pages_path = input_path.with_suffix(".pages.json")
-    json_path = input_path.with_suffix(".json")
-    md_path = input_path.with_suffix(".md")
-    dt_path = input_path.with_suffix(".doctags.txt")
+    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
+    pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
+    json_path = input_path.with_suffix(f"{engine_suffix}.json")
+    md_path = input_path.with_suffix(f"{engine_suffix}.md")
+    dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")

     if generate:  # only used when re-generating truth
         with open(pages_path, "w") as fw:
@@ -173,9 +178,10 @@ def verify_conversion_result(
     with open(dt_path, "r") as fr:
         doc_true_dt = fr.read()

-    assert verify_cells(
-        doc_pred_pages, doc_true_pages
-    ), f"Mismatch in PDF cell prediction for {input_path}"
+    if not skip_cells:
+        assert verify_cells(
+            doc_pred_pages, doc_true_pages
+        ), f"Mismatch in PDF cell prediction for {input_path}"

     # assert verify_output(
     #     doc_pred, doc_true
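With the extended signature, per-engine ground truth and OCR-only checks become possible: ocr_engine selects suffixed truth files (for ocr_test.pdf and an engine string "easyocr", the JSON truth would resolve to ocr_test.easyocr.json), and skip_cells=True bypasses the PDF-cell comparison, which is not meaningful for scanned pages. A minimal usage sketch in the test-package context of this commit; the helper name and the engine string are illustrative, not part of the changed code:

from pathlib import Path

from docling.datamodel.document import ConversionResult

from .verify_utils import verify_conversion_result


def verify_with_engine_truth(
    pdf_path: Path, doc_result: ConversionResult, engine_kind: str
):
    # engine_kind would typically be ocr_options.kind from the e2e test above
    # (e.g. "easyocr"); truth files are then looked up with that suffix.
    verify_conversion_result(
        input_path=pdf_path,
        doc_result=doc_result,
        generate=False,
        ocr_engine=engine_kind,
        skip_cells=True,
    )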