docling/tests/test_toplevel_functions.py
Peter Staar b7debe7250 need to start running all tests successfully
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2024-08-26 17:50:39 +02:00

82 lines
2.0 KiB
Python

import glob
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.document_converter import DocumentConverter
GENERATE = True
def get_pdf_paths():
# Define the directory you want to search
directory = Path("./tests/data")
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf"))
return pdf_files
def verify_json(doc_pred_json, doc_true_json):
return True
def verify_md(doc_pred_md, doc_true_md):
return doc_pred_md == doc_true_md
def test_conversions():
pdf_paths = get_pdf_paths()
print(f"#-documents: {pdf_paths}")
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
for path in pdf_paths:
doc_pred_json = None
try:
print(f"converting {path}")
doc_pred_json = converter.convert_single(path)
except:
continue
doc_pred_md = doc_pred_json.render_as_markdown()
json_path = path.with_suffix(".json")
md_path = path.with_suffix(".md")
if GENERATE:
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred_json, indent=2))
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
else:
with open(path, "r") as fr:
doc_true_json = json.load(fr)
with open(path, "r") as fr:
doc_true_md = json.load(fr)
assert verify_json(
doc_pred_json, doc_true_json
), f"failed json prediction for {path}"
assert verify_md(
doc_pred_md, doc_true_md
), f"failed md prediction for {path}"