test: validate actual docitems in tests (#966)

* validate actual docitems in tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove verbose print

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* disable test generation

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-02-14 17:47:53 +01:00
committed by GitHub
parent ffbde1d1b0
commit e1436a8b05
5 changed files with 168 additions and 58 deletions

View File

@@ -12,6 +12,8 @@ from docling.datamodel.document import (
)
from docling.document_converter import DocumentConverter
from .verify_utils import verify_docitems
# When True, verify_document overwrites the ground-truth file with the
# current prediction instead of comparing against it.
GENERATE = False
@@ -72,6 +74,20 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
def verify_document(pred_doc: DoclingDocument, gtfile: str):
    """Check a converted document against its JSON ground-truth file.

    If ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    set, the predicted document is written to ``gtfile`` as indented JSON
    and the check passes trivially. Otherwise the ground truth is loaded
    and compared with ``pred_doc`` through ``verify_docitems`` with
    ``fuzzy=False``.

    Args:
        pred_doc: Document produced by the conversion under test.
        gtfile: Path of the JSON ground-truth file.

    Returns:
        ``True`` after (re)generating the ground truth; otherwise whatever
        ``verify_docitems`` returns.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so generated ground truth does not depend on the
        # locale of the machine that produced it.
        with open(gtfile, "w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, indent=2)
        return True

    # Read with the same explicit encoding used when writing.
    with open(gtfile, "r", encoding="utf-8") as fr:
        true_doc = DoclingDocument.model_validate_json(fr.read())
    return verify_docitems(pred_doc, true_doc, fuzzy=False)
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
@@ -98,8 +114,7 @@ def test_e2e_docx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
assert verify_document(doc, str(gt_path) + ".json"), "document document"
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()