mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
test: avoid testing exact JSON (#1027)
* test: avoid testing exact JSON

  Avoid testing exact JSON output in html and xml backends.
  Reuse the JSON verify helper function among backend test files.
  Improve type annotations in html backend.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Update tests/test_backend_patent_uspto.py

  Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
6796f0a132
commit
1ac010354f
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@@ -12,7 +11,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_docitems
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
return pred_text == true_text
|
||||
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str):
|
||||
|
||||
if not os.path.exists(gtfile) or GENERATE:
|
||||
with open(gtfile, "w") as fw:
|
||||
json.dump(pred_doc.export_to_dict(), fw, indent=2)
|
||||
|
||||
return True
|
||||
else:
|
||||
with open(gtfile, "r") as fr:
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
||||
|
||||
def test_e2e_docx_conversions():
|
||||
|
||||
docx_paths = get_docx_paths()
|
||||
@@ -114,7 +99,9 @@ def test_e2e_docx_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json"), "document document"
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
), "document document"
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
pred_html: str = doc.export_to_html()
|
||||
|
||||
Reference in New Issue
Block a user