test: avoid testing exact JSON (#1027)

* test: avoid testing exact JSON

Avoid testing exact JSON output in the HTML and XML backends.
Reuse the JSON verification helper function across the backend test files.
Improve type annotations in the HTML backend.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Update tests/test_backend_patent_uspto.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-02-20 16:20:07 +01:00
committed by GitHub
parent 6796f0a132
commit 1ac010354f
8 changed files with 54 additions and 78 deletions

View File

@@ -1,4 +1,5 @@
import json
import os
import warnings
from pathlib import Path
from typing import List, Optional
@@ -457,3 +458,17 @@ def verify_conversion_result_v2(
assert verify_dt(
doc_pred_dt, doc_true_dt, fuzzy=fuzzy
), f"Mismatch in DocTags prediction for {input_path}"
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    """Check a predicted document against a ground-truth JSON file.

    When ``gtfile`` does not exist, or ``generate`` is true, the prediction is
    serialized to ``gtfile`` (overwriting any existing content) and no
    comparison takes place. Otherwise the ground truth is loaded from
    ``gtfile`` and compared with the prediction through ``verify_docitems``
    with ``fuzzy=False``.

    Args:
        pred_doc: The predicted document to serialize or to compare.
        gtfile: Path of the ground-truth JSON file.
        generate: Force (re)writing the ground-truth file from ``pred_doc``.

    Returns:
        ``True`` if the ground-truth file was written; otherwise whatever
        ``verify_docitems`` returns for the prediction and the ground truth.
    """
    if generate or not os.path.exists(gtfile):
        # Pin the encoding instead of relying on the platform locale.
        # json.dump emits ASCII-only output by default, so the bytes written
        # are the same as before on every platform.
        with open(gtfile, "w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, indent=2)
        return True

    # Ground-truth files may contain raw non-ASCII text; decode them as UTF-8
    # explicitly so the result does not depend on the locale (e.g. cp1252).
    with open(gtfile, encoding="utf-8") as fr:
        true_doc = DoclingDocument.model_validate_json(fr.read())
    return verify_docitems(pred_doc, true_doc, fuzzy=False)