mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix(test): Introduce the `skip_cells` parameter in verify_conversion_result()
to allow skipping the verification of the cells. It is used for OCR tests.
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
072aaf6bb1
commit
29e65e911b
@ -24,7 +24,7 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
parent = pdf_path.parent
|
parent = pdf_path.parent
|
||||||
eng = "" if engine is None else ".{engine}"
|
eng = "" if engine is None else f".{engine}"
|
||||||
|
|
||||||
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
|
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
|
||||||
with open(dict_fn, "w") as fd:
|
with open(dict_fn, "w") as fd:
|
||||||
@ -94,4 +94,5 @@ def test_e2e_conversions():
|
|||||||
input_path=pdf_path,
|
input_path=pdf_path,
|
||||||
doc_result=doc_result,
|
doc_result=doc_result,
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
|
skip_cells=True,
|
||||||
)
|
)
|
||||||
|
@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
|
|||||||
|
|
||||||
|
|
||||||
def verify_conversion_result(
|
def verify_conversion_result(
|
||||||
input_path: Path, doc_result: ConversionResult, generate=False
|
input_path: Path,
|
||||||
|
doc_result: ConversionResult,
|
||||||
|
generate: bool = False,
|
||||||
|
ocr_engine: str = None,
|
||||||
|
skip_cells: bool = False,
|
||||||
):
|
):
|
||||||
PageList = TypeAdapter(List[Page])
|
PageList = TypeAdapter(List[Page])
|
||||||
|
|
||||||
@ -143,10 +147,11 @@ def verify_conversion_result(
|
|||||||
doc_pred_md = doc_result.render_as_markdown()
|
doc_pred_md = doc_result.render_as_markdown()
|
||||||
doc_pred_dt = doc_result.render_as_doctags()
|
doc_pred_dt = doc_result.render_as_doctags()
|
||||||
|
|
||||||
pages_path = input_path.with_suffix(".pages.json")
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
json_path = input_path.with_suffix(".json")
|
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
|
||||||
md_path = input_path.with_suffix(".md")
|
json_path = input_path.with_suffix(f"{engine_suffix}.json")
|
||||||
dt_path = input_path.with_suffix(".doctags.txt")
|
md_path = input_path.with_suffix(f"{engine_suffix}.md")
|
||||||
|
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, "w") as fw:
|
||||||
@ -173,9 +178,10 @@ def verify_conversion_result(
|
|||||||
with open(dt_path, "r") as fr:
|
with open(dt_path, "r") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
assert verify_cells(
|
if not skip_cells:
|
||||||
doc_pred_pages, doc_true_pages
|
assert verify_cells(
|
||||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
doc_pred_pages, doc_true_pages
|
||||||
|
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||||
|
|
||||||
# assert verify_output(
|
# assert verify_output(
|
||||||
# doc_pred, doc_true
|
# doc_pred, doc_true
|
||||||
|
Loading…
Reference in New Issue
Block a user