fix(test): Introduce the `skip_cells` parameter in verify_conversion_result() to allow skipping the verification of the cells. It is used for the OCR tests.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-08 14:14:43 +02:00
parent 072aaf6bb1
commit 29e65e911b
2 changed files with 16 additions and 9 deletions

View File

@ -24,7 +24,7 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
import os import os
parent = pdf_path.parent parent = pdf_path.parent
eng = "" if engine is None else ".{engine}" eng = "" if engine is None else f".{engine}"
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
with open(dict_fn, "w") as fd: with open(dict_fn, "w") as fd:
@ -94,4 +94,5 @@ def test_e2e_conversions():
input_path=pdf_path, input_path=pdf_path,
doc_result=doc_result, doc_result=doc_result,
generate=GENERATE, generate=GENERATE,
skip_cells=True,
) )

View File

@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
def verify_conversion_result( def verify_conversion_result(
input_path: Path, doc_result: ConversionResult, generate=False input_path: Path,
doc_result: ConversionResult,
generate: bool = False,
ocr_engine: str = None,
skip_cells: bool = False,
): ):
PageList = TypeAdapter(List[Page]) PageList = TypeAdapter(List[Page])
@ -143,10 +147,11 @@ def verify_conversion_result(
doc_pred_md = doc_result.render_as_markdown() doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags() doc_pred_dt = doc_result.render_as_doctags()
pages_path = input_path.with_suffix(".pages.json") engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
json_path = input_path.with_suffix(".json") pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
md_path = input_path.with_suffix(".md") json_path = input_path.with_suffix(f"{engine_suffix}.json")
dt_path = input_path.with_suffix(".doctags.txt") md_path = input_path.with_suffix(f"{engine_suffix}.md")
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth if generate: # only used when re-generating truth
with open(pages_path, "w") as fw: with open(pages_path, "w") as fw:
@ -173,6 +178,7 @@ def verify_conversion_result(
with open(dt_path, "r") as fr: with open(dt_path, "r") as fr:
doc_true_dt = fr.read() doc_true_dt = fr.read()
if not skip_cells:
assert verify_cells( assert verify_cells(
doc_pred_pages, doc_true_pages doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {input_path}" ), f"Mismatch in PDF cell prediction for {input_path}"