From 29e65e911bdf08624278c58cc7405d67bbce6a0c Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 8 Oct 2024 14:14:43 +0200 Subject: [PATCH] fix(test): Introduce parameter in `verify_conversion_result()` to allow skipping the verification of the cells. It is used in case of OCR tests. Signed-off-by: Nikos Livathinos --- tests/test_e2e_ocr_conversion.py | 3 ++- tests/verify_utils.py | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index ad88f65c..c7a1147d 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -24,7 +24,7 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): import os parent = pdf_path.parent - eng = "" if engine is None else ".{engine}" + eng = "" if engine is None else f".{engine}" dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") with open(dict_fn, "w") as fd: @@ -94,4 +94,5 @@ def test_e2e_conversions(): input_path=pdf_path, doc_result=doc_result, generate=GENERATE, + skip_cells=True, ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index a0b0f0e6..082b7c78 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt): def verify_conversion_result( - input_path: Path, doc_result: ConversionResult, generate=False + input_path: Path, + doc_result: ConversionResult, + generate: bool = False, + ocr_engine: str = None, + skip_cells: bool = False, ): PageList = TypeAdapter(List[Page]) @@ -143,10 +147,11 @@ def verify_conversion_result( doc_pred_md = doc_result.render_as_markdown() doc_pred_dt = doc_result.render_as_doctags() - pages_path = input_path.with_suffix(".pages.json") - json_path = input_path.with_suffix(".json") - md_path = input_path.with_suffix(".md") - dt_path = input_path.with_suffix(".doctags.txt") + engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json") + json_path = input_path.with_suffix(f"{engine_suffix}.json") + md_path = input_path.with_suffix(f"{engine_suffix}.md") + dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth with open(pages_path, "w") as fw: @@ -173,9 +178,10 @@ def verify_conversion_result( with open(dt_path, "r") as fr: doc_true_dt = fr.read() - assert verify_cells( - doc_pred_pages, doc_true_pages - ), f"Mismatch in PDF cell prediction for {input_path}" + if not skip_cells: + assert verify_cells( + doc_pred_pages, doc_true_pages + ), f"Mismatch in PDF cell prediction for {input_path}" # assert verify_output( # doc_pred, doc_true