diff --git a/tests/data/test_01.asciidoc b/tests/data/asciidoc/test_01.asciidoc similarity index 100% rename from tests/data/test_01.asciidoc rename to tests/data/asciidoc/test_01.asciidoc diff --git a/tests/data/test_02.asciidoc b/tests/data/asciidoc/test_02.asciidoc similarity index 100% rename from tests/data/test_02.asciidoc rename to tests/data/asciidoc/test_02.asciidoc diff --git a/tests/data/2203.01017v2.pdf b/tests/data/pdf/2203.01017v2.pdf similarity index 100% rename from tests/data/2203.01017v2.pdf rename to tests/data/pdf/2203.01017v2.pdf diff --git a/tests/data/2206.01062.pdf b/tests/data/pdf/2206.01062.pdf similarity index 100% rename from tests/data/2206.01062.pdf rename to tests/data/pdf/2206.01062.pdf diff --git a/tests/data/2305.03393v1-pg9.pdf b/tests/data/pdf/2305.03393v1-pg9.pdf similarity index 100% rename from tests/data/2305.03393v1-pg9.pdf rename to tests/data/pdf/2305.03393v1-pg9.pdf diff --git a/tests/data/2305.03393v1.pdf b/tests/data/pdf/2305.03393v1.pdf similarity index 100% rename from tests/data/2305.03393v1.pdf rename to tests/data/pdf/2305.03393v1.pdf diff --git a/tests/data/amt_handbook_sample.pdf b/tests/data/pdf/amt_handbook_sample.pdf similarity index 100% rename from tests/data/amt_handbook_sample.pdf rename to tests/data/pdf/amt_handbook_sample.pdf diff --git a/tests/data/code_and_formula.pdf b/tests/data/pdf/code_and_formula.pdf similarity index 100% rename from tests/data/code_and_formula.pdf rename to tests/data/pdf/code_and_formula.pdf diff --git a/tests/data/picture_classification.pdf b/tests/data/pdf/picture_classification.pdf similarity index 100% rename from tests/data/picture_classification.pdf rename to tests/data/pdf/picture_classification.pdf diff --git a/tests/data/redp5110_sampled.pdf b/tests/data/pdf/redp5110_sampled.pdf similarity index 100% rename from tests/data/redp5110_sampled.pdf rename to tests/data/pdf/redp5110_sampled.pdf diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index e4fae312..4574a228 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -20,7 +20,7 @@ def _get_backend(fname): def test_asciidocs_examples(): - fnames = sorted(glob.glob("./tests/data/*.asciidoc")) + fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc")) for fname in fnames: print(f"reading {fname}") diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 66e7771d..3c214791 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -28,7 +28,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index 087272bf..ee0e5c75 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -27,7 +27,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index b2a77dcd..10a2b9e7 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -28,7 +28,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_cli.py b/tests/test_cli.py index 71d14457..4364df8b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,7 +18,7 @@ def test_cli_version(): def test_cli_convert(tmp_path): - source = "./tests/data/2305.03393v1-pg9.pdf" + source = "./tests/data/pdf/2305.03393v1-pg9.pdf" output = tmp_path / "out" output.mkdir() result = runner.invoke(app, [source, "--output", str(output)]) diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index 05e87246..a5e6100c 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -36,7 +36,7 @@ def get_converter(): def test_code_and_formula_conversion(): - pdf_path = Path("tests/data/code_and_formula.pdf") + pdf_path = Path("tests/data/pdf/code_and_formula.pdf") converter = get_converter() print(f"converting {pdf_path}") diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 0ad87e96..6ca54d63 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -37,7 +37,7 @@ def get_converter(): def test_picture_classifier(): - pdf_path = Path("tests/data/picture_classification.pdf") + pdf_path = Path("tests/data/pdf/picture_classification.pdf") converter = get_converter() print(f"converting {pdf_path}") diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 0c572595..d2215d61 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -15,7 +15,7 @@ GENERATE_V2 = False def get_pdf_paths(): # Define the directory you want to search - directory = Path("./tests/data") + directory = Path("./tests/data/pdf/") # List all PDF files in the directory and its subdirectories pdf_files = sorted(directory.rglob("*.pdf")) diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index efecb81e..c21b6c43 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -9,7 +9,7 @@ from docling.datamodel.settings import DocumentLimits def test_in_doc_from_valid_path(): - test_doc_path = Path("./tests/data/2206.01062.pdf") + test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") doc = _make_input_doc(test_doc_path) assert doc.valid == True @@ -24,7 +24,7 @@ def test_in_doc_from_invalid_path(): def test_in_doc_from_valid_buf(): - buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) doc = _make_input_doc_from_stream(stream) @@ -41,7 +41,7 @@ def test_in_doc_from_invalid_buf(): def test_in_doc_with_page_range(): - test_doc_path = Path("./tests/data/2206.01062.pdf") + test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() limits.page_range = (1, 10) @@ -81,10 +81,10 @@ def test_guess_format(tmp_path): temp_dir.mkdir() # Valid PDF - buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) assert dci._guess_format(stream) == InputFormat.PDF - doc_path = Path("./tests/data/2206.01062.pdf") + doc_path = Path("./tests/data/pdf/2206.01062.pdf") assert dci._guess_format(doc_path) == InputFormat.PDF # Valid MS Office diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 23bc3345..1978bc74 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -15,7 +15,7 @@ GENERATE = False def get_pdf_path(): - pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py index f40d79e4..68716cba 100644 --- a/tests/test_invalid_input.py +++ b/tests/test_invalid_input.py @@ -9,7 +9,7 @@ from docling.document_converter import ConversionError, DocumentConverter def get_pdf_path(): - pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index 28800edd..215253d2 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -16,7 +16,7 @@ def test_doc_paths(): Path("tests/data/docx/lorem_ipsum.docx"), Path("tests/data/pptx/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), - Path("tests/data/2206.01062.pdf"), + Path("tests/data/pdf/2206.01062.pdf"), ] diff --git a/tests/test_options.py b/tests/test_options.py index 1dd3bbc8..c8701a1b 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -17,7 +17,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def get_converters_with_table_options(): diff --git a/tests/verify_utils.py b/tests/verify_utils.py index c444266b..ee94519c 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -248,8 +248,13 @@ def verify_conversion_result_v1( doc_pred_md = doc_result.legacy_document.export_to_markdown() doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() + engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name + if str(input_path.parent).endswith("pdf"): + gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name + pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json") json_path = gt_subpath.with_suffix(f"{engine_suffix}.json") md_path = gt_subpath.with_suffix(f"{engine_suffix}.md") @@ -325,7 +330,11 @@ def verify_conversion_result_v2( doc_pred_dt = doc_result.document.export_to_document_tokens() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name + if str(input_path.parent).endswith("pdf"): + gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name + pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json") json_path = gt_subpath.with_suffix(f"{engine_suffix}.json") md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")