From 12eea8495f36472bd0947552b3fd8d501b7bd06a Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Mon, 26 Aug 2024 17:00:30 +0200
Subject: [PATCH] renamed the test folder and added the toplevel test

Signed-off-by: Peter Staar
---
Review notes (this section is ignored by `git am`):
- Line breaks and indentation of this patch were reconstructed from a
  whitespace-collapsed copy; verify the whitespace of context lines against
  the tree before applying.
- tests/test_toplevel_functions.py was corrected during review (missing
  `json` import, undefined `converter`/`doc` names, reference files read from
  the PDF path, markdown reference parsed as JSON, CWD-dependent data path),
  so its `index` hash below predates those fixes.
- tests/test_backend_docling_parse.py: `doc_backend.page_count() == 9` is a
  bare comparison without `assert`, so test_num_pages currently checks
  nothing; left untouched here because it is a context line of this patch.

 .pre-commit-config.yaml                       |  4 +-
 {test => tests}/data/2203.01017v2.pdf         | Bin
 {test => tests}/data/2206.01062.pdf           | Bin
 {test => tests}/data/2305.03393v1.pdf         | Bin
 {test => tests}/data/redp5110.pdf             | Bin
 {test => tests}/data/redp5695.pdf             | Bin
 {test => tests}/test_backend_docling_parse.py |  4 ++
 {test => tests}/test_backend_pdfium.py        |  0
 tests/test_toplevel_functions.py              | 71 +++++++++++++++++++
 9 files changed, 77 insertions(+), 2 deletions(-)
 rename {test => tests}/data/2203.01017v2.pdf (100%)
 rename {test => tests}/data/2206.01062.pdf (100%)
 rename {test => tests}/data/2305.03393v1.pdf (100%)
 rename {test => tests}/data/redp5110.pdf (100%)
 rename {test => tests}/data/redp5695.pdf (100%)
 rename {test => tests}/test_backend_docling_parse.py (94%)
 rename {test => tests}/test_backend_pdfium.py (100%)
 create mode 100644 tests/test_toplevel_functions.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5ee1599b..0f95e067 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,7 +4,7 @@ repos:
     hooks:
       - id: system
         name: Black
-        entry: poetry run black docling examples
+        entry: poetry run black docling examples tests
         pass_filenames: false
         language: system
         files: '\.py$'
@@ -12,7 +12,7 @@ repos:
     hooks:
       - id: system
         name: isort
-        entry: poetry run isort docling examples
+        entry: poetry run isort docling examples tests
         pass_filenames: false
         language: system
         files: '\.py$'
diff --git a/test/data/2203.01017v2.pdf b/tests/data/2203.01017v2.pdf
similarity index 100%
rename from test/data/2203.01017v2.pdf
rename to tests/data/2203.01017v2.pdf
diff --git a/test/data/2206.01062.pdf b/tests/data/2206.01062.pdf
similarity index 100%
rename from test/data/2206.01062.pdf
rename to tests/data/2206.01062.pdf
diff --git a/test/data/2305.03393v1.pdf b/tests/data/2305.03393v1.pdf
similarity index 100%
rename from test/data/2305.03393v1.pdf
rename to tests/data/2305.03393v1.pdf
diff --git a/test/data/redp5110.pdf b/tests/data/redp5110.pdf
similarity index 100%
rename from test/data/redp5110.pdf
rename to tests/data/redp5110.pdf
diff --git a/test/data/redp5695.pdf b/tests/data/redp5695.pdf
similarity index 100%
rename from test/data/redp5695.pdf
rename to tests/data/redp5695.pdf
diff --git a/test/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py
similarity index 94%
rename from test/test_backend_docling_parse.py
rename to tests/test_backend_docling_parse.py
index c8d08e0e..8f42e1ba 100644
--- a/test/test_backend_docling_parse.py
+++ b/tests/test_backend_docling_parse.py
@@ -1,3 +1,5 @@
+import glob
+
 from pathlib import Path
 
 import pytest
@@ -5,6 +7,7 @@ import pytest
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
 from docling.datamodel.base_models import BoundingBox
 
+from docling.document_converter import DocumentConverter
 
 @pytest.fixture
 def test_doc_path():
@@ -31,3 +34,4 @@ def test_crop_page_image(test_doc_path):
 def test_num_pages(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     doc_backend.page_count() == 9
+
diff --git a/test/test_backend_pdfium.py b/tests/test_backend_pdfium.py
similarity index 100%
rename from test/test_backend_pdfium.py
rename to tests/test_backend_pdfium.py
diff --git a/tests/test_toplevel_functions.py b/tests/test_toplevel_functions.py
new file mode 100644
index 00000000..b3b7c0c0
--- /dev/null
+++ b/tests/test_toplevel_functions.py
@@ -0,0 +1,71 @@
+import glob
+import json
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.document_converter import DocumentConverter
+
+# When True, (re)write the reference .json/.md files next to each PDF instead
+# of comparing against them; flip to False once the references are committed.
+GENERATE = True
+
+
+def get_pdf_paths():
+    """Return every PDF under this test suite's data directory, sorted."""
+    # Anchor on this file so the search works from any working directory.
+    directory = Path(__file__).parent / "data"
+    return sorted(directory.rglob("*.pdf"))
+
+
+def verify_json(doc_pred_json, doc_true_json):
+    """Compare predicted and reference JSON; placeholder that always passes."""
+    return True
+
+
+def verify_md(doc_pred_md, doc_true_md):
+    """Return True when the predicted markdown equals the reference exactly."""
+    return doc_pred_md == doc_true_md
+
+
+def test_conversions():
+    """Convert every test PDF and write or check its JSON/markdown output."""
+    pdf_paths = get_pdf_paths()
+
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    for path in pdf_paths:
+        doc = doc_converter.convert_single(path)
+
+        # NOTE(review): assumes the result exposes render_as_dict() next to
+        # render_as_markdown(); confirm against the pinned docling version.
+        doc_pred_json = doc.render_as_dict()
+        doc_pred_md = doc.render_as_markdown()
+
+        json_path = path.with_suffix(".json")
+        md_path = path.with_suffix(".md")
+
+        if GENERATE:
+            with open(json_path, "w", encoding="utf-8") as fw:
+                fw.write(json.dumps(doc_pred_json, indent=2))
+
+            with open(md_path, "w", encoding="utf-8") as fw:
+                fw.write(doc_pred_md)
+        else:
+            with open(json_path, "r", encoding="utf-8") as fr:
+                doc_true_json = json.load(fr)
+
+            with open(md_path, "r", encoding="utf-8") as fr:
+                doc_true_md = fr.read()
+
+            assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
+
+            assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"