Renamed the test folder and added the top-level test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-08-26 17:00:30 +02:00
parent f5eb49a811
commit 12eea8495f
9 changed files with 74 additions and 2 deletions

View File

@ -4,7 +4,7 @@ repos:
hooks: hooks:
- id: system - id: system
name: Black name: Black
entry: poetry run black docling examples entry: poetry run black docling examples tests
pass_filenames: false pass_filenames: false
language: system language: system
files: '\.py$' files: '\.py$'
@ -12,7 +12,7 @@ repos:
hooks: hooks:
- id: system - id: system
name: isort name: isort
entry: poetry run isort docling examples entry: poetry run isort docling examples tests
pass_filenames: false pass_filenames: false
language: system language: system
files: '\.py$' files: '\.py$'

View File

@ -1,3 +1,5 @@
import glob
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -5,6 +7,7 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
from docling.datamodel.base_models import BoundingBox from docling.datamodel.base_models import BoundingBox
from docling.document_converter import DocumentConverter
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
@ -31,3 +34,4 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path) doc_backend = DoclingParseDocumentBackend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9

View File

@ -0,0 +1,68 @@
import glob
import json
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.document_converter import DocumentConverter
GENERATE=True
def get_pdf_paths():
# Define the directory you want to search
directory = Path('./data')
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob('*.pdf'))
return pdf_files
def verify_json(doc_pred_json, doc_true_json):
return True
def verify_md(doc_pred_md, doc_true_md):
return (doc_pred_md==doc_true_md)
def test_conversions():
pdf_paths = get_pdf_paths()
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
for path in pdf_paths:
doc_pred_json = converter.convert_single(path)
doc_pred_md = doc.render_as_markdown()
json_path = path.with_suffix(".json")
md_path = path.with_suffix(".md")
if GENERATE:
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred_json, indent=2))
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
else:
with open(path, "r") as fr:
doc_true_json = json.load(fr)
with open(path, "r") as fr:
doc_true_md = json.load(fr)
assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"