renamed the test folder and added the toplevel test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2024-08-26 17:00:30 +02:00
parent f5eb49a811
commit 12eea8495f
9 changed files with 74 additions and 2 deletions
--- a/tests/data/2203.01017v2.pdf
+++ b/tests/data/2203.01017v2.pdf
--- a/tests/data/2206.01062.pdf
+++ b/tests/data/2206.01062.pdf
--- a/tests/data/2305.03393v1.pdf
+++ b/tests/data/2305.03393v1.pdf
--- a/tests/data/redp5110.pdf
+++ b/tests/data/redp5110.pdf
--- a/tests/data/redp5695.pdf
+++ b/tests/data/redp5695.pdf
--- a/tests/test_backend_docling_parse.py
+++ b/tests/test_backend_docling_parse.py
@@ -0,0 +1,37 @@
+import glob
+
+from pathlib import Path
+
+import pytest
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
+from docling.datamodel.base_models import BoundingBox
+
+from docling.document_converter import DocumentConverter
+
+@pytest.fixture
+def test_doc_path():
+    """Return the path of the DocLayNet paper PDF used by the tests in this module.
+
+    The path is anchored to the directory of this test file rather than to the
+    current working directory, so the PDF is found no matter where pytest is
+    started from.
+    """
+    return Path(__file__).resolve().parent / "data" / "2206.01062.pdf"
+
+def test_get_text_from_rect(test_doc_path):
+    """Check that the text inside the title rectangle of page 0 is the paper title."""
+    backend = DoclingParseDocumentBackend(test_doc_path)
+    first_page: DoclingParsePageBackend = backend.load_page(0)
+
+    # Rectangle around the title of the DocLayNet paper
+    title_box = BoundingBox(l=102, t=77, r=511, b=124)
+    extracted = first_page.get_text_in_rect(bbox=title_box)
+
+    expected_title = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+    assert extracted.strip() == expected_title
+
+def test_crop_page_image(test_doc_path):
+    """Smoke-test rendering a cropped region of page 0 with scale=2.
+
+    NOTE(review): nothing is asserted about the returned image; the test only
+    passes or fails on whether the calls raise.
+    """
+    backend = DoclingParseDocumentBackend(test_doc_path)
+    first_page: DoclingParsePageBackend = backend.load_page(0)
+
+    # Region of "Figure 1" in the DocLayNet paper
+    figure_box = BoundingBox(l=317, t=246, r=574, b=527)
+    cropped = first_page.get_page_image(scale=2, cropbox=figure_box)
+    # Uncomment to inspect the crop manually:
+    # cropped.show()
+
+def test_num_pages(test_doc_path):
+    """Check that the backend reports 9 pages for the test PDF."""
+    doc_backend = DoclingParseDocumentBackend(test_doc_path)
+    # `assert` is required here: a bare comparison expression is evaluated and
+    # discarded, so the test could never fail.
+    assert doc_backend.page_count() == 9
+
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import pytest
+
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
+from docling.datamodel.base_models import BoundingBox
+
+
+@pytest.fixture
+def test_doc_path():
+    """Return the path of the DocLayNet paper PDF used by the tests in this module.
+
+    The path is anchored to the directory of this test file rather than to the
+    current working directory, so the PDF is found no matter where pytest is
+    started from.
+    """
+    return Path(__file__).resolve().parent / "data" / "2206.01062.pdf"
+
+def test_get_text_from_rect(test_doc_path):
+    """Check that the text inside the title rectangle of page 0 is the paper title.
+
+    The expected string contains a ``\r\n`` line break between the two title lines.
+    """
+    backend = PyPdfiumDocumentBackend(test_doc_path)
+    first_page: PyPdfiumPageBackend = backend.load_page(0)
+
+    # Rectangle around the title of the DocLayNet paper
+    title_box = BoundingBox(l=102, t=77, r=511, b=124)
+    extracted = first_page.get_text_in_rect(bbox=title_box)
+
+    expected_title = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
+    assert extracted.strip() == expected_title
+
+def test_crop_page_image(test_doc_path):
+    """Smoke-test rendering a cropped region of page 0 with scale=2.
+
+    NOTE(review): nothing is asserted about the returned image; the test only
+    passes or fails on whether the calls raise.
+    """
+    backend = PyPdfiumDocumentBackend(test_doc_path)
+    first_page: PyPdfiumPageBackend = backend.load_page(0)
+
+    # Region of "Figure 1" in the DocLayNet paper
+    figure_box = BoundingBox(l=317, t=246, r=574, b=527)
+    cropped = first_page.get_page_image(scale=2, cropbox=figure_box)
+    # Uncomment to inspect the crop manually:
+    # cropped.show()
+
+def test_num_pages(test_doc_path):
+    """Check that the backend reports 9 pages for the test PDF."""
+    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
+    # `assert` is required here: a bare comparison expression is evaluated and
+    # discarded, so the test could never fail.
+    assert doc_backend.page_count() == 9
--- a/tests/test_toplevel_functions.py
+++ b/tests/test_toplevel_functions.py
@@ -0,0 +1,68 @@
+import glob
+
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.document_converter import DocumentConverter
+
+# When True, test_conversions writes the predicted JSON/Markdown next to each
+# PDF instead of comparing the predictions against data loaded from disk.
+GENERATE = True
+
+def get_pdf_paths():
+    """Return all PDF files under the ``data`` folder next to this file, sorted.
+
+    The folder is located relative to this test file rather than the current
+    working directory, and it is searched recursively (subdirectories included).
+    """
+    data_dir = Path(__file__).resolve().parent / "data"
+    return sorted(data_dir.rglob("*.pdf"))
+
+def verify_json(doc_pred_json, doc_true_json):
+    """Compare the predicted JSON document against the reference one.
+
+    NOTE(review): this is currently a stub. Both arguments are ignored and
+    every prediction is accepted.
+    """
+    return True
+
+def verify_md(doc_pred_md, doc_true_md):
+    """Return whether the predicted Markdown equals the reference Markdown exactly."""
+    is_match = doc_pred_md == doc_true_md
+    return is_match
+    
+def test_conversions():
+    """Convert every test PDF and check (or regenerate) its JSON/Markdown output.
+
+    With GENERATE set, the predictions are written next to each PDF as
+    ``<name>.json`` and ``<name>.md``. Otherwise those two files are loaded and
+    compared against the fresh predictions with verify_json / verify_md.
+    """
+    # Imported locally because the module's import block does not import json.
+    import json
+
+    pdf_paths = get_pdf_paths()
+
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    for path in pdf_paths:
+        doc = doc_converter.convert_single(path)
+
+        # NOTE(review): assumes the conversion result exposes render_as_dict()
+        # next to render_as_markdown(), and that the returned dict is what
+        # should be serialized with json.dumps; confirm against the docling API.
+        doc_pred_json = doc.render_as_dict()
+        doc_pred_md = doc.render_as_markdown()
+
+        json_path = path.with_suffix(".json")
+        md_path = path.with_suffix(".md")
+
+        if GENERATE:
+            json_path.write_text(json.dumps(doc_pred_json, indent=2), encoding="utf-8")
+            md_path.write_text(doc_pred_md, encoding="utf-8")
+            continue
+
+        # Each reference comes from its own file: the JSON one is parsed, the
+        # Markdown one is read as plain text.
+        doc_true_json = json.loads(json_path.read_text(encoding="utf-8"))
+        doc_true_md = md_path.read_text(encoding="utf-8")
+
+        assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
+        assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
+