mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
renamed the test folder and added the toplevel test
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
f5eb49a811
commit
12eea8495f
@ -4,7 +4,7 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: system
|
- id: system
|
||||||
name: Black
|
name: Black
|
||||||
entry: poetry run black docling examples
|
entry: poetry run black docling examples tests
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
language: system
|
language: system
|
||||||
files: '\.py$'
|
files: '\.py$'
|
||||||
@ -12,7 +12,7 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: system
|
- id: system
|
||||||
name: isort
|
name: isort
|
||||||
entry: poetry run isort docling examples
|
entry: poetry run isort docling examples tests
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
language: system
|
language: system
|
||||||
files: '\.py$'
|
files: '\.py$'
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import glob
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -5,6 +7,7 @@ import pytest
|
|||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
|
||||||
from docling.datamodel.base_models import BoundingBox
|
from docling.datamodel.base_models import BoundingBox
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
@ -31,3 +34,4 @@ def test_crop_page_image(test_doc_path):
|
|||||||
def test_num_pages(test_doc_path):
    """Check the page count the parse backend reports for the test document.

    Args:
        test_doc_path: Path to the sample document, supplied by the
            ``test_doc_path`` fixture.
    """
    doc_backend = DoclingParseDocumentBackend(test_doc_path)
    # The comparison has to be asserted; as a bare expression its result is
    # discarded and the test would pass regardless of the page count.
    assert doc_backend.page_count() == 9
|
||||||
|
|
68
tests/test_toplevel_functions.py
Normal file
68
tests/test_toplevel_functions.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import glob
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
# Switch read by test_conversions(): when True, the predicted JSON and
# Markdown are written next to each PDF; when False, the predictions are
# compared against those files instead.
GENERATE=True
|
||||||
|
|
||||||
|
def get_pdf_paths(directory=Path("./data")):
    """Return every PDF file found under ``directory``, sorted by path.

    Args:
        directory: Root folder to search, as a ``Path`` or a string. Defaults
            to ``./data`` relative to the current working directory, which is
            the location the function previously had hard-coded.

    Returns:
        A sorted list of ``Path`` objects, one per ``*.pdf`` file found in
        ``directory`` or any of its subdirectories.
    """
    # Sorting keeps the processing order stable between runs.
    return sorted(Path(directory).rglob("*.pdf"))
|
||||||
|
|
||||||
|
def verify_json(doc_pred_json, doc_true_json):
    """Placeholder JSON check: accepts any prediction/reference pair.

    Args:
        doc_pred_json: Predicted JSON document (not inspected).
        doc_true_json: Reference JSON document (not inspected).

    Returns:
        Always ``True``.
    """
    # No comparison is performed; both arguments are ignored.
    outcome = True
    return outcome
|
||||||
|
|
||||||
|
def verify_md(doc_pred_md, doc_true_md):
    """Compare predicted Markdown against the reference with ``==``.

    Args:
        doc_pred_md: Markdown produced by the conversion.
        doc_true_md: Reference Markdown to compare against.

    Returns:
        The result of ``doc_pred_md == doc_true_md``.
    """
    is_match = doc_pred_md == doc_true_md
    return is_match
|
||||||
|
|
||||||
|
def test_conversions():
    """Convert every PDF from get_pdf_paths() and check or regenerate outputs.

    Each PDF is converted with OCR disabled and table-structure recognition
    (with cell matching) enabled. When the module-level ``GENERATE`` flag is
    true, the predicted JSON and Markdown are written next to the PDF as
    ``<name>.json`` and ``<name>.md``. Otherwise those two files are loaded
    as the reference and compared with verify_json() / verify_md().
    """
    # Imported locally so this function does not depend on the module's
    # import block, which does not bring in json.
    import json

    pdf_paths = get_pdf_paths()

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    for path in pdf_paths:
        # Use the converter built above; the previous code referenced the
        # undefined names `converter` and `doc`.
        conv_result = doc_converter.convert_single(path)

        # NOTE(review): assumes the conversion result exposes render_as_dict()
        # alongside render_as_markdown(), giving a JSON-serialisable dict.
        # Confirm against the installed docling version.
        doc_pred_json = conv_result.render_as_dict()
        doc_pred_md = conv_result.render_as_markdown()

        json_path = path.with_suffix(".json")
        md_path = path.with_suffix(".md")

        if GENERATE:
            with open(json_path, "w", encoding="utf-8") as fw:
                fw.write(json.dumps(doc_pred_json, indent=2))

            with open(md_path, "w", encoding="utf-8") as fw:
                fw.write(doc_pred_md)

        else:
            # Read the stored references, not the PDF itself.
            with open(json_path, "r", encoding="utf-8") as fr:
                doc_true_json = json.load(fr)

            # The Markdown reference is plain text, not JSON.
            with open(md_path, "r", encoding="utf-8") as fr:
                doc_true_md = fr.read()

            assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"

            assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user