Renamed the test folder and added the top-level test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-08-26 17:00:30 +02:00
parent f5eb49a811
commit 12eea8495f
9 changed files with 74 additions and 2 deletions

View File

@ -4,7 +4,7 @@ repos:
hooks: hooks:
- id: system - id: system
name: Black name: Black
entry: poetry run black docling examples entry: poetry run black docling examples tests
pass_filenames: false pass_filenames: false
language: system language: system
files: '\.py$' files: '\.py$'
@ -12,7 +12,7 @@ repos:
hooks: hooks:
- id: system - id: system
name: isort name: isort
entry: poetry run isort docling examples entry: poetry run isort docling examples tests
pass_filenames: false pass_filenames: false
language: system language: system
files: '\.py$' files: '\.py$'

View File

@ -1,3 +1,5 @@
import glob
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -5,6 +7,7 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
from docling.datamodel.base_models import BoundingBox from docling.datamodel.base_models import BoundingBox
from docling.document_converter import DocumentConverter
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
@ -31,3 +34,4 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path) doc_backend = DoclingParseDocumentBackend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9

View File

@ -0,0 +1,68 @@
import glob
import json
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.document_converter import DocumentConverter
GENERATE=True
def get_pdf_paths():
# Define the directory you want to search
directory = Path('./data')
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob('*.pdf'))
return pdf_files
def verify_json(doc_pred_json, doc_true_json):
return True
def verify_md(doc_pred_md, doc_true_md):
return (doc_pred_md==doc_true_md)
def test_conversions():
pdf_paths = get_pdf_paths()
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
for path in pdf_paths:
doc_pred_json = converter.convert_single(path)
doc_pred_md = doc.render_as_markdown()
json_path = path.with_suffix(".json")
md_path = path.with_suffix(".md")
if GENERATE:
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred_json, indent=2))
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
else:
with open(path, "r") as fr:
doc_true_json = json.load(fr)
with open(path, "r") as fr:
doc_true_md = json.load(fr)
assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"