mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
renamed the test folder and added the toplevel test
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
BIN
tests/data/2203.01017v2.pdf
Normal file
BIN
tests/data/2203.01017v2.pdf
Normal file
Binary file not shown.
BIN
tests/data/2206.01062.pdf
Normal file
BIN
tests/data/2206.01062.pdf
Normal file
Binary file not shown.
BIN
tests/data/2305.03393v1.pdf
Normal file
BIN
tests/data/2305.03393v1.pdf
Normal file
Binary file not shown.
BIN
tests/data/redp5110.pdf
Normal file
BIN
tests/data/redp5110.pdf
Normal file
Binary file not shown.
BIN
tests/data/redp5695.pdf
Normal file
BIN
tests/data/redp5695.pdf
Normal file
Binary file not shown.
37
tests/test_backend_docling_parse.py
Normal file
37
tests/test_backend_docling_parse.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import glob
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
|
||||
from docling.datamodel.base_models import BoundingBox
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
@pytest.fixture
def test_doc_path():
    """Provide the path of the DocLayNet paper PDF used by the backend tests."""
    # Resolve relative to this file rather than the working directory, so the
    # tests also find the PDF when pytest is started outside the tests folder.
    return Path(__file__).resolve().parent / "data" / "2206.01062.pdf"
|
||||
|
||||
def test_get_text_from_rect(test_doc_path):
    """Check the text extracted from the title region of page 0."""
    expected_title = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"

    document = DoclingParseDocumentBackend(test_doc_path)
    page: DoclingParsePageBackend = document.load_page(0)

    # Rectangle enclosing the title text of the DocLayNet paper.
    title_box = BoundingBox(l=102, t=77, r=511, b=124)
    extracted = page.get_text_in_rect(bbox=title_box)

    assert extracted.strip() == expected_title
|
||||
|
||||
def test_crop_page_image(test_doc_path):
    """Smoke-test rendering a cropped region of page 0 at scale 2."""
    document = DoclingParseDocumentBackend(test_doc_path)
    page: DoclingParsePageBackend = document.load_page(0)

    # Rectangle enclosing "Figure 1" of the DocLayNet paper.
    figure_box = BoundingBox(l=317, t=246, r=574, b=527)
    cropped = page.get_page_image(scale=2, cropbox=figure_box)
    # cropped.show()  # enable to inspect the crop by eye
|
||||
|
||||
def test_num_pages(test_doc_path):
    """Check that the backend reports the page count of the test document."""
    doc_backend = DoclingParseDocumentBackend(test_doc_path)
    # Must be an assert: a bare comparison is evaluated and discarded, so the
    # test would pass regardless of the reported page count.
    assert doc_backend.page_count() == 9
|
||||
|
||||
33
tests/test_backend_pdfium.py
Normal file
33
tests/test_backend_pdfium.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox
|
||||
|
||||
|
||||
@pytest.fixture
def test_doc_path():
    """Provide the path of the DocLayNet paper PDF used by the backend tests."""
    # Resolve relative to this file rather than the working directory, so the
    # tests also find the PDF when pytest is started outside the tests folder.
    return Path(__file__).resolve().parent / "data" / "2206.01062.pdf"
|
||||
|
||||
def test_get_text_from_rect(test_doc_path):
    """Check the text extracted from the title region of page 0."""
    # This backend returns the title with a CR/LF line break inside it.
    expected_title = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"

    document = PyPdfiumDocumentBackend(test_doc_path)
    page: PyPdfiumPageBackend = document.load_page(0)

    # Rectangle enclosing the title text of the DocLayNet paper.
    title_box = BoundingBox(l=102, t=77, r=511, b=124)
    extracted = page.get_text_in_rect(bbox=title_box)

    assert extracted.strip() == expected_title
|
||||
|
||||
def test_crop_page_image(test_doc_path):
    """Smoke-test rendering a cropped region of page 0 at scale 2."""
    document = PyPdfiumDocumentBackend(test_doc_path)
    page: PyPdfiumPageBackend = document.load_page(0)

    # Rectangle enclosing "Figure 1" of the DocLayNet paper.
    figure_box = BoundingBox(l=317, t=246, r=574, b=527)
    cropped = page.get_page_image(scale=2, cropbox=figure_box)
    # cropped.show()  # enable to inspect the crop by eye
|
||||
|
||||
def test_num_pages(test_doc_path):
    """Check that the backend reports the page count of the test document."""
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    # Must be an assert: a bare comparison is evaluated and discarded, so the
    # test would pass regardless of the reported page count.
    assert doc_backend.page_count() == 9
|
||||
68
tests/test_toplevel_functions.py
Normal file
68
tests/test_toplevel_functions.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import glob
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
# When true, test_conversions writes the predicted JSON/Markdown next to each
# PDF; when false, it loads reference files and asserts against them instead.
GENERATE=True
|
||||
|
||||
def get_pdf_paths(directory=None):
    """Return every PDF below a directory, sorted by path.

    Args:
        directory: Folder to search recursively. Defaults to the ``data``
            folder located next to this test module.

    Returns:
        A sorted list of ``Path`` objects, one per ``*.pdf`` file found in
        the directory or any of its subdirectories.
    """
    if directory is None:
        # Anchor on this file rather than the working directory so the same
        # PDFs are found no matter where pytest is launched from.
        directory = Path(__file__).resolve().parent / "data"

    return sorted(Path(directory).rglob("*.pdf"))
|
||||
|
||||
def verify_json(doc_pred_json, doc_true_json):
    """Placeholder JSON check that accepts any prediction/reference pair."""
    # Neither argument is inspected; the check always succeeds.
    accepted = True
    return accepted
|
||||
|
||||
def verify_md(doc_pred_md, doc_true_md):
    """Return whether the predicted Markdown equals the reference Markdown."""
    matches = doc_pred_md == doc_true_md
    return matches
|
||||
|
||||
def test_conversions():
    """Convert every test PDF and generate or verify its JSON/Markdown output.

    With ``GENERATE`` set, the predictions are written next to each PDF as
    ``<name>.json`` and ``<name>.md``. Otherwise those two files are read
    back as the reference and compared through ``verify_json`` and
    ``verify_md``.
    """
    import json  # local import: the module header does not import json

    pdf_paths = get_pdf_paths()
    # An empty list would make the loop below pass without checking anything.
    assert pdf_paths, "no PDF files found to convert"

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    for path in pdf_paths:
        doc = doc_converter.convert_single(path)

        # NOTE(review): assumes the converted document exposes render_as_dict()
        # next to render_as_markdown(); json.dumps needs plain data -- confirm
        # against the installed docling version.
        doc_pred_json = doc.render_as_dict()
        doc_pred_md = doc.render_as_markdown()

        json_path = path.with_suffix(".json")
        md_path = path.with_suffix(".md")

        if GENERATE:
            with open(json_path, "w", encoding="utf-8") as fw:
                fw.write(json.dumps(doc_pred_json, indent=2))

            with open(md_path, "w", encoding="utf-8") as fw:
                fw.write(doc_pred_md)
        else:
            # Load the stored references, not the PDF itself.
            with open(json_path, "r", encoding="utf-8") as fr:
                doc_true_json = json.load(fr)

            # The Markdown reference is plain text, so it is read verbatim.
            with open(md_path, "r", encoding="utf-8") as fr:
                doc_true_md = fr.read()

            assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"

            assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
|
||||
|
||||
Reference in New Issue
Block a user