Need to get all tests running successfully

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-08-26 17:50:39 +02:00
parent 2c66075390
commit b7debe7250
3 changed files with 53 additions and 29 deletions

View File

@ -1,37 +1,45 @@
import glob import glob
from pathlib import Path from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend from docling.backend.docling_parse_backend import (
DoclingParseDocumentBackend,
DoclingParsePageBackend,
)
from docling.datamodel.base_models import BoundingBox from docling.datamodel.base_models import BoundingBox
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("./data/2206.01062.pdf") return Path("./data/2206.01062.pdf")
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path) doc_backend = DoclingParseDocumentBackend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper # Get the title text of the DocLayNet paper
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124)) textpiece = page_backend.get_text_in_rect(
bbox=BoundingBox(l=102, t=77, r=511, b=124)
)
ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
assert textpiece.strip() == ref assert textpiece.strip() == ref
def test_crop_page_image(test_doc_path): def test_crop_page_image(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path) doc_backend = DoclingParseDocumentBackend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527)) im = page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show() # im.show()
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path) doc_backend = DoclingParseDocumentBackend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9

View File

@ -2,7 +2,10 @@ from pathlib import Path
import pytest import pytest
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
PyPdfiumPageBackend,
)
from docling.datamodel.base_models import BoundingBox from docling.datamodel.base_models import BoundingBox
@ -10,24 +13,31 @@ from docling.datamodel.base_models import BoundingBox
def test_doc_path(): def test_doc_path():
return Path("./data/2206.01062.pdf") return Path("./data/2206.01062.pdf")
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path) doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper # Get the title text of the DocLayNet paper
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124)) textpiece = page_backend.get_text_in_rect(
bbox=BoundingBox(l=102, t=77, r=511, b=124)
)
ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis" ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
assert textpiece.strip() == ref assert textpiece.strip() == ref
def test_crop_page_image(test_doc_path): def test_crop_page_image(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path) doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527)) im = page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show() # im.show()
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path) doc_backend = PyPdfiumDocumentBackend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9

View File

@ -1,5 +1,4 @@
import glob import glob
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@ -8,20 +7,24 @@ from docling.document_converter import DocumentConverter
GENERATE = True GENERATE = True
def get_pdf_paths(): def get_pdf_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path('./tests/data') directory = Path("./tests/data")
# List all PDF files in the directory and its subdirectories # List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob('*.pdf')) pdf_files = sorted(directory.rglob("*.pdf"))
return pdf_files return pdf_files
def verify_json(doc_pred_json, doc_true_json): def verify_json(doc_pred_json, doc_true_json):
return True return True
def verify_md(doc_pred_md, doc_true_md): def verify_md(doc_pred_md, doc_true_md):
return (doc_pred_md==doc_true_md) return doc_pred_md == doc_true_md
def test_conversions(): def test_conversions():
@ -69,7 +72,10 @@ def test_conversions():
with open(path, "r") as fr: with open(path, "r") as fr:
doc_true_md = json.load(fr) doc_true_md = json.load(fr)
assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}" assert verify_json(
doc_pred_json, doc_true_json
assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}" ), f"failed json prediction for {path}"
assert verify_md(
doc_pred_md, doc_true_md
), f"failed md prediction for {path}"