mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
need to start running all tests successfully
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
2c66075390
commit
b7debe7250
@ -1,37 +1,45 @@
|
|||||||
import glob
|
import glob
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
|
from docling.backend.docling_parse_backend import (
|
||||||
|
DoclingParseDocumentBackend,
|
||||||
|
DoclingParsePageBackend,
|
||||||
|
)
|
||||||
from docling.datamodel.base_models import BoundingBox
|
from docling.datamodel.base_models import BoundingBox
|
||||||
|
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./data/2206.01062.pdf")
|
return Path("./data/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def test_get_text_from_rect(test_doc_path):
|
def test_get_text_from_rect(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Get the title text of the DocLayNet paper
|
# Get the title text of the DocLayNet paper
|
||||||
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
|
textpiece = page_backend.get_text_in_rect(
|
||||||
|
bbox=BoundingBox(l=102, t=77, r=511, b=124)
|
||||||
|
)
|
||||||
ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
|
ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
|
||||||
|
|
||||||
assert textpiece.strip() == ref
|
assert textpiece.strip() == ref
|
||||||
|
|
||||||
|
|
||||||
def test_crop_page_image(test_doc_path):
|
def test_crop_page_image(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
|
im = page_backend.get_page_image(
|
||||||
|
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
||||||
|
)
|
||||||
# im.show()
|
# im.show()
|
||||||
|
|
||||||
|
|
||||||
def test_num_pages(test_doc_path):
|
def test_num_pages(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
doc_backend = DoclingParseDocumentBackend(test_doc_path)
|
||||||
doc_backend.page_count() == 9
|
doc_backend.page_count() == 9
|
||||||
|
|
||||||
|
@ -2,7 +2,10 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
|
from docling.backend.pypdfium2_backend import (
|
||||||
|
PyPdfiumDocumentBackend,
|
||||||
|
PyPdfiumPageBackend,
|
||||||
|
)
|
||||||
from docling.datamodel.base_models import BoundingBox
|
from docling.datamodel.base_models import BoundingBox
|
||||||
|
|
||||||
|
|
||||||
@ -10,24 +13,31 @@ from docling.datamodel.base_models import BoundingBox
|
|||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./data/2206.01062.pdf")
|
return Path("./data/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def test_get_text_from_rect(test_doc_path):
|
def test_get_text_from_rect(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
||||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Get the title text of the DocLayNet paper
|
# Get the title text of the DocLayNet paper
|
||||||
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
|
textpiece = page_backend.get_text_in_rect(
|
||||||
|
bbox=BoundingBox(l=102, t=77, r=511, b=124)
|
||||||
|
)
|
||||||
ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
|
ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
|
||||||
|
|
||||||
assert textpiece.strip() == ref
|
assert textpiece.strip() == ref
|
||||||
|
|
||||||
|
|
||||||
def test_crop_page_image(test_doc_path):
|
def test_crop_page_image(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
||||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
|
im = page_backend.get_page_image(
|
||||||
|
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
||||||
|
)
|
||||||
# im.show()
|
# im.show()
|
||||||
|
|
||||||
|
|
||||||
def test_num_pages(test_doc_path):
|
def test_num_pages(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
|
||||||
doc_backend.page_count() == 9
|
doc_backend.page_count() == 9
|
||||||
|
@ -1,27 +1,30 @@
|
|||||||
import glob
|
import glob
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE=True
|
GENERATE = True
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
|
|
||||||
# Define the directory you want to search
|
# Define the directory you want to search
|
||||||
directory = Path('./tests/data')
|
directory = Path("./tests/data")
|
||||||
|
|
||||||
# List all PDF files in the directory and its subdirectories
|
# List all PDF files in the directory and its subdirectories
|
||||||
pdf_files = sorted(directory.rglob('*.pdf'))
|
pdf_files = sorted(directory.rglob("*.pdf"))
|
||||||
return pdf_files
|
return pdf_files
|
||||||
|
|
||||||
|
|
||||||
def verify_json(doc_pred_json, doc_true_json):
|
def verify_json(doc_pred_json, doc_true_json):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def verify_md(doc_pred_md, doc_true_md):
|
def verify_md(doc_pred_md, doc_true_md):
|
||||||
return (doc_pred_md==doc_true_md)
|
return doc_pred_md == doc_true_md
|
||||||
|
|
||||||
|
|
||||||
def test_conversions():
|
def test_conversions():
|
||||||
|
|
||||||
@ -40,7 +43,7 @@ def test_conversions():
|
|||||||
|
|
||||||
for path in pdf_paths:
|
for path in pdf_paths:
|
||||||
|
|
||||||
doc_pred_json=None
|
doc_pred_json = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"converting {path}")
|
print(f"converting {path}")
|
||||||
@ -69,7 +72,10 @@ def test_conversions():
|
|||||||
with open(path, "r") as fr:
|
with open(path, "r") as fr:
|
||||||
doc_true_md = json.load(fr)
|
doc_true_md = json.load(fr)
|
||||||
|
|
||||||
assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
|
assert verify_json(
|
||||||
|
doc_pred_json, doc_true_json
|
||||||
assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
|
), f"failed json prediction for {path}"
|
||||||
|
|
||||||
|
assert verify_md(
|
||||||
|
doc_pred_md, doc_true_md
|
||||||
|
), f"failed md prediction for {path}"
|
||||||
|
Loading…
Reference in New Issue
Block a user