From b7debe72508563c7e4e6e9e084ffdde3a1c5f1ef Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Mon, 26 Aug 2024 17:50:39 +0200
Subject: [PATCH] need to start running all tests successfully

Signed-off-by: Peter Staar
---
 tests/test_backend_docling_parse.py | 20 +++++++++----
 tests/test_backend_pdfium.py        | 16 ++++++++--
 tests/test_toplevel_functions.py    | 46 ++++++++++++++++-------------
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py
index 8f42e1ba..dd98157d 100644
--- a/tests/test_backend_docling_parse.py
+++ b/tests/test_backend_docling_parse.py
@@ -1,37 +1,45 @@
 import glob
-
 from pathlib import Path
 
 import pytest
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
+from docling.backend.docling_parse_backend import (
+    DoclingParseDocumentBackend,
+    DoclingParsePageBackend,
+)
 from docling.datamodel.base_models import BoundingBox
-
 from docling.document_converter import DocumentConverter
 
+
 @pytest.fixture
 def test_doc_path():
     return Path("./data/2206.01062.pdf")
 
+
 def test_get_text_from_rect(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
-    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
     ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
 
     assert textpiece.strip() == ref
 
+
 def test_crop_page_image(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
+    im = page_backend.get_page_image(
+        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
+    )
     # im.show()
 
+
 def test_num_pages(test_doc_path):
     doc_backend = DoclingParseDocumentBackend(test_doc_path)
     doc_backend.page_count() == 9
-
diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py
index 2ba239c1..4db7a376 100644
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -2,7 +2,10 @@ from pathlib import Path
 
 import pytest
 
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
+from docling.backend.pypdfium2_backend import (
+    PyPdfiumDocumentBackend,
+    PyPdfiumPageBackend,
+)
 from docling.datamodel.base_models import BoundingBox
 
 
@@ -10,24 +13,31 @@ from docling.datamodel.base_models import BoundingBox
 def test_doc_path():
     return Path("./data/2206.01062.pdf")
 
+
 def test_get_text_from_rect(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
-    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
     ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
 
     assert textpiece.strip() == ref
 
+
 def test_crop_page_image(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
+    im = page_backend.get_page_image(
+        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
+    )
     # im.show()
 
+
 def test_num_pages(test_doc_path):
     doc_backend = PyPdfiumDocumentBackend(test_doc_path)
     doc_backend.page_count() == 9
diff --git a/tests/test_toplevel_functions.py b/tests/test_toplevel_functions.py
index 5d1059da..5638d884 100644
--- a/tests/test_toplevel_functions.py
+++ b/tests/test_toplevel_functions.py
@@ -1,33 +1,36 @@
 import glob
-
 from pathlib import Path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.document_converter import DocumentConverter
 
-GENERATE=True
+GENERATE = True
 
+
 def get_pdf_paths():
     # Define the directory you want to search
-    directory = Path('./tests/data')
+    directory = Path("./tests/data")
 
     # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob('*.pdf'))
+    pdf_files = sorted(directory.rglob("*.pdf"))
 
     return pdf_files
 
+
 def verify_json(doc_pred_json, doc_true_json):
     return True
 
+
 def verify_md(doc_pred_md, doc_true_md):
-    return (doc_pred_md==doc_true_md)
-    
+    return doc_pred_md == doc_true_md
+
+
 def test_conversions():
-    
+
     pdf_paths = get_pdf_paths()
     print(f"#-documents: {pdf_paths}")
-    
+
     pipeline_options = PipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -37,39 +40,42 @@ def test_conversions():
         pipeline_options=pipeline_options,
         pdf_backend=DoclingParseDocumentBackend,
     )
-    
+
     for path in pdf_paths:
 
-        doc_pred_json=None
-        
+        doc_pred_json = None
+
         try:
             print(f"converting {path}")
-            doc_pred_json = converter.convert_single(path)            
+            doc_pred_json = converter.convert_single(path)
         except:
             continue
-        
+
         doc_pred_md = doc_pred_json.render_as_markdown()
 
         json_path = path.with_suffix(".json")
         md_path = path.with_suffix(".md")
-        
+
         if GENERATE:
-            
+
            with open(json_path, "w") as fw:
                fw.write(json.dumps(doc_pred_json, indent=2))

            with open(md_path, "w") as fw:
                fw.write(doc_pred_md)
-                
+
        else:
            with open(path, "r") as fr:
                doc_true_json = json.load(fr)

            with open(path, "r") as fr:
-                doc_true_md = json.load(fr)            
+                doc_true_md = json.load(fr)

-            assert verify_json(doc_pred_json, doc_true_json), f"failed json prediction for {path}"
+            assert verify_json(
+                doc_pred_json, doc_true_json
+            ), f"failed json prediction for {path}"

-            assert verify_md(doc_pred_md, doc_true_md), f"failed md prediction for {path}"
-    
+            assert verify_md(
+                doc_pred_md, doc_true_md
+            ), f"failed md prediction for {path}"
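
Note (not part of the patch): the reworked test_conversions() in tests/test_toplevel_functions.py calls json.dumps() and json.load() without importing json, and its comparison branch reopens path (the input PDF) for both ground-truth files and parses the Markdown file with json.load(). Below is a minimal sketch of how that comparison branch could read back the files written by the GENERATE branch. The helper name load_ground_truth is illustrative only; verify_json and verify_md are the stub functions defined in the patch, and the ".json"/".md" naming scheme is the one the GENERATE branch uses.

    import json
    from pathlib import Path

    def load_ground_truth(pdf_path: Path):
        # Assumption: ground-truth files sit next to the PDF, using the same
        # naming scheme the GENERATE branch writes (<name>.json / <name>.md).
        json_path = pdf_path.with_suffix(".json")
        md_path = pdf_path.with_suffix(".md")

        with open(json_path, "r") as fr:
            doc_true_json = json.load(fr)  # JSON ground truth is parsed
        with open(md_path, "r") as fr:
            doc_true_md = fr.read()  # Markdown ground truth is plain text

        return doc_true_json, doc_true_md

The differences from the hunk above are the explicit "import json", opening json_path and md_path instead of path, and reading the Markdown with fr.read() rather than json.load().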