Point to the actual locations of the test data

This commit is contained in:
Matthias Günter 2025-05-09 16:39:31 +02:00
parent 3220a592e7
commit 46d6cf078e
26 changed files with 56 additions and 56 deletions

View File

@ -122,10 +122,10 @@ def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
Path("../../tests/data/pdf/2206.01062.pdf"),
Path("../../tests/data/pdf/2203.01017v2.pdf"),
Path("../../tests/data/pdf/2305.03393v1.pdf"),
Path("../../tests/data/pdf/redp5110_sampled.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

View File

@ -17,7 +17,7 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
###########################################################################

View File

@ -71,7 +71,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
input_doc_path = Path("../../tests/data/pdf/2203.01017v2.pdf")
pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True

View File

@ -76,7 +76,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
doc_converter = DocumentConverter()

View File

@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True

View File

@ -67,7 +67,7 @@ def watsonx_vlm_options():
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions(
enable_remote_services=True # <-- this is required!

View File

@ -11,7 +11,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(

View File

@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"])

View File

@ -31,7 +31,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -69,8 +69,8 @@ def watsonx_vlm_options(model: str, prompt: str):
def main():
logging.basicConfig(level=logging.INFO)
# input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
# input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
pipeline_options = VlmPipelineOptions(
enable_remote_services=True # <-- this is required!

View File

@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture
def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf")
return Path("../../tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc):
@ -27,7 +27,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc)

View File

@ -106,7 +106,7 @@ def test_ordered_lists():
def get_html_paths():
# Define the directory you want to search
directory = Path("./tests/data/html/")
directory = Path("../../tests/data/html/")
# List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html"))

View File

@ -18,7 +18,7 @@ GENERATE = GEN_TEST_DATA
def get_xlsx_paths():
# Define the directory you want to search
directory = Path("./tests/data/xlsx/")
directory = Path("../../tests/data/xlsx/")
# List all XLSX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.xlsx"))

View File

@ -43,7 +43,7 @@ def test_heading_levels():
def get_docx_paths():
# Define the directory you want to search
directory = Path("./tests/data/docx/")
directory = Path("../../tests/data/docx/")
# List all DOCX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.docx"))

View File

@ -17,8 +17,8 @@ from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document
GENERATE: bool = GEN_TEST_DATA
DATA_PATH: Path = Path("./tests/data/uspto/")
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
DATA_PATH: Path = Path("../../tests/data/uspto/")
GT_PATH: Path = Path("../../tests/data/groundtruth/docling_v2/")
def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:

View File

@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture
def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf")
return Path("../../tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc):
@ -28,7 +28,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc)

View File

@ -12,7 +12,7 @@ GENERATE = GEN_TEST_DATA
def get_pptx_paths():
# Define the directory you want to search
directory = Path("./tests/data/pptx/")
directory = Path("../../tests/data/pptx/")
# List all PPTX files in the directory and its subdirectories
pptx_files = sorted(directory.rglob("*.pptx"))

View File

@ -18,7 +18,7 @@ def test_cli_version():
def test_cli_convert(tmp_path):
source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
source = "../../tests/data/pdf/2305.03393v1-pg9.pdf"
output = tmp_path / "out"
output.mkdir()
result = runner.invoke(app, [source, "--output", str(output)])

View File

@ -15,7 +15,7 @@ GENERATE_V2 = GEN_TEST_DATA
def get_pdf_paths():
# Define the directory you want to search
directory = Path("./tests/data/pdf/")
directory = Path("../../tests/data/pdf/")
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf"))

View File

@ -12,7 +12,7 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path)
assert doc.valid is True
@ -26,7 +26,7 @@ def test_in_doc_from_invalid_path():
def test_in_doc_from_valid_buf():
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream)
@ -72,7 +72,7 @@ def test_image_in_pdf_backend():
def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits()
limits.page_range = (1, 10)
@ -112,72 +112,72 @@ def test_guess_format(tmp_path):
temp_dir.mkdir()
# Valid PDF
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf)
assert dci._guess_format(stream) == InputFormat.PDF
doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
assert dci._guess_format(doc_path) == InputFormat.PDF
# Valid MS Office
buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").open("rb").read())
buf = BytesIO(Path("../../tests/data/docx/lorem_ipsum.docx").open("rb").read())
stream = DocumentStream(name="lorem_ipsum.docx", stream=buf)
assert dci._guess_format(stream) == InputFormat.DOCX
doc_path = Path("./tests/data/docx/lorem_ipsum.docx")
doc_path = Path("../../tests/data/docx/lorem_ipsum.docx")
assert dci._guess_format(doc_path) == InputFormat.DOCX
# Valid HTML
buf = BytesIO(Path("./tests/data/html/wiki_duck.html").open("rb").read())
buf = BytesIO(Path("../../tests/data/html/wiki_duck.html").open("rb").read())
stream = DocumentStream(name="wiki_duck.html", stream=buf)
assert dci._guess_format(stream) == InputFormat.HTML
doc_path = Path("./tests/data/html/wiki_duck.html")
doc_path = Path("../../tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML
# Valid MD
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
buf = BytesIO(Path("../../tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf)
assert dci._guess_format(stream) == InputFormat.MD
doc_path = Path("./tests/data/md/wiki.md")
doc_path = Path("../../tests/data/md/wiki.md")
assert dci._guess_format(doc_path) == InputFormat.MD
# Valid CSV
buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read())
buf = BytesIO(Path("../../tests/data/csv/csv-comma.csv").open("rb").read())
stream = DocumentStream(name="csv-comma.csv", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV
stream = DocumentStream(name="test-comma", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV
doc_path = Path("./tests/data/csv/csv-comma.csv")
doc_path = Path("../../tests/data/csv/csv-comma.csv")
assert dci._guess_format(doc_path) == InputFormat.CSV
# Valid XML USPTO patent
buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
buf = BytesIO(Path("../../tests/data/uspto/ipa20110039701.xml").open("rb").read())
stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("./tests/data/uspto/ipa20110039701.xml")
doc_path = Path("../../tests/data/uspto/ipa20110039701.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").open("rb").read())
buf = BytesIO(Path("../../tests/data/uspto/pftaps057006474.txt").open("rb").read())
stream = DocumentStream(name="pftaps057006474.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
doc_path = Path("../../tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
buf = BytesIO(Path("../../tests/data/jats/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.xml")
doc_path = Path("../../tests/data/jats/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
buf = BytesIO(Path("../../tests/data/jats/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.nxml")
doc_path = Path("../../tests/data/jats/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
buf = BytesIO(Path("../../tests/data/jats/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.txt")
doc_path = Path("../../tests/data/jats/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
# Valid XML, non-supported flavor

View File

@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
def get_pdf_path():
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path

View File

@ -8,7 +8,7 @@ from docling.document_converter import ConversionError, DocumentConverter
def get_pdf_path():
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path

View File

@ -20,7 +20,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture
def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf")
return Path("../../tests/data/pdf/2206.01062.pdf")
def get_converters_with_table_options():
@ -159,7 +159,7 @@ def test_parser_backends(test_doc_path):
}
)
test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf")
test_doc_path = Path("../../tests/data/pdf/code_and_formula.pdf")
doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS