Point to the actual locations of the test data

2025-07-26 20:14:47 +00:00 · 2025-05-09 16:39:31 +02:00 · 2025-05-09 16:39:31 +02:00 · 46d6cf078e
commit 46d6cf078e
parent 3220a592e7
26 changed files with 56 additions and 56 deletions
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -122,10 +122,10 @@ def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
-        Path("./tests/data/pdf/2206.01062.pdf"),
-        Path("./tests/data/pdf/2203.01017v2.pdf"),
-        Path("./tests/data/pdf/2305.03393v1.pdf"),
-        Path("./tests/data/pdf/redp5110_sampled.pdf"),
+        Path("../../tests/data/pdf/2206.01062.pdf"),
+        Path("../../tests/data/pdf/2203.01017v2.pdf"),
+        Path("../../tests/data/pdf/2305.03393v1.pdf"),
+        Path("../../tests/data/pdf/redp5110_sampled.pdf"),
    ]

    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -17,7 +17,7 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")

    ###########################################################################

--- a/docs/examples/develop_formula_understanding.py
+++ b/docs/examples/develop_formula_understanding.py
@@ -71,7 +71,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2203.01017v2.pdf")

    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
    pipeline_options.do_formula_understanding = True
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@@ -76,7 +76,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.images_scale = 2.0
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc = Path("../../tests/data/pdf/2206.01062.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@@ -67,7 +67,7 @@ def watsonx_vlm_options():
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True  # <-- this is required!
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@@ -11,7 +11,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc = Path("../../tests/data/pdf/2206.01062.pdf")

    # Explicitly set the accelerator
    # accelerator_options = AcceleratorOptions(
--- a/docs/examples/tesseract_lang_detection.py
+++ b/docs/examples/tesseract_lang_detection.py
@@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc = Path("../../tests/data/pdf/2206.01062.pdf")

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@@ -31,7 +31,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -69,8 +69,8 @@ def watsonx_vlm_options(model: str, prompt: str):
 def main():
    logging.basicConfig(level=logging.INFO)

-    # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
-    input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    # input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
+    input_doc_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")

    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # <-- this is required!
--- a/tests/test_backend_docling_parse_v4.py
+++ b/tests/test_backend_docling_parse_v4.py
@@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument

@pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/pdf/2206.01062.pdf")
+    return Path("../../tests/data/pdf/2206.01062.pdf")


 def _get_backend(pdf_doc):
@@ -27,7 +27,7 @@ def _get_backend(pdf_doc):


 def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
+    pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")

    doc_backend = _get_backend(pdf_doc)

--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -106,7 +106,7 @@ def test_ordered_lists():

 def get_html_paths():
    # Define the directory you want to search
-    directory = Path("./tests/data/html/")
+    directory = Path("../../tests/data/html/")

    # List all HTML files in the directory and its subdirectories
    html_files = sorted(directory.rglob("*.html"))
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -18,7 +18,7 @@ GENERATE = GEN_TEST_DATA

 def get_xlsx_paths():
    # Define the directory you want to search
-    directory = Path("./tests/data/xlsx/")
+    directory = Path("../../tests/data/xlsx/")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.xlsx"))
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -43,7 +43,7 @@ def test_heading_levels():

 def get_docx_paths():
    # Define the directory you want to search
-    directory = Path("./tests/data/docx/")
+    directory = Path("../../tests/data/docx/")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.docx"))
--- a/tests/test_backend_patent_uspto.py
+++ b/tests/test_backend_patent_uspto.py
@@ -17,8 +17,8 @@ from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document

 GENERATE: bool = GEN_TEST_DATA
-DATA_PATH: Path = Path("./tests/data/uspto/")
-GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
+DATA_PATH: Path = Path("../../tests/data/uspto/")
+GT_PATH: Path = Path("../../tests/data/groundtruth/docling_v2/")


 def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument

@pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/pdf/2206.01062.pdf")
+    return Path("../../tests/data/pdf/2206.01062.pdf")


 def _get_backend(pdf_doc):
@@ -28,7 +28,7 @@ def _get_backend(pdf_doc):


 def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
+    pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")

    doc_backend = _get_backend(pdf_doc)

--- a/tests/test_backend_pptx.py
+++ b/tests/test_backend_pptx.py
@@ -12,7 +12,7 @@ GENERATE = GEN_TEST_DATA

 def get_pptx_paths():
    # Define the directory you want to search
-    directory = Path("./tests/data/pptx/")
+    directory = Path("../../tests/data/pptx/")

    # List all PPTX files in the directory and its subdirectories
    pptx_files = sorted(directory.rglob("*.pptx"))
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -18,7 +18,7 @@ def test_cli_version():


 def test_cli_convert(tmp_path):
-    source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
+    source = "../../tests/data/pdf/2305.03393v1-pg9.pdf"
    output = tmp_path / "out"
    output.mkdir()
    result = runner.invoke(app, [source, "--output", str(output)])
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@@ -15,7 +15,7 @@ GENERATE_V2 = GEN_TEST_DATA

 def get_pdf_paths():
    # Define the directory you want to search
-    directory = Path("./tests/data/pdf/")
+    directory = Path("../../tests/data/pdf/")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -12,7 +12,7 @@ from docling.document_converter import PdfFormatOption


 def test_in_doc_from_valid_path():
-    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    assert doc.valid is True

@@ -26,7 +26,7 @@ def test_in_doc_from_invalid_path():


 def test_in_doc_from_valid_buf():
-    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
@@ -72,7 +72,7 @@ def test_image_in_pdf_backend():


 def test_in_doc_with_page_range():
-    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
    limits.page_range = (1, 10)

@@ -112,72 +112,72 @@ def test_guess_format(tmp_path):
    temp_dir.mkdir()

    # Valid PDF
-    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    assert dci._guess_format(stream) == InputFormat.PDF
-    doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
    assert dci._guess_format(doc_path) == InputFormat.PDF

    # Valid MS Office
-    buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/docx/lorem_ipsum.docx").open("rb").read())
    stream = DocumentStream(name="lorem_ipsum.docx", stream=buf)
    assert dci._guess_format(stream) == InputFormat.DOCX
-    doc_path = Path("./tests/data/docx/lorem_ipsum.docx")
+    doc_path = Path("../../tests/data/docx/lorem_ipsum.docx")
    assert dci._guess_format(doc_path) == InputFormat.DOCX

    # Valid HTML
-    buf = BytesIO(Path("./tests/data/html/wiki_duck.html").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/html/wiki_duck.html").open("rb").read())
    stream = DocumentStream(name="wiki_duck.html", stream=buf)
    assert dci._guess_format(stream) == InputFormat.HTML
-    doc_path = Path("./tests/data/html/wiki_duck.html")
+    doc_path = Path("../../tests/data/html/wiki_duck.html")
    assert dci._guess_format(doc_path) == InputFormat.HTML

    # Valid MD
-    buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/md/wiki.md").open("rb").read())
    stream = DocumentStream(name="wiki.md", stream=buf)
    assert dci._guess_format(stream) == InputFormat.MD
-    doc_path = Path("./tests/data/md/wiki.md")
+    doc_path = Path("../../tests/data/md/wiki.md")
    assert dci._guess_format(doc_path) == InputFormat.MD

    # Valid CSV
-    buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/csv/csv-comma.csv").open("rb").read())
    stream = DocumentStream(name="csv-comma.csv", stream=buf)
    assert dci._guess_format(stream) == InputFormat.CSV
    stream = DocumentStream(name="test-comma", stream=buf)
    assert dci._guess_format(stream) == InputFormat.CSV
-    doc_path = Path("./tests/data/csv/csv-comma.csv")
+    doc_path = Path("../../tests/data/csv/csv-comma.csv")
    assert dci._guess_format(doc_path) == InputFormat.CSV

    # Valid XML USPTO patent
-    buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/uspto/ipa20110039701.xml").open("rb").read())
    stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_USPTO
-    doc_path = Path("./tests/data/uspto/ipa20110039701.xml")
+    doc_path = Path("../../tests/data/uspto/ipa20110039701.xml")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO

-    buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/uspto/pftaps057006474.txt").open("rb").read())
    stream = DocumentStream(name="pftaps057006474.txt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_USPTO
-    doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
+    doc_path = Path("../../tests/data/uspto/pftaps057006474.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO

    # Valid XML JATS
-    buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/jats/elife-56337.xml").open("rb").read())
    stream = DocumentStream(name="elife-56337.xml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/jats/elife-56337.xml")
+    doc_path = Path("../../tests/data/jats/elife-56337.xml")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

-    buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/jats/elife-56337.nxml").open("rb").read())
    stream = DocumentStream(name="elife-56337.nxml", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/jats/elife-56337.nxml")
+    doc_path = Path("../../tests/data/jats/elife-56337.nxml")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

-    buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
+    buf = BytesIO(Path("../../tests/data/jats/elife-56337.txt").open("rb").read())
    stream = DocumentStream(name="elife-56337.txt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/jats/elife-56337.txt")
+    doc_path = Path("../../tests/data/jats/elife-56337.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

    # Valid XML, non-supported flavor
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA


 def get_pdf_path():
-    pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
    return pdf_path


--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@@ -8,7 +8,7 @@ from docling.document_converter import ConversionError, DocumentConverter


 def get_pdf_path():
-    pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
    return pdf_path


--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -20,7 +20,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

@pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/pdf/2206.01062.pdf")
+    return Path("../../tests/data/pdf/2206.01062.pdf")


 def get_converters_with_table_options():
@@ -159,7 +159,7 @@ def test_parser_backends(test_doc_path):
            }
        )

-        test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf")
+        test_doc_path = Path("../../tests/data/pdf/code_and_formula.pdf")
        doc_result: ConversionResult = converter.convert(test_doc_path)

        assert doc_result.status == ConversionStatus.SUCCESS