From 46d6cf078e9c82ffe1831daf9acf06081b0703ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20G=C3=BCnter?= Date: Fri, 9 May 2025 16:39:31 +0200 Subject: [PATCH] point towards the real places of the test data --- docs/examples/batch_convert.py | 8 ++-- docs/examples/custom_convert.py | 2 +- .../examples/develop_formula_understanding.py | 2 +- docs/examples/develop_picture_enrichment.py | 2 +- docs/examples/export_figures.py | 2 +- docs/examples/export_multimodal.py | 2 +- docs/examples/export_tables.py | 2 +- docs/examples/full_page_ocr.py | 2 +- docs/examples/pictures_description_api.py | 2 +- docs/examples/run_with_accelerator.py | 2 +- docs/examples/tesseract_lang_detection.py | 2 +- docs/examples/translate.py | 2 +- docs/examples/vlm_pipeline_api_model.py | 4 +- tests/test_backend_docling_parse_v4.py | 4 +- tests/test_backend_html.py | 2 +- tests/test_backend_msexcel.py | 2 +- tests/test_backend_msword.py | 2 +- tests/test_backend_patent_uspto.py | 4 +- tests/test_backend_pdfium.py | 4 +- tests/test_backend_pptx.py | 2 +- tests/test_cli.py | 2 +- tests/test_e2e_conversion.py | 2 +- tests/test_input_doc.py | 46 +++++++++---------- tests/test_interfaces.py | 2 +- tests/test_invalid_input.py | 2 +- tests/test_options.py | 4 +- 26 files changed, 56 insertions(+), 56 deletions(-) diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index 25eb2bac..e62a7bff 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -122,10 +122,10 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - Path("./tests/data/pdf/2206.01062.pdf"), - Path("./tests/data/pdf/2203.01017v2.pdf"), - Path("./tests/data/pdf/2305.03393v1.pdf"), - Path("./tests/data/pdf/redp5110_sampled.pdf"), + Path("../../tests/data/pdf/2206.01062.pdf"), + Path("../../tests/data/pdf/2203.01017v2.pdf"), + Path("../../tests/data/pdf/2305.03393v1.pdf"), + Path("../../tests/data/pdf/redp5110_sampled.pdf"), ] # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 3b8ae6df..46dc60c9 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -17,7 +17,7 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") ########################################################################### diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index beb1575a..58919443 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -71,7 +71,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf") + input_doc_path = Path("../../tests/data/pdf/2203.01017v2.pdf") pipeline_options = ExampleFormulaUnderstandingPipelineOptions() pipeline_options.do_formula_understanding = True diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py index 9e3d3067..c7bd390d 100644 --- a/docs/examples/develop_picture_enrichment.py +++ b/docs/examples/develop_picture_enrichment.py @@ -76,7 +76,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") pipeline_options = ExamplePictureClassifierPipelineOptions() pipeline_options.images_scale = 2.0 diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py index 8ed14a70..03aa57e5 100644 --- a/docs/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index bef74bfa..fcccc584 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 9a911d84..43e19a0a 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -12,7 +12,7 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") output_dir = Path("scratch") doc_converter = DocumentConverter() diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index 5525e87e..98cf9452 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + input_doc = Path("../../tests/data/pdf/2206.01062.pdf") pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py index 8e105d24..f1226ad0 100644 --- a/docs/examples/pictures_description_api.py +++ b/docs/examples/pictures_description_api.py @@ -67,7 +67,7 @@ def watsonx_vlm_options(): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") pipeline_options = PdfPipelineOptions( enable_remote_services=True # <-- this is required! diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py index a5380740..73c37056 100644 --- a/docs/examples/run_with_accelerator.py +++ b/docs/examples/run_with_accelerator.py @@ -11,7 +11,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + input_doc = Path("../../tests/data/pdf/2206.01062.pdf") # Explicitly set the accelerator # accelerator_options = AcceleratorOptions( diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py index 37859b97..18a40f7a 100644 --- a/docs/examples/tesseract_lang_detection.py +++ b/docs/examples/tesseract_lang_detection.py @@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + input_doc = Path("../../tests/data/pdf/2206.01062.pdf") # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions # ocr_options = TesseractOcrOptions(lang=["auto"]) diff --git a/docs/examples/translate.py b/docs/examples/translate.py index 229d5451..62473699 100644 --- a/docs/examples/translate.py +++ b/docs/examples/translate.py @@ -31,7 +31,7 @@ def translate(text: str, src: str = "en", dest: str = "de"): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 504cecc5..263627b1 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -69,8 +69,8 @@ def watsonx_vlm_options(model: str, prompt: str): def main(): logging.basicConfig(level=logging.INFO) - # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") - input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") + # input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf") pipeline_options = VlmPipelineOptions( enable_remote_services=True # <-- this is required! diff --git a/tests/test_backend_docling_parse_v4.py b/tests/test_backend_docling_parse_v4.py index 35c4eab7..957261d3 100644 --- a/tests/test_backend_docling_parse_v4.py +++ b/tests/test_backend_docling_parse_v4.py @@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/pdf/2206.01062.pdf") + return Path("../../tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -27,7 +27,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") + pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 18254a78..a10a3400 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -106,7 +106,7 @@ def test_ordered_lists(): def get_html_paths(): # Define the directory you want to search - directory = Path("./tests/data/html/") + directory = Path("../../tests/data/html/") # List all HTML files in the directory and its subdirectories html_files = sorted(directory.rglob("*.html")) diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 65f636e0..807904c7 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -18,7 +18,7 @@ GENERATE = GEN_TEST_DATA def get_xlsx_paths(): # Define the directory you want to search - directory = Path("./tests/data/xlsx/") + directory = Path("../../tests/data/xlsx/") # List all PDF files in the directory and its subdirectories pdf_files = sorted(directory.rglob("*.xlsx")) diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index c50e0718..e9e5379b 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -43,7 +43,7 @@ def test_heading_levels(): def get_docx_paths(): # Define the directory you want to search - directory = Path("./tests/data/docx/") + directory = Path("../../tests/data/docx/") # List all PDF files in the directory and its subdirectories pdf_files = sorted(directory.rglob("*.docx")) diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py index ace6d3a2..3bc0a846 100644 --- a/tests/test_backend_patent_uspto.py +++ b/tests/test_backend_patent_uspto.py @@ -17,8 +17,8 @@ from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document GENERATE: bool = GEN_TEST_DATA -DATA_PATH: Path = Path("./tests/data/uspto/") -GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/") +DATA_PATH: Path = Path("../../tests/data/uspto/") +GT_PATH: Path = Path("../../tests/data/groundtruth/docling_v2/") def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None: diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index 317cdeed..7505f171 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/pdf/2206.01062.pdf") + return Path("../../tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -28,7 +28,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") + pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 4f73c870..c6f7d74a 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -12,7 +12,7 @@ GENERATE = GEN_TEST_DATA def get_pptx_paths(): # Define the directory you want to search - directory = Path("./tests/data/pptx/") + directory = Path("../../tests/data/pptx/") # List all PPTX files in the directory and its subdirectories pptx_files = sorted(directory.rglob("*.pptx")) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4364df8b..daf5c4e5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,7 +18,7 @@ def test_cli_version(): def test_cli_convert(tmp_path): - source = "./tests/data/pdf/2305.03393v1-pg9.pdf" + source = "../../tests/data/pdf/2305.03393v1-pg9.pdf" output = tmp_path / "out" output.mkdir() result = runner.invoke(app, [source, "--output", str(output)]) diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 5dc2e89a..70c6542d 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -15,7 +15,7 @@ GENERATE_V2 = GEN_TEST_DATA def get_pdf_paths(): # Define the directory you want to search - directory = Path("./tests/data/pdf/") + directory = Path("../../tests/data/pdf/") # List all PDF files in the directory and its subdirectories pdf_files = sorted(directory.rglob("*.pdf")) diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 94a68873..6a575a48 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -12,7 +12,7 @@ from docling.document_converter import PdfFormatOption def test_in_doc_from_valid_path(): - test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") doc = _make_input_doc(test_doc_path) assert doc.valid is True @@ -26,7 +26,7 @@ def test_in_doc_from_invalid_path(): def test_in_doc_from_valid_buf(): - buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) doc = _make_input_doc_from_stream(stream) @@ -72,7 +72,7 @@ def test_image_in_pdf_backend(): def test_in_doc_with_page_range(): - test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() limits.page_range = (1, 10) @@ -112,72 +112,72 @@ def test_guess_format(tmp_path): temp_dir.mkdir() # Valid PDF - buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) assert dci._guess_format(stream) == InputFormat.PDF - doc_path = Path("./tests/data/pdf/2206.01062.pdf") + doc_path = Path("../../tests/data/pdf/2206.01062.pdf") assert dci._guess_format(doc_path) == InputFormat.PDF # Valid MS Office - buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").open("rb").read()) + buf = BytesIO(Path("../../tests/data/docx/lorem_ipsum.docx").open("rb").read()) stream = DocumentStream(name="lorem_ipsum.docx", stream=buf) assert dci._guess_format(stream) == InputFormat.DOCX - doc_path = Path("./tests/data/docx/lorem_ipsum.docx") + doc_path = Path("../../tests/data/docx/lorem_ipsum.docx") assert dci._guess_format(doc_path) == InputFormat.DOCX # Valid HTML - buf = BytesIO(Path("./tests/data/html/wiki_duck.html").open("rb").read()) + buf = BytesIO(Path("../../tests/data/html/wiki_duck.html").open("rb").read()) stream = DocumentStream(name="wiki_duck.html", stream=buf) assert dci._guess_format(stream) == InputFormat.HTML - doc_path = Path("./tests/data/html/wiki_duck.html") + doc_path = Path("../../tests/data/html/wiki_duck.html") assert dci._guess_format(doc_path) == InputFormat.HTML # Valid MD - buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read()) + buf = BytesIO(Path("../../tests/data/md/wiki.md").open("rb").read()) stream = DocumentStream(name="wiki.md", stream=buf) assert dci._guess_format(stream) == InputFormat.MD - doc_path = Path("./tests/data/md/wiki.md") + doc_path = Path("../../tests/data/md/wiki.md") assert dci._guess_format(doc_path) == InputFormat.MD # Valid CSV - buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read()) + buf = BytesIO(Path("../../tests/data/csv/csv-comma.csv").open("rb").read()) stream = DocumentStream(name="csv-comma.csv", stream=buf) assert dci._guess_format(stream) == InputFormat.CSV stream = DocumentStream(name="test-comma", stream=buf) assert dci._guess_format(stream) == InputFormat.CSV - doc_path = Path("./tests/data/csv/csv-comma.csv") + doc_path = Path("../../tests/data/csv/csv-comma.csv") assert dci._guess_format(doc_path) == InputFormat.CSV # Valid XML USPTO patent - buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read()) + buf = BytesIO(Path("../../tests/data/uspto/ipa20110039701.xml").open("rb").read()) stream = DocumentStream(name="ipa20110039701.xml", stream=buf) assert dci._guess_format(stream) == InputFormat.XML_USPTO - doc_path = Path("./tests/data/uspto/ipa20110039701.xml") + doc_path = Path("../../tests/data/uspto/ipa20110039701.xml") assert dci._guess_format(doc_path) == InputFormat.XML_USPTO - buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").open("rb").read()) + buf = BytesIO(Path("../../tests/data/uspto/pftaps057006474.txt").open("rb").read()) stream = DocumentStream(name="pftaps057006474.txt", stream=buf) assert dci._guess_format(stream) == InputFormat.XML_USPTO - doc_path = Path("./tests/data/uspto/pftaps057006474.txt") + doc_path = Path("../../tests/data/uspto/pftaps057006474.txt") assert dci._guess_format(doc_path) == InputFormat.XML_USPTO # Valid XML JATS - buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read()) + buf = BytesIO(Path("../../tests/data/jats/elife-56337.xml").open("rb").read()) stream = DocumentStream(name="elife-56337.xml", stream=buf) assert dci._guess_format(stream) == InputFormat.XML_JATS - doc_path = Path("./tests/data/jats/elife-56337.xml") + doc_path = Path("../../tests/data/jats/elife-56337.xml") assert dci._guess_format(doc_path) == InputFormat.XML_JATS - buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read()) + buf = BytesIO(Path("../../tests/data/jats/elife-56337.nxml").open("rb").read()) stream = DocumentStream(name="elife-56337.nxml", stream=buf) assert dci._guess_format(stream) == InputFormat.XML_JATS - doc_path = Path("./tests/data/jats/elife-56337.nxml") + doc_path = Path("../../tests/data/jats/elife-56337.nxml") assert dci._guess_format(doc_path) == InputFormat.XML_JATS - buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read()) + buf = BytesIO(Path("../../tests/data/jats/elife-56337.txt").open("rb").read()) stream = DocumentStream(name="elife-56337.txt", stream=buf) assert dci._guess_format(stream) == InputFormat.XML_JATS - doc_path = Path("./tests/data/jats/elife-56337.txt") + doc_path = Path("../../tests/data/jats/elife-56337.txt") assert dci._guess_format(doc_path) == InputFormat.XML_JATS # Valid XML, non-supported flavor diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 8d68f299..4dc89691 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA def get_pdf_path(): - pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") + pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py index 3cc7a630..6d8cebf3 100644 --- a/tests/test_invalid_input.py +++ b/tests/test_invalid_input.py @@ -8,7 +8,7 @@ from docling.document_converter import ConversionError, DocumentConverter def get_pdf_path(): - pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") + pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_options.py b/tests/test_options.py index 7b0b26d3..7441e401 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -20,7 +20,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption @pytest.fixture def test_doc_path(): - return Path("./tests/data/pdf/2206.01062.pdf") + return Path("../../tests/data/pdf/2206.01062.pdf") def get_converters_with_table_options(): @@ -159,7 +159,7 @@ def test_parser_backends(test_doc_path): } ) - test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf") + test_doc_path = Path("../../tests/data/pdf/code_and_formula.pdf") doc_result: ConversionResult = converter.convert(test_doc_path) assert doc_result.status == ConversionStatus.SUCCESS