Point example and test paths at the actual location of the test data

This commit is contained in:
Matthias Günter 2025-05-09 16:39:31 +02:00
parent 3220a592e7
commit 46d6cf078e
26 changed files with 56 additions and 56 deletions

View File

@ -122,10 +122,10 @@ def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [
Path("./tests/data/pdf/2206.01062.pdf"), Path("../../tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"), Path("../../tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"), Path("../../tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"), Path("../../tests/data/pdf/redp5110_sampled.pdf"),
] ]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

View File

@ -17,7 +17,7 @@ _log = logging.getLogger(__name__)
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
########################################################################### ###########################################################################

View File

@ -71,7 +71,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf") input_doc_path = Path("../../tests/data/pdf/2203.01017v2.pdf")
pipeline_options = ExampleFormulaUnderstandingPipelineOptions() pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True pipeline_options.do_formula_understanding = True

View File

@ -76,7 +76,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = ExamplePictureClassifierPipelineOptions() pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0 pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch") output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch") output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch") output_dir = Path("scratch")
doc_converter = DocumentConverter() doc_converter = DocumentConverter()

View File

@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main(): def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf") input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True

View File

@ -67,7 +67,7 @@ def watsonx_vlm_options():
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(
enable_remote_services=True # <-- this is required! enable_remote_services=True # <-- this is required!

View File

@ -11,7 +11,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main(): def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf") input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
# Explicitly set the accelerator # Explicitly set the accelerator
# accelerator_options = AcceleratorOptions( # accelerator_options = AcceleratorOptions(

View File

@ -9,7 +9,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main(): def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf") input_doc = Path("../../tests/data/pdf/2206.01062.pdf")
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"]) # ocr_options = TesseractOcrOptions(lang=["auto"])

View File

@ -31,7 +31,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch") output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -69,8 +69,8 @@ def watsonx_vlm_options(model: str, prompt: str):
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
# input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") # input_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") input_doc_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
pipeline_options = VlmPipelineOptions( pipeline_options = VlmPipelineOptions(
enable_remote_services=True # <-- this is required! enable_remote_services=True # <-- this is required!

View File

@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf") return Path("../../tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc): def _get_backend(pdf_doc):
@ -27,7 +27,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)

View File

@ -106,7 +106,7 @@ def test_ordered_lists():
def get_html_paths(): def get_html_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("./tests/data/html/") directory = Path("../../tests/data/html/")
# List all HTML files in the directory and its subdirectories # List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html")) html_files = sorted(directory.rglob("*.html"))

View File

@ -18,7 +18,7 @@ GENERATE = GEN_TEST_DATA
def get_xlsx_paths(): def get_xlsx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("./tests/data/xlsx/") directory = Path("../../tests/data/xlsx/")
# List all XLSX files in the directory and its subdirectories # List all XLSX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.xlsx")) pdf_files = sorted(directory.rglob("*.xlsx"))

View File

@ -43,7 +43,7 @@ def test_heading_levels():
def get_docx_paths(): def get_docx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("./tests/data/docx/") directory = Path("../../tests/data/docx/")
# List all DOCX files in the directory and its subdirectories # List all DOCX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.docx")) pdf_files = sorted(directory.rglob("*.docx"))

View File

@ -17,8 +17,8 @@ from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document from .verify_utils import verify_document
GENERATE: bool = GEN_TEST_DATA GENERATE: bool = GEN_TEST_DATA
DATA_PATH: Path = Path("./tests/data/uspto/") DATA_PATH: Path = Path("../../tests/data/uspto/")
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/") GT_PATH: Path = Path("../../tests/data/groundtruth/docling_v2/")
def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None: def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:

View File

@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf") return Path("../../tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc): def _get_backend(pdf_doc):
@ -28,7 +28,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)

View File

@ -12,7 +12,7 @@ GENERATE = GEN_TEST_DATA
def get_pptx_paths(): def get_pptx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("./tests/data/pptx/") directory = Path("../../tests/data/pptx/")
# List all PPTX files in the directory and its subdirectories # List all PPTX files in the directory and its subdirectories
pptx_files = sorted(directory.rglob("*.pptx")) pptx_files = sorted(directory.rglob("*.pptx"))

View File

@ -18,7 +18,7 @@ def test_cli_version():
def test_cli_convert(tmp_path): def test_cli_convert(tmp_path):
source = "./tests/data/pdf/2305.03393v1-pg9.pdf" source = "../../tests/data/pdf/2305.03393v1-pg9.pdf"
output = tmp_path / "out" output = tmp_path / "out"
output.mkdir() output.mkdir()
result = runner.invoke(app, [source, "--output", str(output)]) result = runner.invoke(app, [source, "--output", str(output)])

View File

@ -15,7 +15,7 @@ GENERATE_V2 = GEN_TEST_DATA
def get_pdf_paths(): def get_pdf_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("./tests/data/pdf/") directory = Path("../../tests/data/pdf/")
# List all PDF files in the directory and its subdirectories # List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf")) pdf_files = sorted(directory.rglob("*.pdf"))

View File

@ -12,7 +12,7 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path) doc = _make_input_doc(test_doc_path)
assert doc.valid is True assert doc.valid is True
@ -26,7 +26,7 @@ def test_in_doc_from_invalid_path():
def test_in_doc_from_valid_buf(): def test_in_doc_from_valid_buf():
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream) doc = _make_input_doc_from_stream(stream)
@ -72,7 +72,7 @@ def test_image_in_pdf_backend():
def test_in_doc_with_page_range(): def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits() limits = DocumentLimits()
limits.page_range = (1, 10) limits.page_range = (1, 10)
@ -112,72 +112,72 @@ def test_guess_format(tmp_path):
temp_dir.mkdir() temp_dir.mkdir()
# Valid PDF # Valid PDF
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
assert dci._guess_format(stream) == InputFormat.PDF assert dci._guess_format(stream) == InputFormat.PDF
doc_path = Path("./tests/data/pdf/2206.01062.pdf") doc_path = Path("../../tests/data/pdf/2206.01062.pdf")
assert dci._guess_format(doc_path) == InputFormat.PDF assert dci._guess_format(doc_path) == InputFormat.PDF
# Valid MS Office # Valid MS Office
buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").open("rb").read()) buf = BytesIO(Path("../../tests/data/docx/lorem_ipsum.docx").open("rb").read())
stream = DocumentStream(name="lorem_ipsum.docx", stream=buf) stream = DocumentStream(name="lorem_ipsum.docx", stream=buf)
assert dci._guess_format(stream) == InputFormat.DOCX assert dci._guess_format(stream) == InputFormat.DOCX
doc_path = Path("./tests/data/docx/lorem_ipsum.docx") doc_path = Path("../../tests/data/docx/lorem_ipsum.docx")
assert dci._guess_format(doc_path) == InputFormat.DOCX assert dci._guess_format(doc_path) == InputFormat.DOCX
# Valid HTML # Valid HTML
buf = BytesIO(Path("./tests/data/html/wiki_duck.html").open("rb").read()) buf = BytesIO(Path("../../tests/data/html/wiki_duck.html").open("rb").read())
stream = DocumentStream(name="wiki_duck.html", stream=buf) stream = DocumentStream(name="wiki_duck.html", stream=buf)
assert dci._guess_format(stream) == InputFormat.HTML assert dci._guess_format(stream) == InputFormat.HTML
doc_path = Path("./tests/data/html/wiki_duck.html") doc_path = Path("../../tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML assert dci._guess_format(doc_path) == InputFormat.HTML
# Valid MD # Valid MD
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read()) buf = BytesIO(Path("../../tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf) stream = DocumentStream(name="wiki.md", stream=buf)
assert dci._guess_format(stream) == InputFormat.MD assert dci._guess_format(stream) == InputFormat.MD
doc_path = Path("./tests/data/md/wiki.md") doc_path = Path("../../tests/data/md/wiki.md")
assert dci._guess_format(doc_path) == InputFormat.MD assert dci._guess_format(doc_path) == InputFormat.MD
# Valid CSV # Valid CSV
buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read()) buf = BytesIO(Path("../../tests/data/csv/csv-comma.csv").open("rb").read())
stream = DocumentStream(name="csv-comma.csv", stream=buf) stream = DocumentStream(name="csv-comma.csv", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV assert dci._guess_format(stream) == InputFormat.CSV
stream = DocumentStream(name="test-comma", stream=buf) stream = DocumentStream(name="test-comma", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV assert dci._guess_format(stream) == InputFormat.CSV
doc_path = Path("./tests/data/csv/csv-comma.csv") doc_path = Path("../../tests/data/csv/csv-comma.csv")
assert dci._guess_format(doc_path) == InputFormat.CSV assert dci._guess_format(doc_path) == InputFormat.CSV
# Valid XML USPTO patent # Valid XML USPTO patent
buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read()) buf = BytesIO(Path("../../tests/data/uspto/ipa20110039701.xml").open("rb").read())
stream = DocumentStream(name="ipa20110039701.xml", stream=buf) stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("./tests/data/uspto/ipa20110039701.xml") doc_path = Path("../../tests/data/uspto/ipa20110039701.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").open("rb").read()) buf = BytesIO(Path("../../tests/data/uspto/pftaps057006474.txt").open("rb").read())
stream = DocumentStream(name="pftaps057006474.txt", stream=buf) stream = DocumentStream(name="pftaps057006474.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("./tests/data/uspto/pftaps057006474.txt") doc_path = Path("../../tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML JATS # Valid XML JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read()) buf = BytesIO(Path("../../tests/data/jats/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf) stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.xml") doc_path = Path("../../tests/data/jats/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read()) buf = BytesIO(Path("../../tests/data/jats/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf) stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.nxml") doc_path = Path("../../tests/data/jats/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read()) buf = BytesIO(Path("../../tests/data/jats/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf) stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.txt") doc_path = Path("../../tests/data/jats/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
# Valid XML, non-supported flavor # Valid XML, non-supported flavor

View File

@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
def get_pdf_path(): def get_pdf_path():
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path return pdf_path

View File

@ -8,7 +8,7 @@ from docling.document_converter import ConversionError, DocumentConverter
def get_pdf_path(): def get_pdf_path():
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path return pdf_path

View File

@ -20,7 +20,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("./tests/data/pdf/2206.01062.pdf") return Path("../../tests/data/pdf/2206.01062.pdf")
def get_converters_with_table_options(): def get_converters_with_table_options():
@ -159,7 +159,7 @@ def test_parser_backends(test_doc_path):
} }
) )
test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf") test_doc_path = Path("../../tests/data/pdf/code_and_formula.pdf")
doc_result: ConversionResult = converter.convert(test_doc_path) doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS