Undo test-folder path changes (restore ./tests/data relative paths)

This commit is contained in:
Matthias Günter 2025-05-09 16:54:04 +02:00
parent a194392d1a
commit 4378be1480
13 changed files with 39 additions and 39 deletions

View File

@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("../../tests/data/pdf/2206.01062.pdf") return Path("./tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc): def _get_backend(pdf_doc):
@ -27,7 +27,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf") pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)

View File

@ -106,7 +106,7 @@ def test_ordered_lists():
def get_html_paths(): def get_html_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("../../tests/data/html/") directory = Path("./tests/data/html/")
# List all HTML files in the directory and its subdirectories # List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html")) html_files = sorted(directory.rglob("*.html"))

View File

@ -18,7 +18,7 @@ GENERATE = GEN_TEST_DATA
def get_xlsx_paths(): def get_xlsx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("../../tests/data/xlsx/") directory = Path("./tests/data/xlsx/")
# List all XLSX files in the directory and its subdirectories # List all XLSX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.xlsx")) pdf_files = sorted(directory.rglob("*.xlsx"))

View File

@ -43,7 +43,7 @@ def test_heading_levels():
def get_docx_paths(): def get_docx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("../../tests/data/docx/") directory = Path("./tests/data/docx/")
# List all DOCX files in the directory and its subdirectories # List all DOCX files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.docx")) pdf_files = sorted(directory.rglob("*.docx"))

View File

@ -17,8 +17,8 @@ from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document from .verify_utils import verify_document
GENERATE: bool = GEN_TEST_DATA GENERATE: bool = GEN_TEST_DATA
DATA_PATH: Path = Path("../../tests/data/uspto/") DATA_PATH: Path = Path("./tests/data/uspto/")
GT_PATH: Path = Path("../../tests/data/groundtruth/docling_v2/") GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None: def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:

View File

@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("../../tests/data/pdf/2206.01062.pdf") return Path("./tests/data/pdf/2206.01062.pdf")
def _get_backend(pdf_doc): def _get_backend(pdf_doc):
@ -28,7 +28,7 @@ def _get_backend(pdf_doc):
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("../../tests/data/pdf/redp5110_sampled.pdf") pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)

View File

@ -12,7 +12,7 @@ GENERATE = GEN_TEST_DATA
def get_pptx_paths(): def get_pptx_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("../../tests/data/pptx/") directory = Path("./tests/data/pptx/")
# List all PPTX files in the directory and its subdirectories # List all PPTX files in the directory and its subdirectories
pptx_files = sorted(directory.rglob("*.pptx")) pptx_files = sorted(directory.rglob("*.pptx"))

View File

@ -18,7 +18,7 @@ def test_cli_version():
def test_cli_convert(tmp_path): def test_cli_convert(tmp_path):
source = "../../tests/data/pdf/2305.03393v1-pg9.pdf" source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
output = tmp_path / "out" output = tmp_path / "out"
output.mkdir() output.mkdir()
result = runner.invoke(app, [source, "--output", str(output)]) result = runner.invoke(app, [source, "--output", str(output)])

View File

@ -15,7 +15,7 @@ GENERATE_V2 = GEN_TEST_DATA
def get_pdf_paths(): def get_pdf_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path("../../tests/data/pdf/") directory = Path("./tests/data/pdf/")
# List all PDF files in the directory and its subdirectories # List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.pdf")) pdf_files = sorted(directory.rglob("*.pdf"))

View File

@ -12,7 +12,7 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path) doc = _make_input_doc(test_doc_path)
assert doc.valid is True assert doc.valid is True
@ -26,7 +26,7 @@ def test_in_doc_from_invalid_path():
def test_in_doc_from_valid_buf(): def test_in_doc_from_valid_buf():
buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read()) buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream) doc = _make_input_doc_from_stream(stream)
@ -72,7 +72,7 @@ def test_image_in_pdf_backend():
def test_in_doc_with_page_range(): def test_in_doc_with_page_range():
test_doc_path = Path("../../tests/data/pdf/2206.01062.pdf") test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits() limits = DocumentLimits()
limits.page_range = (1, 10) limits.page_range = (1, 10)
@ -112,72 +112,72 @@ def test_guess_format(tmp_path):
temp_dir.mkdir() temp_dir.mkdir()
# Valid PDF # Valid PDF
buf = BytesIO(Path("../../tests/data/pdf/2206.01062.pdf").open("rb").read()) buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf) stream = DocumentStream(name="my_doc.pdf", stream=buf)
assert dci._guess_format(stream) == InputFormat.PDF assert dci._guess_format(stream) == InputFormat.PDF
doc_path = Path("../../tests/data/pdf/2206.01062.pdf") doc_path = Path("./tests/data/pdf/2206.01062.pdf")
assert dci._guess_format(doc_path) == InputFormat.PDF assert dci._guess_format(doc_path) == InputFormat.PDF
# Valid MS Office # Valid MS Office
buf = BytesIO(Path("../../tests/data/docx/lorem_ipsum.docx").open("rb").read()) buf = BytesIO(Path("./tests/data/docx/lorem_ipsum.docx").open("rb").read())
stream = DocumentStream(name="lorem_ipsum.docx", stream=buf) stream = DocumentStream(name="lorem_ipsum.docx", stream=buf)
assert dci._guess_format(stream) == InputFormat.DOCX assert dci._guess_format(stream) == InputFormat.DOCX
doc_path = Path("../../tests/data/docx/lorem_ipsum.docx") doc_path = Path("./tests/data/docx/lorem_ipsum.docx")
assert dci._guess_format(doc_path) == InputFormat.DOCX assert dci._guess_format(doc_path) == InputFormat.DOCX
# Valid HTML # Valid HTML
buf = BytesIO(Path("../../tests/data/html/wiki_duck.html").open("rb").read()) buf = BytesIO(Path("./tests/data/html/wiki_duck.html").open("rb").read())
stream = DocumentStream(name="wiki_duck.html", stream=buf) stream = DocumentStream(name="wiki_duck.html", stream=buf)
assert dci._guess_format(stream) == InputFormat.HTML assert dci._guess_format(stream) == InputFormat.HTML
doc_path = Path("../../tests/data/html/wiki_duck.html") doc_path = Path("./tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML assert dci._guess_format(doc_path) == InputFormat.HTML
# Valid MD # Valid MD
buf = BytesIO(Path("../../tests/data/md/wiki.md").open("rb").read()) buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf) stream = DocumentStream(name="wiki.md", stream=buf)
assert dci._guess_format(stream) == InputFormat.MD assert dci._guess_format(stream) == InputFormat.MD
doc_path = Path("../../tests/data/md/wiki.md") doc_path = Path("./tests/data/md/wiki.md")
assert dci._guess_format(doc_path) == InputFormat.MD assert dci._guess_format(doc_path) == InputFormat.MD
# Valid CSV # Valid CSV
buf = BytesIO(Path("../../tests/data/csv/csv-comma.csv").open("rb").read()) buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read())
stream = DocumentStream(name="csv-comma.csv", stream=buf) stream = DocumentStream(name="csv-comma.csv", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV assert dci._guess_format(stream) == InputFormat.CSV
stream = DocumentStream(name="test-comma", stream=buf) stream = DocumentStream(name="test-comma", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV assert dci._guess_format(stream) == InputFormat.CSV
doc_path = Path("../../tests/data/csv/csv-comma.csv") doc_path = Path("./tests/data/csv/csv-comma.csv")
assert dci._guess_format(doc_path) == InputFormat.CSV assert dci._guess_format(doc_path) == InputFormat.CSV
# Valid XML USPTO patent # Valid XML USPTO patent
buf = BytesIO(Path("../../tests/data/uspto/ipa20110039701.xml").open("rb").read()) buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
stream = DocumentStream(name="ipa20110039701.xml", stream=buf) stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("../../tests/data/uspto/ipa20110039701.xml") doc_path = Path("./tests/data/uspto/ipa20110039701.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
buf = BytesIO(Path("../../tests/data/uspto/pftaps057006474.txt").open("rb").read()) buf = BytesIO(Path("./tests/data/uspto/pftaps057006474.txt").open("rb").read())
stream = DocumentStream(name="pftaps057006474.txt", stream=buf) stream = DocumentStream(name="pftaps057006474.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_USPTO assert dci._guess_format(stream) == InputFormat.XML_USPTO
doc_path = Path("../../tests/data/uspto/pftaps057006474.txt") doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML JATS # Valid XML JATS
buf = BytesIO(Path("../../tests/data/jats/elife-56337.xml").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf) stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("../../tests/data/jats/elife-56337.xml") doc_path = Path("./tests/data/jats/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("../../tests/data/jats/elife-56337.nxml").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf) stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("../../tests/data/jats/elife-56337.nxml") doc_path = Path("./tests/data/jats/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("../../tests/data/jats/elife-56337.txt").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf) stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_JATS assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("../../tests/data/jats/elife-56337.txt") doc_path = Path("./tests/data/jats/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS assert dci._guess_format(doc_path) == InputFormat.XML_JATS
# Valid XML, non-supported flavor # Valid XML, non-supported flavor

View File

@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
def get_pdf_path(): def get_pdf_path():
pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf") pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path return pdf_path

View File

@ -8,7 +8,7 @@ from docling.document_converter import ConversionError, DocumentConverter
def get_pdf_path(): def get_pdf_path():
pdf_path = Path("../../tests/data/pdf/2305.03393v1-pg9.pdf") pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
return pdf_path return pdf_path

View File

@ -20,7 +20,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture @pytest.fixture
def test_doc_path(): def test_doc_path():
return Path("../../tests/data/pdf/2206.01062.pdf") return Path("./tests/data/pdf/2206.01062.pdf")
def get_converters_with_table_options(): def get_converters_with_table_options():
@ -159,7 +159,7 @@ def test_parser_backends(test_doc_path):
} }
) )
test_doc_path = Path("../../tests/data/pdf/code_and_formula.pdf") test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf")
doc_result: ConversionResult = converter.convert(test_doc_path) doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS