mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
cleaned up the data folder in the tests
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
6d3fea0196
commit
5db82d5b67
@ -20,7 +20,7 @@ def _get_backend(fname):
|
|||||||
|
|
||||||
def test_asciidocs_examples():
|
def test_asciidocs_examples():
|
||||||
|
|
||||||
fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
|
fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))
|
||||||
|
|
||||||
for fname in fnames:
|
for fname in fnames:
|
||||||
print(f"reading {fname}")
|
print(f"reading {fname}")
|
||||||
|
@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(pdf_doc):
|
def _get_backend(pdf_doc):
|
||||||
@ -28,7 +28,7 @@ def _get_backend(pdf_doc):
|
|||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
|
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(pdf_doc):
|
def _get_backend(pdf_doc):
|
||||||
@ -27,7 +27,7 @@ def _get_backend(pdf_doc):
|
|||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
|
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(pdf_doc):
|
def _get_backend(pdf_doc):
|
||||||
@ -28,7 +28,7 @@ def _get_backend(pdf_doc):
|
|||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
|
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ def test_cli_version():
|
|||||||
|
|
||||||
|
|
||||||
def test_cli_convert(tmp_path):
|
def test_cli_convert(tmp_path):
|
||||||
source = "./tests/data/2305.03393v1-pg9.pdf"
|
source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
|
||||||
output = tmp_path / "out"
|
output = tmp_path / "out"
|
||||||
output.mkdir()
|
output.mkdir()
|
||||||
result = runner.invoke(app, [source, "--output", str(output)])
|
result = runner.invoke(app, [source, "--output", str(output)])
|
||||||
|
@ -36,7 +36,7 @@ def get_converter():
|
|||||||
|
|
||||||
|
|
||||||
def test_code_and_formula_conversion():
|
def test_code_and_formula_conversion():
|
||||||
pdf_path = Path("tests/data/code_and_formula.pdf")
|
pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
@ -37,7 +37,7 @@ def get_converter():
|
|||||||
|
|
||||||
|
|
||||||
def test_picture_classifier():
|
def test_picture_classifier():
|
||||||
pdf_path = Path("tests/data/picture_classification.pdf")
|
pdf_path = Path("tests/data/pdf/picture_classification.pdf")
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
@ -15,7 +15,7 @@ GENERATE_V2 = False
|
|||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
|
|
||||||
# Define the directory you want to search
|
# Define the directory you want to search
|
||||||
directory = Path("./tests/data")
|
directory = Path("./tests/data/pdf/")
|
||||||
|
|
||||||
# List all PDF files in the directory and its subdirectories
|
# List all PDF files in the directory and its subdirectories
|
||||||
pdf_files = sorted(directory.rglob("*.pdf"))
|
pdf_files = sorted(directory.rglob("*.pdf"))
|
||||||
|
@ -9,7 +9,7 @@ from docling.datamodel.settings import DocumentLimits
|
|||||||
|
|
||||||
def test_in_doc_from_valid_path():
|
def test_in_doc_from_valid_path():
|
||||||
|
|
||||||
test_doc_path = Path("./tests/data/2206.01062.pdf")
|
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
doc = _make_input_doc(test_doc_path)
|
doc = _make_input_doc(test_doc_path)
|
||||||
assert doc.valid == True
|
assert doc.valid == True
|
||||||
|
|
||||||
@ -24,7 +24,7 @@ def test_in_doc_from_invalid_path():
|
|||||||
|
|
||||||
def test_in_doc_from_valid_buf():
|
def test_in_doc_from_valid_buf():
|
||||||
|
|
||||||
buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
|
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
|
||||||
stream = DocumentStream(name="my_doc.pdf", stream=buf)
|
stream = DocumentStream(name="my_doc.pdf", stream=buf)
|
||||||
|
|
||||||
doc = _make_input_doc_from_stream(stream)
|
doc = _make_input_doc_from_stream(stream)
|
||||||
@ -41,7 +41,7 @@ def test_in_doc_from_invalid_buf():
|
|||||||
|
|
||||||
|
|
||||||
def test_in_doc_with_page_range():
|
def test_in_doc_with_page_range():
|
||||||
test_doc_path = Path("./tests/data/2206.01062.pdf")
|
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
limits = DocumentLimits()
|
limits = DocumentLimits()
|
||||||
limits.page_range = (1, 10)
|
limits.page_range = (1, 10)
|
||||||
|
|
||||||
@ -81,10 +81,10 @@ def test_guess_format(tmp_path):
|
|||||||
temp_dir.mkdir()
|
temp_dir.mkdir()
|
||||||
|
|
||||||
# Valid PDF
|
# Valid PDF
|
||||||
buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
|
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
|
||||||
stream = DocumentStream(name="my_doc.pdf", stream=buf)
|
stream = DocumentStream(name="my_doc.pdf", stream=buf)
|
||||||
assert dci._guess_format(stream) == InputFormat.PDF
|
assert dci._guess_format(stream) == InputFormat.PDF
|
||||||
doc_path = Path("./tests/data/2206.01062.pdf")
|
doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
assert dci._guess_format(doc_path) == InputFormat.PDF
|
assert dci._guess_format(doc_path) == InputFormat.PDF
|
||||||
|
|
||||||
# Valid MS Office
|
# Valid MS Office
|
||||||
|
@ -15,7 +15,7 @@ GENERATE = False
|
|||||||
|
|
||||||
def get_pdf_path():
|
def get_pdf_path():
|
||||||
|
|
||||||
pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
|
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
|
||||||
return pdf_path
|
return pdf_path
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ from docling.document_converter import ConversionError, DocumentConverter
|
|||||||
|
|
||||||
def get_pdf_path():
|
def get_pdf_path():
|
||||||
|
|
||||||
pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
|
pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
|
||||||
return pdf_path
|
return pdf_path
|
||||||
|
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ def test_doc_paths():
|
|||||||
Path("tests/data/docx/lorem_ipsum.docx"),
|
Path("tests/data/docx/lorem_ipsum.docx"),
|
||||||
Path("tests/data/pptx/powerpoint_sample.pptx"),
|
Path("tests/data/pptx/powerpoint_sample.pptx"),
|
||||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/pdf/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def get_converters_with_table_options():
|
def get_converters_with_table_options():
|
||||||
|
@ -248,8 +248,13 @@ def verify_conversion_result_v1(
|
|||||||
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
||||||
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
||||||
|
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
|
|
||||||
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
||||||
|
if str(input_path.parent).endswith("pdf"):
|
||||||
|
gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
|
||||||
|
|
||||||
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
|
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
|
||||||
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
|
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
|
||||||
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
|
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
|
||||||
@ -325,7 +330,11 @@ def verify_conversion_result_v2(
|
|||||||
doc_pred_dt = doc_result.document.export_to_document_tokens()
|
doc_pred_dt = doc_result.document.export_to_document_tokens()
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
|
|
||||||
gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
|
gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
|
||||||
|
if str(input_path.parent).endswith("pdf"):
|
||||||
|
gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
|
||||||
|
|
||||||
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
|
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
|
||||||
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
|
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
|
||||||
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
|
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
|
||||||
|
Loading…
Reference in New Issue
Block a user