mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
184 lines
5.0 KiB
Python
184 lines
5.0 KiB
Python
import io
|
|
from pathlib import Path
|
|
from textwrap import dedent
|
|
from typing import Annotated
|
|
|
|
import pytest
|
|
from _pytest.mark import ParameterSet
|
|
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem
|
|
|
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import (
|
|
InputDocument,
|
|
)
|
|
from tests.conftest import TEST_DATA_DIR
|
|
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
|
|
# When set, the tests (re)write the snapshot files instead of comparing against
# them. This follows the shared GEN_TEST_DATA flag only: hard-wiring it to True
# would make every run overwrite the ground truth and never verify anything.
GENERATE = GEN_TEST_DATA

ALSO_GENERATE_YAML = ["inline_and_formatting"]
"""A list of document names that should also be generated as yaml"""

# Test Input Directories
INPUT_DIR = TEST_DATA_DIR / "md"

# Test Output Directories
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"

# Annotated alias describing a (test_name, in_file, snapshot_file) triple.
TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]
|
|
|
|
|
|
def markdown_test_data() -> list[ParameterSet]:
    """Build one pytest parameter set per ``*.md`` file found in INPUT_DIR.

    Each parameter set carries, in order: the resolved input path, the
    markdown snapshot path (``<stem>.md.md`` under SNAPSHOT_DIR) and the yaml
    snapshot path (``<stem>.md.yaml``), which is ``None`` unless the stem is
    listed in ALSO_GENERATE_YAML. The file stem is used as the test id.
    Inputs are ordered by file name.
    """

    def build_param(source: Path) -> ParameterSet:
        # The stem doubles as the pytest id and as the snapshot base name.
        stem: str = source.stem
        yaml_snapshot: Path | None = None
        if stem in ALSO_GENERATE_YAML:
            yaml_snapshot = SNAPSHOT_DIR / f"{stem}.md.yaml"
        return pytest.param(
            source.resolve(),
            SNAPSHOT_DIR / f"{stem}.md.md",
            yaml_snapshot,
            id=stem,
        )

    sources = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda p: p.name)
    return [build_param(source) for source in sources]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
    markdown_test_data(),
)
def test_convert_markdown(
    markdown_document_path: Path,
    markdown_snapshot_path: Path,
    yaml_snapshot_path: Path | None,
):
    """Test that the Markdown backend can:
    1) convert the input markdown file to a DoclingDocument
    2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot

    Args:
        markdown_document_path: The markdown file to convert.
        markdown_snapshot_path: Where the exported markdown snapshot lives.
        yaml_snapshot_path: Where the yaml snapshot lives, or None when no
            yaml snapshot is kept for this document.

    When GENERATE is set the snapshots are (re)written instead of compared.
    """

    if not GENERATE and not markdown_snapshot_path.exists():
        pytest.skip(
            f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
        )

    document_backend = MarkdownDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=markdown_document_path,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        ),
        path_or_stream=markdown_document_path,
    )

    assert document_backend.is_valid()

    # NOTE(review): a conversion error is reported as a skip, not a failure,
    # so a crashing backend will not turn this test red - confirm this is
    # intentional.
    try:
        out_docling_document: DoclingDocument = document_backend.convert()
    except Exception as e:
        pytest.skip(f"Error converting {markdown_document_path}: {e}")

    # Validate the YAML/JSON Export
    if yaml_snapshot_path:
        if GENERATE:
            out_docling_document.save_as_yaml(
                yaml_snapshot_path,
                coord_precision=COORD_PREC,
                confid_precision=CONFID_PREC,
            )
        else:
            assert out_docling_document == DoclingDocument.load_from_yaml(
                yaml_snapshot_path
            )

    # Validate the Markdown Export
    out_markdown: str = out_docling_document.export_to_markdown()

    # Snapshot files are stored with one trailing newline. Build that exact
    # text once so the generate and verify branches agree on what the file
    # contains; comparing the bare export against the file would never match.
    snapshot_text: str = out_markdown + "\n"

    if GENERATE:
        # Write with the same explicit encoding that is used for reading.
        _ = markdown_snapshot_path.write_text(snapshot_text, encoding="utf-8")
    else:
        assert snapshot_text == markdown_snapshot_path.read_text(encoding="utf-8")
|
|
|
|
|
|
def test_convert_headers_to_groups():
    """Check that the Markdown backend nests headers into hierarchical groups"""

    markdown_source = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """)
    payload = markdown_source.encode("utf-8")

    # The input document and the backend each get their own stream object.
    backend = MarkdownDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=io.BytesIO(payload),
            format=InputFormat.MD,
            filename="headers_to_groups.md",
            backend=MarkdownDocumentBackend,
        ),
        path_or_stream=io.BytesIO(payload),
    )

    converted: DoclingDocument = backend.convert()

    # The body holds exactly one child: a reference to the first group.
    top_level = converted.body.children
    assert len(top_level) == 1
    first_ref: RefItem = top_level[0]
    assert isinstance(first_ref, RefItem)
    assert first_ref.cref == "#/groups/0"

    # The first child should have the header, content and two subheaders
    first_group: GroupItem = first_ref.resolve(converted)
    assert len(first_group.children) == 4

    exported = converted.export_to_markdown()

    expected_markdown = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """).strip()

    assert exported == expected_markdown
|