docling/tests/test_backend_markdown.py
2025-07-12 20:10:27 -05:00

184 lines
5.0 KiB
Python

import io
from pathlib import Path
from textwrap import dedent
from typing import Annotated
import pytest
from _pytest.mark import ParameterSet
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
InputDocument,
)
from tests.conftest import TEST_DATA_DIR
from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA
# When true, the tests (re)write the committed snapshots instead of asserting
# against them. This must follow the shared GEN_TEST_DATA flag only: forcing it
# on (e.g. `True or GEN_TEST_DATA`) makes every run overwrite the ground truth
# and skip all comparisons.
GENERATE = GEN_TEST_DATA

ALSO_GENERATE_YAML = ["inline_and_formatting"]
"""A list of document names that should also be generated as yaml"""

# Test Input Directories
INPUT_DIR = TEST_DATA_DIR / "md"

# Test Output Directories
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"

# Annotated alias describing a (test_name, in_file, snapshot_file) triple.
TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]
def markdown_test_data() -> list[ParameterSet]:
    """Build one pytest parameter set per ``*.md`` file found in INPUT_DIR.

    Each parameter set carries, in order: the resolved path of the input
    markdown file, the path of its markdown snapshot
    (``SNAPSHOT_DIR/<stem>.md.md``), and the path of its YAML snapshot
    (``SNAPSHOT_DIR/<stem>.md.yaml``) or ``None`` when the file stem is not
    listed in ALSO_GENERATE_YAML. The file stem is used as the test id.
    Inputs are ordered by file name.
    """

    def _to_param(source: Path) -> ParameterSet:
        # Derive every snapshot location from the input file's stem.
        stem = source.stem
        yaml_snapshot = None
        if stem in ALSO_GENERATE_YAML:
            yaml_snapshot = SNAPSHOT_DIR / f"{stem}.md.yaml"
        return pytest.param(
            source.resolve(),
            SNAPSHOT_DIR / f"{stem}.md.md",
            yaml_snapshot,
            id=stem,
        )

    sources = list(INPUT_DIR.glob(pattern="*.md"))
    sources.sort(key=lambda candidate: candidate.name)
    return [_to_param(source) for source in sources]
@pytest.mark.parametrize(
    ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
    markdown_test_data(),
)
def test_convert_markdown(
    markdown_document_path: Path,
    markdown_snapshot_path: Path,
    yaml_snapshot_path: Path | None,
):
    """Test that the Markdown backend can:
    1) convert the input markdown file to a DoclingDocument
    2) export the markdown (and optionally, yaml) and verify it matches the committed snapshot

    Args:
        markdown_document_path: Input markdown file to convert.
        markdown_snapshot_path: Markdown snapshot to write (when GENERATE is
            set) or compare against.
        yaml_snapshot_path: YAML snapshot to write or compare against, or
            ``None`` to skip the YAML check for this document.
    """
    if not GENERATE and not markdown_snapshot_path.exists():
        pytest.skip(
            f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
        )

    document_backend = MarkdownDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=markdown_document_path,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        ),
        path_or_stream=markdown_document_path,
    )
    assert document_backend.is_valid()

    # NOTE(review): a conversion error is reported as a skip rather than a
    # failure -- confirm this is intentional for known-unsupported inputs.
    try:
        out_docling_document: DoclingDocument = document_backend.convert()
    except Exception as e:
        pytest.skip(f"Error converting {markdown_document_path}: {e}")

    # Validate the YAML/JSON Export
    if yaml_snapshot_path:
        if GENERATE:
            out_docling_document.save_as_yaml(
                yaml_snapshot_path,
                coord_precision=COORD_PREC,
                confid_precision=CONFID_PREC,
            )
        else:
            assert out_docling_document == DoclingDocument.load_from_yaml(
                yaml_snapshot_path
            )

    # Validate the Markdown Export
    out_markdown: str = out_docling_document.export_to_markdown()
    # The snapshot file is the export plus one trailing newline. Use the same
    # text for writing and for comparing so a freshly generated snapshot
    # always round-trips, and pin the encoding on both sides.
    snapshot_text = out_markdown + "\n"
    if GENERATE:
        _ = markdown_snapshot_path.write_text(snapshot_text, encoding="utf-8")
    else:
        assert snapshot_text == markdown_snapshot_path.read_text(encoding="utf-8")
def test_convert_headers_to_groups():
    """Check that the Markdown backend turns headers into hierarchical groups.

    Converts an in-memory markdown document and asserts that the body holds a
    single group reference (``#/groups/0``) with four children, and that
    exporting the result back to markdown yields the expected text.
    """
    markdown_source = dedent("""
# Header 1
some content under the header 1
## Header 2a
some content under the header 2
### Header 3
some content under the header 3
## Header 2b
""")
    encoded_source = markdown_source.encode("utf-8")

    # The input document and the backend each get their own stream.
    in_doc = InputDocument(
        path_or_stream=io.BytesIO(encoded_source),
        format=InputFormat.MD,
        filename="headers_to_groups.md",
        backend=MarkdownDocumentBackend,
    )
    md_backend = MarkdownDocumentBackend(
        in_doc=in_doc,
        path_or_stream=io.BytesIO(encoded_source),
    )
    converted: DoclingDocument = md_backend.convert()

    body_children = converted.body.children
    assert len(body_children) == 1

    top_ref: RefItem = body_children[0]
    assert isinstance(top_ref, RefItem)
    assert top_ref.cref == "#/groups/0"

    top_group: GroupItem = top_ref.resolve(converted)
    # The first child should have the header, content and two subheaders
    assert len(top_group.children) == 4

    exported_markdown = converted.export_to_markdown()
    expected_markdown = dedent("""
# Header 1
some content under the header 1
## Header 2a
some content under the header 2
### Header 3
some content under the header 3
## Header 2b
""").strip()
    assert exported_markdown == expected_markdown