import io
from pathlib import Path
from textwrap import dedent
from typing import Annotated

import pytest
from _pytest.mark import ParameterSet
from docling_core.types.doc.document import DoclingDocument, GroupItem, RefItem

from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
    InputDocument,
)
from tests.conftest import TEST_DATA_DIR
from tests.verify_utils import CONFID_PREC, COORD_PREC

from .test_data_gen_flag import GEN_TEST_DATA

# Snapshot (re)generation is controlled only by the shared GEN_TEST_DATA flag.
# A hard-coded `True or ...` here would make every run overwrite the snapshots
# and never execute the comparison branches below.
GENERATE = GEN_TEST_DATA

ALSO_GENERATE_YAML = ["inline_and_formatting"]
"""A list of document names that should also be generated as yaml"""

# Test Input Directories
INPUT_DIR = TEST_DATA_DIR / "md"

# Test Output Directories
SNAPSHOT_DIR = TEST_DATA_DIR / "groundtruth" / "docling_v2"

TestCase = Annotated[tuple[str, Path, Path], "test_name, in_file, snapshot_file"]


def markdown_test_data() -> list[ParameterSet]:
    """Build one pytest parameter set per ``*.md`` file found in ``INPUT_DIR``.

    Each parameter set carries, in order: the resolved input path, the
    markdown snapshot path (``<stem>.md.md`` under ``SNAPSHOT_DIR``) and the
    YAML snapshot path (``<stem>.md.yaml``), which is ``None`` unless the
    file stem is listed in ``ALSO_GENERATE_YAML``. The file stem is used as
    the test id, and inputs are ordered by file name.
    """
    test_case_paths = sorted(INPUT_DIR.glob(pattern="*.md"), key=lambda x: x.name)

    test_cases: list[ParameterSet] = []
    for test_case_path in test_case_paths:
        name: str = test_case_path.stem
        markdown_document_path: Path = test_case_path.resolve()
        markdown_snapshot_path: Path = SNAPSHOT_DIR / f"{name}.md.md"
        yaml_snapshot_path: Path | None = (
            SNAPSHOT_DIR / f"{name}.md.yaml" if name in ALSO_GENERATE_YAML else None
        )

        test_cases.append(
            pytest.param(
                markdown_document_path,
                markdown_snapshot_path,
                yaml_snapshot_path,
                id=name,
            )
        )

    return test_cases


@pytest.mark.parametrize(
    ("markdown_document_path", "markdown_snapshot_path", "yaml_snapshot_path"),
    markdown_test_data(),
)
def test_convert_markdown(
    markdown_document_path: Path,
    markdown_snapshot_path: Path,
    yaml_snapshot_path: Path | None,
):
    """Test that the Markdown backend can:

    1) convert the input markdown file to a DoclingDocument
    2) export the markdown (and optionally, yaml) and verify it matches the
       committed snapshot

    When ``GENERATE`` is set, the snapshots are (re)written instead of being
    compared. When it is not set and the markdown snapshot is missing, the
    test is skipped.
    """
    if not GENERATE and not markdown_snapshot_path.exists():
        pytest.skip(
            f"Test requires {markdown_snapshot_path} to exist, you may need to generate it with GENERATE=True"
        )

    document_backend = MarkdownDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=markdown_document_path,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        ),
        path_or_stream=markdown_document_path,
    )

    assert document_backend.is_valid()

    # Conversion is the behaviour under test: let any exception propagate so
    # the test fails with a full traceback instead of being reported as skipped.
    out_docling_document: DoclingDocument = document_backend.convert()

    # Validate the YAML/JSON Export
    if yaml_snapshot_path:
        if GENERATE:
            out_docling_document.save_as_yaml(
                yaml_snapshot_path,
                coord_precision=COORD_PREC,
                confid_precision=CONFID_PREC,
            )
        else:
            assert out_docling_document == DoclingDocument.load_from_yaml(
                yaml_snapshot_path
            )

    # Validate the Markdown Export
    out_markdown: str = out_docling_document.export_to_markdown()

    # The snapshot file holds the export plus one trailing newline. The same
    # text is used for writing and for comparing so the two modes cannot drift.
    snapshot_text = out_markdown + "\n"
    if GENERATE:
        # Write with the same explicit encoding that is used when reading.
        markdown_snapshot_path.write_text(snapshot_text, encoding="utf-8")
    else:
        assert snapshot_text == markdown_snapshot_path.read_text(encoding="utf-8")


def test_convert_headers_to_groups():
    """Test that the Markdown backend can convert headers into hierarchical groups"""
    # NOTE(review): blank lines between the markdown blocks are assumed here;
    # confirm against the intended fixture text.
    input_document = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """)

    # The InputDocument and the backend each get their own stream over the
    # same bytes, so neither depends on the other's read position.
    in_doc = InputDocument(
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
        format=InputFormat.MD,
        filename="headers_to_groups.md",
        backend=MarkdownDocumentBackend,
    )
    backend = MarkdownDocumentBackend(
        in_doc=in_doc,
        path_or_stream=io.BytesIO(input_document.encode("utf-8")),
    )

    act_doc: DoclingDocument = backend.convert()

    # The whole document is expected to hang off a single top-level group.
    assert len(act_doc.body.children) == 1
    body_first_child_ref: RefItem = act_doc.body.children[0]
    assert isinstance(body_first_child_ref, RefItem)
    assert body_first_child_ref.cref == "#/groups/0"

    body_first_child: GroupItem = body_first_child_ref.resolve(act_doc)

    # The first child should have the header, content and two subheaders
    assert len(body_first_child.children) == 4

    act_data = act_doc.export_to_markdown()

    expected_output = dedent("""
        # Header 1

        some content under the header 1

        ## Header 2a

        some content under the header 2

        ### Header 3

        some content under the header 3

        ## Header 2b
    """).strip()

    assert act_data == expected_output