feat(markdown): add formatting & improve inline support (#1804)

feat(markdown): support formatting & hyperlinks

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-06-18 15:57:57 +02:00
committed by GitHub
parent 215b540f6c
commit 861abcdcb0
5 changed files with 722 additions and 88 deletions

View File

@@ -2,7 +2,7 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.datamodel.document import DoclingDocument, InputDocument
from .test_data_gen_flag import GEN_TEST_DATA
@@ -11,12 +11,15 @@ def test_convert_valid():
fmt = InputFormat.MD
cls = MarkdownDocumentBackend
test_data_path = Path("tests") / "data"
relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
root_path = Path("tests") / "data"
relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"]
for in_path in relevant_paths:
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
in_doc = InputDocument(
path_or_stream=in_path,
@@ -33,9 +36,17 @@ def test_convert_valid():
act_data = act_doc.export_to_markdown()
if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(yaml_gt_path)
else:
with open(gt_path, encoding="utf-8") as f:
with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert exp_data == act_data
assert act_data == exp_data
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc