fix: fix single newline handling in MD backend (#824)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas
2025-01-28 19:05:55 +01:00
committed by GitHub
parent adf6353483
commit 5aed9f8aeb
5 changed files with 170 additions and 8 deletions

View File

@@ -0,0 +1,35 @@
from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
def test_convert_valid():
    """Convert each Markdown fixture and compare it with its ground truth.

    Every ``*.md`` file found recursively under ``tests/data/md`` is loaded
    through ``MarkdownDocumentBackend``, converted, and exported back to
    Markdown. The export must equal the content of
    ``tests/data/groundtruth/docling_v2/<input file name>.md`` once that
    file's trailing whitespace has been stripped.
    """
    data_root = Path("tests") / "data"
    groundtruth_dir = data_root / "groundtruth" / "docling_v2"
    md_inputs = sorted((data_root / "md").rglob("*.md"))
    # An empty fixture list would make the loop below pass vacuously.
    assert md_inputs

    for md_file in md_inputs:
        # Ground-truth files are named after the full input name plus ".md".
        expected_file = groundtruth_dir / f"{md_file.name}.md"

        source_doc = InputDocument(
            path_or_stream=md_file,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        md_backend = MarkdownDocumentBackend(
            in_doc=source_doc,
            path_or_stream=md_file,
        )
        assert md_backend.is_valid()

        actual_markdown = md_backend.convert().export_to_markdown()

        # Only the expected text is right-stripped; the export is compared as-is.
        expected_markdown = expected_file.read_text(encoding="utf-8").rstrip()
        assert actual_markdown == expected_markdown