feat: add convert_string to document-converter (#2069)

* feat: add convert_string to document-converter

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix unsupported operand type(s) for |: type and NoneType

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added tests for convert_string

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-08-12 11:02:38 +02:00
committed by GitHub
parent e2cca931be
commit b09033cb73
2 changed files with 82 additions and 1 deletions

View File

@@ -2,10 +2,19 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Module-level alias for the GEN_TEST_DATA flag imported from .test_data_gen_flag.
# NOTE(review): presumably toggles regeneration of ground-truth files — confirm against its uses.
GENERATE = GEN_TEST_DATA
def test_convert_valid():
@@ -54,3 +63,45 @@ def test_convert_valid():
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
def get_md_paths(
    directory: Path = Path("./tests/groundtruth/docling_v2"),
) -> list[Path]:
    """Return all Markdown files found under *directory*, sorted by path.

    Args:
        directory: Root folder to search recursively. Defaults to the
            ground-truth folder used by this test module; the default is a
            relative path, so it is resolved against the current working
            directory.

    Returns:
        A sorted list of paths to regular files whose name matches ``*.md``.
        The list is empty when nothing matches.
    """
    # rglob also matches directories whose name ends in ".md"; keep only
    # regular files so callers can safely open every returned path.
    return sorted(path for path in Path(directory).rglob("*.md") if path.is_file())
def get_converter():
    """Build a DocumentConverter whose allowed formats are limited to Markdown."""
    return DocumentConverter(allowed_formats=[InputFormat.MD])
def test_e2e_md_conversions():
    """Round-trip every ground-truth Markdown file through the converter.

    Each file returned by ``get_md_paths()`` is converted twice: once from
    its path via ``convert`` and once from its text via ``convert_string``.
    In both cases the document exported back to Markdown must equal the
    original file content.
    """
    md_paths = get_md_paths()
    converter = get_converter()

    for md_path in md_paths:
        # Decode explicitly as UTF-8 so the expected text does not depend on
        # the platform's locale default encoding.
        true_md = Path(md_path).read_text(encoding="utf-8")

        # Path-based conversion.
        conv_result: ConversionResult = converter.convert(md_path)
        doc: DoclingDocument = conv_result.document
        pred_md: str = doc.export_to_markdown()
        assert true_md == pred_md, f"convert round-trip failed on {md_path}"

        # String-based conversion of the same content.
        conv_result_: ConversionResult = converter.convert_string(
            true_md, format=InputFormat.MD
        )
        doc_: DoclingDocument = conv_result_.document
        pred_md_: str = doc_.export_to_markdown()
        assert true_md == pred_md_, f"convert_string round-trip failed on {md_path}"