fix(markdown): handle nested lists (#910)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas
2025-02-07 12:55:12 +01:00
committed by GitHub
parent 9114ada7bc
commit 90b766e2ae
5 changed files with 177 additions and 49 deletions

View File

@@ -0,0 +1,31 @@
# Nesting
A list featuring nesting:
- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234
- abcd12345
- a.
- b.
- def
- def1234
- def12345。
- after one empty line
- foo
- afer two empty lines
- bar
- changing symbol
A nested HTML list:
- First item
- Second item with subitems:
- Subitem 1
- Subitem 2
- Last list item

66
tests/data/md/nested.md Normal file
View File

@@ -0,0 +1,66 @@
# Nesting
A list featuring nesting:
- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234
- abcd12345
- a.
- b.
- def
- def1234
- def12345。
- after one empty line
- foo
- afer two empty lines
- bar
* changing symbol
A nested HTML list:
<ul>
<li>First item</li>
<li>Second item with subitems:
<ul>
<li>Subitem 1</li>
<li>Subitem 2</li>
</ul>
</li>
<li>Last list item</li>
</ul>
<!--
Table nesting apparently not yet supported by HTML backend:
<table>
<tr>
<td>Cell</td>
<td>Nested Table
<table>
<tr>
<td>Cell 1</td>
<>
</tr>
<tr>
<td>Cell 2</td>
</tr>
<tr>
<td>Cell 3</td>
</tr>
<tr>
<td>Cell 4</td>
</tr>
</table>
</td>
</tr>
<tr><td>additional row</td></tr>
</table>
-->

View File

@@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from .test_data_gen_flag import GEN_TEST_DATA
def test_convert_valid():
fmt = InputFormat.MD
@@ -30,6 +32,10 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
with open(gt_path, "r", encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert act_data == exp_data
if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
else:
with open(gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert exp_data == act_data

View File

@@ -0,0 +1,9 @@
import os
from pydantic import TypeAdapter
# Boolean switch read from the DOCLING_GEN_TEST_DATA environment variable.
# When the variable is unset the fallback 0 is validated instead, so the
# flag is off by default; the raw value is coerced to bool by pydantic.
GEN_TEST_DATA = TypeAdapter(bool).validate_python(
    os.environ.get("DOCLING_GEN_TEST_DATA", 0)
)
def test_gen_test_data_flag():
    """Fail whenever GEN_TEST_DATA is truthy.

    NOTE(review): presumably a guard so the regeneration mode is not left
    switched on when the regular test suite runs -- confirm intent.
    """
    assert not GEN_TEST_DATA