mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix(markdown): handle nested lists (#910)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
31
tests/data/groundtruth/docling_v2/nested.md.md
Normal file
31
tests/data/groundtruth/docling_v2/nested.md.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Nesting
|
||||
|
||||
A list featuring nesting:
|
||||
|
||||
- abc
|
||||
- abc123
|
||||
- abc1234
|
||||
- abc12345
|
||||
- a.
|
||||
- b.
|
||||
- abcd1234:
|
||||
- abcd12345:
|
||||
- a.
|
||||
- b.
|
||||
- def:
|
||||
- def1234:
|
||||
- def12345。
|
||||
- after one empty line
|
||||
- foo
|
||||
- afer two empty lines
|
||||
- bar
|
||||
|
||||
- changing symbol
|
||||
|
||||
A nested HTML list:
|
||||
|
||||
- First item
|
||||
- Second item with subitems:
|
||||
- Subitem 1
|
||||
- Subitem 2
|
||||
- Last list item
|
||||
66
tests/data/md/nested.md
Normal file
66
tests/data/md/nested.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# Nesting
|
||||
|
||||
A list featuring nesting:
|
||||
|
||||
- abc
|
||||
- abc123
|
||||
- abc1234
|
||||
- abc12345
|
||||
- a.
|
||||
- b.
|
||||
- abcd1234:
|
||||
- abcd12345:
|
||||
- a.
|
||||
- b.
|
||||
- def:
|
||||
- def1234:
|
||||
- def12345。
|
||||
|
||||
- after one empty line
|
||||
- foo
|
||||
|
||||
|
||||
- afer two empty lines
|
||||
- bar
|
||||
* changing symbol
|
||||
|
||||
A nested HTML list:
|
||||
|
||||
<ul>
|
||||
<li>First item</li>
|
||||
<li>Second item with subitems:
|
||||
<ul>
|
||||
<li>Subitem 1</li>
|
||||
<li>Subitem 2</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Last list item</li>
|
||||
</ul>
|
||||
|
||||
<!--
|
||||
Table nesting apparently not yet supported by HTML backend:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell</td>
|
||||
<td>Nested Table
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell 1</td>
|
||||
<>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 3</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 4</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><td>additional row</td></tr>
|
||||
</table>
|
||||
-->
|
||||
@@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_convert_valid():
|
||||
fmt = InputFormat.MD
|
||||
@@ -30,6 +32,10 @@ def test_convert_valid():
|
||||
act_doc = backend.convert()
|
||||
act_data = act_doc.export_to_markdown()
|
||||
|
||||
with open(gt_path, "r", encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
assert act_data == exp_data
|
||||
if GEN_TEST_DATA:
|
||||
with open(gt_path, mode="w", encoding="utf-8") as f:
|
||||
f.write(f"{act_data}\n")
|
||||
else:
|
||||
with open(gt_path, encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
assert exp_data == act_data
|
||||
|
||||
9
tests/test_data_gen_flag.py
Normal file
9
tests/test_data_gen_flag.py
Normal file
@@ -0,0 +1,9 @@
|
||||
import os
|
||||
|
||||
from pydantic import TypeAdapter
|
||||
|
||||
GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
|
||||
|
||||
|
||||
def test_gen_test_data_flag():
|
||||
assert not GEN_TEST_DATA
|
||||
Reference in New Issue
Block a user