mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix(markdown): ensure correct parsing of nested lists (#1995)
* fix(markdown): ensure correct parsing of nested lists

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update dependencies in uv.lock file

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1985841a19
commit
aec29a7315
@@ -3,6 +3,6 @@
|
||||
- A. first
|
||||
- subitem
|
||||
- B. second
|
||||
1. strange
|
||||
- 2 . strange
|
||||
|
||||
The end!
|
||||
|
||||
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
@@ -0,0 +1,139 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/2'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/0'
|
||||
- children:
|
||||
- $ref: '#/texts/3'
|
||||
- $ref: '#/texts/5'
|
||||
- $ref: '#/texts/6'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/texts/2'
|
||||
self_ref: '#/groups/1'
|
||||
- children:
|
||||
- $ref: '#/texts/4'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/texts/3'
|
||||
self_ref: '#/groups/2'
|
||||
key_value_items: []
|
||||
name: mixed_without_h1
|
||||
origin:
|
||||
binary_hash: 7394721163373597328
|
||||
filename: mixed_without_h1.md
|
||||
mimetype: text/html
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables: []
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: furniture
|
||||
label: title
|
||||
orig: mixed_without_h1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: mixed_without_h1
|
||||
- children: []
|
||||
content_layer: furniture
|
||||
label: text
|
||||
orig: Content before first heading
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: Content before first heading
|
||||
- children:
|
||||
- $ref: '#/groups/1'
|
||||
- $ref: '#/texts/7'
|
||||
content_layer: body
|
||||
label: section_header
|
||||
level: 1
|
||||
orig: Some heading
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: Some heading
|
||||
- children:
|
||||
- $ref: '#/groups/2'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: A. first
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: A. first
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: subitem
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/4'
|
||||
text: subitem
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: B. second
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/5'
|
||||
text: B. second
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: 2 . strange
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/6'
|
||||
text: 2 . strange
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: The end!
|
||||
parent:
|
||||
$ref: '#/texts/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/7'
|
||||
text: The end!
|
||||
version: 1.5.0
|
||||
2
tests/data/md/mixed_without_h1.md
vendored
2
tests/data/md/mixed_without_h1.md
vendored
@@ -7,6 +7,6 @@ Content before first heading
|
||||
- A. first
|
||||
- subitem
|
||||
- B. second
|
||||
- 2. strange
|
||||
- 2\. strange
|
||||
|
||||
The end!
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_convert_valid():
|
||||
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||
assert len(relevant_paths) > 0
|
||||
|
||||
yaml_filter = ["inline_and_formatting"]
|
||||
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
|
||||
|
||||
for in_path in relevant_paths:
|
||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
@@ -41,12 +41,11 @@ def test_convert_valid():
|
||||
f.write(f"{act_data}\n")
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
||||
act_doc.save_as_yaml(
|
||||
yaml_gt_path,
|
||||
coord_precision=COORD_PREC,
|
||||
confid_precision=CONFID_PREC,
|
||||
)
|
||||
act_doc.save_as_yaml(
|
||||
yaml_gt_path,
|
||||
coord_precision=COORD_PREC,
|
||||
confid_precision=CONFID_PREC,
|
||||
)
|
||||
else:
|
||||
with open(md_gt_path, encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
@@ -54,4 +53,4 @@ def test_convert_valid():
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||
assert act_doc == exp_doc
|
||||
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
|
||||
|
||||
Reference in New Issue
Block a user