fix(markdown): ensure correct parsing of nested lists (#1995)

* fix(markdown): ensure correct parsing of nested lists

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update dependencies in uv.lock file

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-07-25 15:17:57 +02:00 committed by GitHub
parent 1985841a19
commit aec29a7315
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 765 additions and 550 deletions

View File

@ -5,7 +5,7 @@ from copy import deepcopy
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import List, Literal, Optional, Set, Union
from typing import Literal, Optional, Union, cast
import marko
import marko.element
@ -14,6 +14,7 @@ from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
ListItem,
NodeItem,
TableCell,
TableData,
@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("MD INIT!!!")
_log.debug("Starting MarkdownDocumentBackend...")
# Markdown file:
self.path_or_stream = path_or_stream
@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells: List[TableCell] = []
tcells: list[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
visited: set[marko.element.Element],
creation_stack: list[
_CreationPayload
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
list_ordered_flag_by_ref: dict[str, bool],
list_last_item_by_ref: dict[str, ListItem],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif (
isinstance(element, marko.block.ListItem)
and len(element.children) == 1
and len(element.children) > 0
and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
):
@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if parent_item
else False
)
if len(child.children) > 1: # inline group will be created further down
non_list_children: list[marko.element.Element] = [
item
for item in child.children
if not isinstance(item, marko.block.ListItem)
]
if len(non_list_children) > 1: # inline group will be created further down
parent_ref: Optional[str] = (
parent_item.self_ref if parent_item else None
)
parent_item = self._create_list_item(
doc=doc,
parent_item=parent_item,
@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
formatting=formatting,
hyperlink=hyperlink,
)
if parent_ref:
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
else:
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element.dest
)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = element.children.strip()
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
_log.debug(f" - RawText/Literal: {element.children}")
snippet_text = (
element.children.strip() if isinstance(element.children, str) else ""
)
# Detect start of the table:
if "|" in snippet_text or self.in_table:
# most likely part of the markdown table
@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if parent_item
else False
)
parent_ref = parent_item.self_ref if parent_item else None
parent_item = self._create_list_item(
doc=doc,
parent_item=parent_item,
@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
formatting=formatting,
hyperlink=hyperlink,
)
if parent_ref:
list_last_item_by_ref[parent_ref] = cast(
ListItem, parent_item
)
elif isinstance(to_create, _HeadingCreationPayload):
# not keeping as parent_item, since the logic for correctly tracking
# that is not implemented yet (section components not captured
@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element, processed_block_types
):
for child in element.children:
if (
isinstance(element, marko.block.ListItem)
and isinstance(child, marko.block.List)
and parent_item
and list_last_item_by_ref.get(parent_item.self_ref, None)
):
_log.debug(
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
)
parent_item = list_last_item_by_ref[parent_item.self_ref]
self._iterate_elements(
element=child,
depth=depth + 1,
@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited=visited,
creation_stack=creation_stack,
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
list_last_item_by_ref=list_last_item_by_ref,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.MD}
def convert(self) -> DoclingDocument:
@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited=set(),
creation_stack=[],
list_ordered_flag_by_ref={},
list_last_item_by_ref={},
)
self._close_table(doc=doc) # handle any last hanging table
@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
]:
html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0
# delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument(

View File

@ -3,6 +3,6 @@
- A. first
- subitem
- B. second
1. strange
- 2 . strange
The end!

View File

@ -0,0 +1,139 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/3'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: list
name: list
parent:
$ref: '#/texts/2'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/4'
content_layer: body
label: list
name: list
parent:
$ref: '#/texts/3'
self_ref: '#/groups/2'
key_value_items: []
name: mixed_without_h1
origin:
binary_hash: 7394721163373597328
filename: mixed_without_h1.md
mimetype: text/html
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: furniture
label: title
orig: mixed_without_h1
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: mixed_without_h1
- children: []
content_layer: furniture
label: text
orig: Content before first heading
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: Content before first heading
- children:
- $ref: '#/groups/1'
- $ref: '#/texts/7'
content_layer: body
label: section_header
level: 1
orig: Some heading
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Some heading
- children:
- $ref: '#/groups/2'
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: A. first
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/3'
text: A. first
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: subitem
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/4'
text: subitem
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: B. second
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/5'
text: B. second
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: 2 . strange
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/6'
text: 2 . strange
- children: []
content_layer: body
label: text
orig: The end!
parent:
$ref: '#/texts/2'
prov: []
self_ref: '#/texts/7'
text: The end!
version: 1.5.0

View File

@ -7,6 +7,6 @@ Content before first heading
- A. first
- subitem
- B. second
- 2. strange
- 2\. strange
The end!

View File

@ -16,7 +16,7 @@ def test_convert_valid():
relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"]
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
@ -41,12 +41,11 @@ def test_convert_valid():
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(
yaml_gt_path,
coord_precision=COORD_PREC,
confid_precision=CONFID_PREC,
)
act_doc.save_as_yaml(
yaml_gt_path,
coord_precision=COORD_PREC,
confid_precision=CONFID_PREC,
)
else:
with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
@ -54,4 +53,4 @@ def test_convert_valid():
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"

1103
uv.lock generated

File diff suppressed because it is too large Load Diff