fix(markdown): ensure correct parsing of nested lists

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-07-25 13:16:28 +02:00
parent 1985841a19
commit 82940c47a6
6 changed files with 194 additions and 24 deletions

View File

@ -5,7 +5,7 @@ from copy import deepcopy
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Literal, Optional, Set, Union from typing import Literal, Optional, Union, cast
import marko import marko
import marko.element import marko.element
@ -14,6 +14,7 @@ from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
ListItem,
NodeItem, NodeItem,
TableCell, TableCell,
TableData, TableData,
@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
_log.debug("MD INIT!!!") _log.debug("Starting MarkdownDocumentBackend...")
# Markdown file: # Markdown file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
for md_table_row in self.md_table_buffer: for md_table_row in self.md_table_buffer:
_log.debug(md_table_row) _log.debug(md_table_row)
_log.debug("=== TABLE END ===") _log.debug("=== TABLE END ===")
tcells: List[TableCell] = [] tcells: list[TableCell] = []
result_table = [] result_table = []
for n, md_table_row in enumerate(self.md_table_buffer): for n, md_table_row in enumerate(self.md_table_buffer):
data = [] data = []
@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element: marko.element.Element, element: marko.element.Element,
depth: int, depth: int,
doc: DoclingDocument, doc: DoclingDocument,
visited: Set[marko.element.Element], visited: set[marko.element.Element],
creation_stack: list[ creation_stack: list[
_CreationPayload _CreationPayload
], # stack for lazy item creation triggered deep in marko's AST (on RawText) ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
list_ordered_flag_by_ref: dict[str, bool], list_ordered_flag_by_ref: dict[str, bool],
list_last_item_by_ref: dict[str, ListItem],
parent_item: Optional[NodeItem] = None, parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None, formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None,
@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif ( elif (
isinstance(element, marko.block.ListItem) isinstance(element, marko.block.ListItem)
and len(element.children) == 1 and len(element.children) > 0
and isinstance((child := element.children[0]), marko.block.Paragraph) and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0 and len(child.children) > 0
): ):
@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if parent_item if parent_item
else False else False
) )
if len(child.children) > 1: # inline group will be created further down non_list_children: list[marko.element.Element] = [
item
for item in child.children
if not isinstance(item, marko.block.ListItem)
]
if len(non_list_children) > 1: # inline group will be created further down
parent_ref: Optional[str] = (
parent_item.self_ref if parent_item else None
)
parent_item = self._create_list_item( parent_item = self._create_list_item(
doc=doc, doc=doc,
parent_item=parent_item, parent_item=parent_item,
@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
formatting=formatting, formatting=formatting,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
if parent_ref:
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
else: else:
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element.dest element.dest
) )
elif isinstance(element, marko.inline.RawText): elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
_log.debug(f" - Paragraph (raw text): {element.children}") _log.debug(f" - RawText/Literal: {element.children}")
snippet_text = element.children.strip() snippet_text = (
element.children.strip() if isinstance(element.children, str) else ""
)
# Detect start of the table: # Detect start of the table:
if "|" in snippet_text or self.in_table: if "|" in snippet_text or self.in_table:
# most likely part of the markdown table # most likely part of the markdown table
@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if parent_item if parent_item
else False else False
) )
parent_ref = parent_item.self_ref if parent_item else None
parent_item = self._create_list_item( parent_item = self._create_list_item(
doc=doc, doc=doc,
parent_item=parent_item, parent_item=parent_item,
@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
formatting=formatting, formatting=formatting,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
if parent_ref:
list_last_item_by_ref[parent_ref] = cast(
ListItem, parent_item
)
elif isinstance(to_create, _HeadingCreationPayload): elif isinstance(to_create, _HeadingCreationPayload):
# not keeping as parent_item as logic for correctly tracking # not keeping as parent_item as logic for correctly tracking
# that not implemented yet (section components not captured # that not implemented yet (section components not captured
@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element, processed_block_types element, processed_block_types
): ):
for child in element.children: for child in element.children:
if (
isinstance(element, marko.block.ListItem)
and isinstance(child, marko.block.List)
and parent_item
and list_last_item_by_ref.get(parent_item.self_ref, None)
):
_log.debug(
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
)
parent_item = list_last_item_by_ref[parent_item.self_ref]
self._iterate_elements( self._iterate_elements(
element=child, element=child,
depth=depth + 1, depth=depth + 1,
@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited=visited, visited=visited,
creation_stack=creation_stack, creation_stack=creation_stack,
list_ordered_flag_by_ref=list_ordered_flag_by_ref, list_ordered_flag_by_ref=list_ordered_flag_by_ref,
list_last_item_by_ref=list_last_item_by_ref,
parent_item=parent_item, parent_item=parent_item,
formatting=formatting, formatting=formatting,
hyperlink=hyperlink, hyperlink=hyperlink,
@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
return False return False
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.MD} return {InputFormat.MD}
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited=set(), visited=set(),
creation_stack=[], creation_stack=[],
list_ordered_flag_by_ref={}, list_ordered_flag_by_ref={},
list_last_item_by_ref={},
) )
self._close_table(doc=doc) # handle any last hanging table self._close_table(doc=doc) # handle any last hanging table
@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
]: ]:
html_str = _restore_original_html(txt=html_str, regex=regex) html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0 self._html_blocks = 0
# delegate to HTML backend # delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8")) stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument( in_doc = InputDocument(

View File

@ -3,6 +3,6 @@
- A. first - A. first
- subitem - subitem
- B. second - B. second
1. strange - 2 . strange
The end! The end!

View File

@ -0,0 +1,139 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
content_layer: body
label: section
name: header-1
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/3'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: list
name: list
parent:
$ref: '#/texts/2'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/4'
content_layer: body
label: list
name: list
parent:
$ref: '#/texts/3'
self_ref: '#/groups/2'
key_value_items: []
name: mixed_without_h1
origin:
binary_hash: 7394721163373597328
filename: mixed_without_h1.md
mimetype: text/html
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: furniture
label: title
orig: mixed_without_h1
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: mixed_without_h1
- children: []
content_layer: furniture
label: text
orig: Content before first heading
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: Content before first heading
- children:
- $ref: '#/groups/1'
- $ref: '#/texts/7'
content_layer: body
label: section_header
level: 1
orig: Some heading
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Some heading
- children:
- $ref: '#/groups/2'
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: A. first
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/3'
text: A. first
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: subitem
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/4'
text: subitem
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: B. second
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/5'
text: B. second
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: 2 . strange
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/6'
text: 2 . strange
- children: []
content_layer: body
label: text
orig: The end!
parent:
$ref: '#/texts/2'
prov: []
self_ref: '#/texts/7'
text: The end!
version: 1.5.0

View File

@ -7,6 +7,6 @@ Content before first heading
- A. first - A. first
- subitem - subitem
- B. second - B. second
- 2. strange - 2\. strange
The end! The end!

View File

@ -16,7 +16,7 @@ def test_convert_valid():
relevant_paths = sorted((root_path / "md").rglob("*.md")) relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0 assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"] yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
for in_path in relevant_paths: for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
@ -41,12 +41,11 @@ def test_convert_valid():
f.write(f"{act_data}\n") f.write(f"{act_data}\n")
if in_path.stem in yaml_filter: if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f: act_doc.save_as_yaml(
act_doc.save_as_yaml( yaml_gt_path,
yaml_gt_path, coord_precision=COORD_PREC,
coord_precision=COORD_PREC, confid_precision=CONFID_PREC,
confid_precision=CONFID_PREC, )
)
else: else:
with open(md_gt_path, encoding="utf-8") as f: with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip() exp_data = f.read().rstrip()
@ -54,4 +53,4 @@ def test_convert_valid():
if in_path.stem in yaml_filter: if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc assert act_doc == exp_doc, f"export to yaml failed on {in_path}"

6
uv.lock generated
View File

@ -982,7 +982,7 @@ examples = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.42.0" version = "2.43.1"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "jsonref" }, { name = "jsonref" },
@ -996,9 +996,9 @@ dependencies = [
{ name = "typer" }, { name = "typer" },
{ name = "typing-extensions" }, { name = "typing-extensions" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/60/c9/f5555f8efbbbecce858e78791fbe0b9465c3c91ea917a3a3acdb1c3c9887/docling_core-2.42.0.tar.gz", hash = "sha256:cf2bb9e889920bac1d94412bd89c10e647419b6d5f7fe5e6f71ed732eb8f24f6", size = 154657, upload-time = "2025-07-09T12:27:34.858Z" } sdist = { url = "https://files.pythonhosted.org/packages/a1/eb/c5af5ab617ca162fac7a1b9b89db6e52c33beb50b083b4eed858cea4f4b3/docling_core-2.43.1.tar.gz", hash = "sha256:8bc76879439e4ef6f65e60621fc70e6c81e02cb7490b08a12e416bfb05593180", size = 155583, upload-time = "2025-07-23T14:18:34.149Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/0d/e4/685bb1b38ca120fdffc1c24f1ce54229ff996e5cad50a9c9dd39b671cb83/docling_core-2.42.0-py3-none-any.whl", hash = "sha256:0774391f335217a5aec8357977e66b63b6e8c9d821c56103de54c526eab92ed6", size = 158101, upload-time = "2025-07-09T12:27:33.147Z" }, { url = "https://files.pythonhosted.org/packages/de/a1/25eafa2cfd8e73ff15a23e74d3698dac7608e1ca984081728788dd1444df/docling_core-2.43.1-py3-none-any.whl", hash = "sha256:24364a2344b3324a55fb4eba8cf2396345a90ca782766daa78412b6cdef00776", size = 159318, upload-time = "2025-07-23T14:18:32.576Z" },
] ]
[package.optional-dependencies] [package.optional-dependencies]