mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
fix(markdown): ensure correct parsing of nested lists (#1995)
* fix(markdown): ensure correct parsing of nested lists

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update dependencies in uv.lock file

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1985841a19
commit
aec29a7315
@@ -5,7 +5,7 @@ from copy import deepcopy
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Set, Union
|
||||
from typing import Literal, Optional, Union, cast
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
ListItem,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
_log.debug("MD INIT!!!")
|
||||
_log.debug("Starting MarkdownDocumentBackend...")
|
||||
|
||||
# Markdown file:
|
||||
self.path_or_stream = path_or_stream
|
||||
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
for md_table_row in self.md_table_buffer:
|
||||
_log.debug(md_table_row)
|
||||
_log.debug("=== TABLE END ===")
|
||||
tcells: List[TableCell] = []
|
||||
tcells: list[TableCell] = []
|
||||
result_table = []
|
||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||
data = []
|
||||
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
visited: Set[marko.element.Element],
|
||||
visited: set[marko.element.Element],
|
||||
creation_stack: list[
|
||||
_CreationPayload
|
||||
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
||||
list_ordered_flag_by_ref: dict[str, bool],
|
||||
list_last_item_by_ref: dict[str, ListItem],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and len(element.children) == 1
|
||||
and len(element.children) > 0
|
||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||
and len(child.children) > 0
|
||||
):
|
||||
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
if len(child.children) > 1: # inline group will be created further down
|
||||
non_list_children: list[marko.element.Element] = [
|
||||
item
|
||||
for item in child.children
|
||||
if not isinstance(item, marko.block.ListItem)
|
||||
]
|
||||
if len(non_list_children) > 1: # inline group will be created further down
|
||||
parent_ref: Optional[str] = (
|
||||
parent_item.self_ref if parent_item else None
|
||||
)
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if parent_ref:
|
||||
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
|
||||
else:
|
||||
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
||||
|
||||
@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element.dest
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
snippet_text = element.children.strip()
|
||||
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
|
||||
_log.debug(f" - RawText/Literal: {element.children}")
|
||||
snippet_text = (
|
||||
element.children.strip() if isinstance(element.children, str) else ""
|
||||
)
|
||||
# Detect start of the table:
|
||||
if "|" in snippet_text or self.in_table:
|
||||
# most likely part of the markdown table
|
||||
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
parent_ref = parent_item.self_ref if parent_item else None
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if parent_ref:
|
||||
list_last_item_by_ref[parent_ref] = cast(
|
||||
ListItem, parent_item
|
||||
)
|
||||
|
||||
elif isinstance(to_create, _HeadingCreationPayload):
|
||||
# not keeping as parent_item as logic for correctly tracking
|
||||
# that not implemented yet (section components not captured
|
||||
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element, processed_block_types
|
||||
):
|
||||
for child in element.children:
|
||||
if (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and isinstance(child, marko.block.List)
|
||||
and parent_item
|
||||
and list_last_item_by_ref.get(parent_item.self_ref, None)
|
||||
):
|
||||
_log.debug(
|
||||
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
|
||||
)
|
||||
parent_item = list_last_item_by_ref[parent_item.self_ref]
|
||||
|
||||
self._iterate_elements(
|
||||
element=child,
|
||||
depth=depth + 1,
|
||||
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited=visited,
|
||||
creation_stack=creation_stack,
|
||||
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
||||
list_last_item_by_ref=list_last_item_by_ref,
|
||||
parent_item=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.MD}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited=set(),
|
||||
creation_stack=[],
|
||||
list_ordered_flag_by_ref={},
|
||||
list_last_item_by_ref={},
|
||||
)
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
]:
|
||||
html_str = _restore_original_html(txt=html_str, regex=regex)
|
||||
self._html_blocks = 0
|
||||
|
||||
# delegate to HTML backend
|
||||
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
||||
in_doc = InputDocument(
|
||||
|
||||
Reference in New Issue
Block a user