fix: Improve markdown list parser

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer 2025-02-23 13:35:37 +08:00
parent d8a81c3168
commit c075d8d765
4 changed files with 95 additions and 27 deletions

View File

@ -11,10 +11,10 @@ import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
NodeItem,
TableCell,
@ -35,6 +35,27 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
def extract_text_recursive(children: list[marko.element.Element]):
    """Concatenate every text fragment found beneath ``children``.

    The nodes are walked depth-first, left to right. For each node:

    * if its ``children`` attribute is a ``list``, the walk descends into it;
    * if its ``children`` attribute is a ``str``, that string is collected;
    * otherwise (no ``children`` attribute, or some other type) the node
      contributes nothing.

    Args:
        children: The nodes to start from.

    Returns:
        All collected strings joined together with no separator, in
        document order. An empty string when nothing was collected.
    """

    def iter_fragments(nodes):
        # Lazily yield text fragments in the order they are encountered.
        for node in nodes:
            content = getattr(node, "children", None)
            if isinstance(content, list):
                # Container node: its text lives further down the tree.
                yield from iter_fragments(content)
            elif isinstance(content, str):
                # Leaf node: ``children`` is the text itself.
                yield content

    return "".join(iter_fragments(children))
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
@ -175,8 +196,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
index: int = 0,
):
if element in visited:
return
@ -195,22 +216,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
snippet_text = extract_text_recursive(list(element.children))
if len(snippet_text) > 0:
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
@ -237,19 +243,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
first_child = element.children[0]
snippet_text = str(first_child.children[0].children) # type: ignore
text_children = [
child
for child in element.children
if not isinstance(child, marko.block.List)
]
snippet_text = extract_text_recursive(text_children)
is_numbered = False
if (
parent_item is not None
and isinstance(parent_item, DocItem)
and isinstance(parent_item, GroupItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_item, text=snippet_text
)
visited.add(first_child)
if len(snippet_text) > 0:
marker = f"{index + 1}." if is_numbered else "-"
doc.add_list_item(
enumerated=is_numbered,
parent=parent_item,
text=snippet_text,
marker=marker,
)
for child in text_children:
visited.add(child)
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
@ -335,13 +352,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if hasattr(element, "children") and not isinstance(
element, processed_block_types
):
for child in element.children:
for index, child in enumerate(element.children):
self._iterate_elements(
element=child,
depth=depth + 1,
doc=doc,
visited=visited,
parent_item=parent_item,
index=index,
)
def is_valid(self) -> bool:

View File

@ -6,7 +6,7 @@ Empty unordered list:
Ordered list:
- bar
1. bar
Empty ordered list:

View File

@ -0,0 +1,24 @@
# Contributing
this is awesome stuff
1. Pull the repository
- Make sure to clone the full repository
2. Create your feature branch (git checkout -b feature/AmazingFeature)
3. Commit your changes (git commit -m 'Add some AmazingFeature')
4. Push to the branch (git push origin feature/AmazingFeature)
5. Open a Pull Request
# Example
1. Item 1 bold: Item 1 description
2. Item 2 bold: Item 2 description
3. Item 3 bold: Item 3 description
# Images
- &lt;img src="image-01.png" alt="Image 1"&gt; Image 1
- &lt;img src="image-02.png" alt="Image 2"&gt; Image 2
- &lt;img src="image-03.png" alt="Image 3"&gt; Image 3
# Empty

26
tests/data/md/lists.md Normal file
View File

@ -0,0 +1,26 @@
# Contributing
> this is *awesome* stuff
1. Pull the repository
* Make sure to clone the full repository
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
# Example
1. **Item 1 bold**: Item 1 description
2. **Item 2 bold**: Item 2 description
3. **Item 3 bold**: Item 3 description
# Images
* <img src="image-01.png" alt="Image 1"> Image 1
* <img src="image-02.png" alt="Image 2"> Image 2
* <img src="image-03.png" alt="Image 3"> Image 3
# Empty
* <img src="about:blank" alt="blank">
* <img src="about:blank" alt="blank">
* <img src="about:blank" alt="blank">