diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 19a21c19..652e0f27 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -11,10 +11,10 @@ import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
- DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
+ GroupItem,
GroupLabel,
NodeItem,
TableCell,
@@ -35,6 +35,27 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
+def extract_text_recursive(children: list[marko.element.Element]):
+    """Concatenate, in document order, all text found under ``children``.
+
+    Every node is inspected for a ``children`` attribute: a list is descended
+    into, a string is collected as text, anything else is ignored.
+    """
+
+    def iter_text(nodes: list[marko.element.Element]):
+        # Depth-first, left-to-right walk that yields each text fragment
+        for node in nodes:
+            content = getattr(node, "children", None)
+            if isinstance(content, list):
+                # Container node: its text lives further down the tree
+                yield from iter_text(content)
+            elif isinstance(content, str):
+                # Leaf node: "children" is the text itself
+                yield content
+
+    return "".join(iter_text(children))
+
+
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
@@ -175,8 +196,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
+ index: int = 0,
):
-
if element in visited:
return
@@ -195,22 +216,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
- strings: List[str] = []
-
- # Define a recursive function to traverse the tree
- def traverse(node: marko.block.BlockElement):
- # Check if the node has a "children" attribute
- if hasattr(node, "children"):
- # If "children" is a list, continue traversal
- if isinstance(node.children, list):
- for child in node.children:
- traverse(child)
- # If "children" is text, add it to header text
- elif isinstance(node.children, str):
- strings.append(node.children)
-
- traverse(element)
- snippet_text = "".join(strings)
+ snippet_text = extract_text_recursive(list(element.children))
if len(snippet_text) > 0:
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
@@ -237,19 +243,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
- first_child = element.children[0]
- snippet_text = str(first_child.children[0].children) # type: ignore
+ text_children = [
+ child
+ for child in element.children
+ if not isinstance(child, marko.block.List)
+ ]
+
+ snippet_text = extract_text_recursive(text_children)
is_numbered = False
if (
parent_item is not None
- and isinstance(parent_item, DocItem)
+ and isinstance(parent_item, GroupItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
- doc.add_list_item(
- enumerated=is_numbered, parent=parent_item, text=snippet_text
- )
- visited.add(first_child)
+ if len(snippet_text) > 0:
+ marker = f"{index + 1}." if is_numbered else "-"
+ doc.add_list_item(
+ enumerated=is_numbered,
+ parent=parent_item,
+ text=snippet_text,
+ marker=marker,
+ )
+ for child in text_children:
+ visited.add(child)
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
@@ -335,13 +352,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if hasattr(element, "children") and not isinstance(
element, processed_block_types
):
- for child in element.children:
+ for index, child in enumerate(element.children):
self._iterate_elements(
element=child,
depth=depth + 1,
doc=doc,
visited=visited,
parent_item=parent_item,
+ index=index,
)
def is_valid(self) -> bool:
diff --git a/tests/data/groundtruth/docling_v2/blocks.md.md b/tests/data/groundtruth/docling_v2/blocks.md.md
index 5269e7d8..6a194066 100644
--- a/tests/data/groundtruth/docling_v2/blocks.md.md
+++ b/tests/data/groundtruth/docling_v2/blocks.md.md
@@ -6,7 +6,7 @@ Empty unordered list:
Ordered list:
-- bar
+1. bar
Empty ordered list:
diff --git a/tests/data/groundtruth/docling_v2/lists.md.md b/tests/data/groundtruth/docling_v2/lists.md.md
new file mode 100644
index 00000000..995999b1
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/lists.md.md
@@ -0,0 +1,24 @@
+# Contributing
+
+this is awesome stuff
+
+1. Pull the repository
+ - Make sure to clone the full repository
+2. Create your feature branch (git checkout -b feature/AmazingFeature)
+3. Commit your changes (git commit -m 'Add some AmazingFeature')
+4. Push to the branch (git push origin feature/AmazingFeature)
+5. Open a Pull Request
+
+# Example
+
+1. Item 1 bold: Item 1 description
+2. Item 2 bold: Item 2 description
+3. Item 3 bold: Item 3 description
+
+# Images
+
+- <img src="image-01.png" alt="Image 1"> Image 1
+- <img src="image-02.png" alt="Image 2"> Image 2
+- <img src="image-03.png" alt="Image 3"> Image 3
+
+# Empty
diff --git a/tests/data/md/lists.md b/tests/data/md/lists.md
new file mode 100644
index 00000000..5d6434d0
--- /dev/null
+++ b/tests/data/md/lists.md
@@ -0,0 +1,26 @@
+# Contributing
+
+> this is *awesome* stuff
+
+1. Pull the repository
+ * Make sure to clone the full repository
+2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+# Example
+
+1. **Item 1 bold**: Item 1 description
+2. **Item 2 bold**: Item 2 description
+3. **Item 3 bold**: Item 3 description
+
+# Images
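+* <img src="image-01.png" alt="Image 1"> Image 1
+* <img src="image-02.png" alt="Image 2"> Image 2
+* <img src="image-03.png" alt="Image 3"> Image 3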
+
+# Empty
+*
+*
+*