From c075d8d7651eee0c22e38a46ce8ab08511d8bff8 Mon Sep 17 00:00:00 2001
From: Tobias Strebitzer
Date: Sun, 23 Feb 2025 13:35:37 +0800
Subject: [PATCH] fix: Improve markdown list parser

Signed-off-by: Tobias Strebitzer
---
 docling/backend/md_backend.py                 | 70 ++++++++++++-------
 .../data/groundtruth/docling_v2/blocks.md.md  |  2 +-
 tests/data/groundtruth/docling_v2/lists.md.md | 24 +++++++
 tests/data/md/lists.md                        | 26 +++++++
 4 files changed, 95 insertions(+), 27 deletions(-)
 create mode 100644 tests/data/groundtruth/docling_v2/lists.md.md
 create mode 100644 tests/data/md/lists.md

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 19a21c19..652e0f27 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -11,10 +11,10 @@ import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    GroupItem,
     GroupLabel,
     NodeItem,
     TableCell,
@@ -35,6 +35,27 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
 _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 
 
+def extract_text_recursive(children: List[marko.element.Element]) -> str:
+    """Concatenate the raw text of every node below ``children``.
+
+    Line breaks become one space so adjacent source lines are not fused.
+    """
+    strings: List[str] = []
+
+    def traverse(nodes: List[marko.element.Element]) -> None:
+        for node in nodes:
+            kids = getattr(node, "children", None)
+            if isinstance(node, marko.inline.LineBreak):
+                strings.append(" ")
+            elif isinstance(kids, list):
+                traverse(kids)
+            elif isinstance(kids, str):
+                strings.append(kids)
+
+    traverse(children)
+    return "".join(strings)
+
+
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
@@ -175,8 +196,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         doc: DoclingDocument,
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
+        index: int = 0,
     ):
-
         if element in visited:
             return
 
@@ -195,22 +216,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-
-            traverse(element)
-            snippet_text = "".join(strings)
+            snippet_text = extract_text_recursive(list(element.children))
             if len(snippet_text) > 0:
                 parent_item = doc.add_text(
                     label=doc_label, parent=parent_item, text=snippet_text
@@ -237,19 +243,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
 
-            first_child = element.children[0]
-            snippet_text = str(first_child.children[0].children)  # type: ignore
+            text_children = [
+                child
+                for child in element.children
+                if not isinstance(child, marko.block.List)
+            ]
+
+            snippet_text = extract_text_recursive(text_children)
             is_numbered = False
             if (
                 parent_item is not None
-                and isinstance(parent_item, DocItem)
+                and isinstance(parent_item, GroupItem)
                 and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
-            )
-            visited.add(first_child)
+            if len(snippet_text) > 0:
+                marker = f"{index + 1}." if is_numbered else "-"
+                doc.add_list_item(
+                    enumerated=is_numbered,
+                    parent=parent_item,
+                    text=snippet_text,
+                    marker=marker,
+                )
+            for child in text_children:
+                visited.add(child)
 
         elif isinstance(element, marko.inline.Image):
             self._close_table(doc)
@@ -335,13 +352,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         if hasattr(element, "children") and not isinstance(
             element, processed_block_types
         ):
-            for child in element.children:
+            for index, child in enumerate(element.children):
                 self._iterate_elements(
                     element=child,
                     depth=depth + 1,
                     doc=doc,
                     visited=visited,
                     parent_item=parent_item,
+                    index=index,
                 )
 
     def is_valid(self) -> bool:
diff --git a/tests/data/groundtruth/docling_v2/blocks.md.md b/tests/data/groundtruth/docling_v2/blocks.md.md
index 5269e7d8..6a194066 100644
--- a/tests/data/groundtruth/docling_v2/blocks.md.md
+++ b/tests/data/groundtruth/docling_v2/blocks.md.md
@@ -6,7 +6,7 @@ Empty unordered list:
 
 Ordered list:
 
-- bar
+1. bar
 
 Empty ordered list:
 
diff --git a/tests/data/groundtruth/docling_v2/lists.md.md b/tests/data/groundtruth/docling_v2/lists.md.md
new file mode 100644
index 00000000..995999b1
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/lists.md.md
@@ -0,0 +1,24 @@
+# Contributing
+
+this is awesome stuff
+
+1. Pull the repository
+    - Make sure to clone the full repository
+2. Create your feature branch (git checkout -b feature/AmazingFeature)
+3. Commit your changes (git commit -m 'Add some AmazingFeature')
+4. Push to the branch (git push origin feature/AmazingFeature)
+5. Open a Pull Request
+
+# Example
+
+1. Item 1 bold: Item 1 description
+2. Item 2 bold: Item 2 description
+3. Item 3 bold: Item 3 description
+
+# Images
+
+- <img src="image-01.png" alt="Image 1"> Image 1
+- <img src="image-02.png" alt="Image 2"> Image 2
+- <img src="image-03.png" alt="Image 3"> Image 3
+
+# Empty
diff --git a/tests/data/md/lists.md b/tests/data/md/lists.md
new file mode 100644
index 00000000..5d6434d0
--- /dev/null
+++ b/tests/data/md/lists.md
@@ -0,0 +1,26 @@
+# Contributing
+
+> this is *awesome* stuff
+
+1. Pull the repository
+    * Make sure to clone the full repository
+2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+# Example
+
+1. **Item 1 bold**: Item 1 description
+2. **Item 2 bold**: Item 2 description
+3. **Item 3 bold**: Item 3 description
+
+# Images
+* Image 1 Image 1
+* Image 2 Image 2
+* Image 3 Image 3
+
+# Empty
+* blank
+* blank
+* blank