mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
fix: Improve markdown list parser
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
parent
d8a81c3168
commit
c075d8d765
@ -11,10 +11,10 @@ import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
@ -35,6 +35,27 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
||||
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||
|
||||
|
||||
def extract_text_recursive(children: list[marko.element.Element]) -> str:
    """Return the concatenated text found beneath ``children``.

    Each node is inspected for a ``children`` attribute:

    * if it is a ``list``, the list is descended into recursively;
    * if it is a ``str``, the string is taken as a text fragment;
    * otherwise (attribute missing, or any other type) the node is skipped.

    Fragments are joined in document (depth-first, left-to-right) order
    with no separator, so inline wrappers such as bold or emphasis do not
    interrupt the surrounding text.

    Args:
        children: The nodes to walk.

    Returns:
        The joined text, or an empty string when no text fragment is found.
    """

    def iter_text(nodes: list[marko.element.Element]):
        # Lazily yield text fragments in depth-first order.
        for node in nodes:
            # ``None`` doubles as the "no children attribute" marker; it is
            # neither a list nor a str, so such nodes fall through untouched.
            inner = getattr(node, "children", None)
            if isinstance(inner, list):
                yield from iter_text(inner)
            elif isinstance(inner, str):
                yield inner

    return "".join(iter_text(children))
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
# This regex will match any sequence of underscores
|
||||
@ -175,8 +196,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
index: int = 0,
|
||||
):
|
||||
|
||||
if element in visited:
|
||||
return
|
||||
|
||||
@ -195,22 +216,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||
# hence we need to traverse the tree to get full text of a header
|
||||
strings: List[str] = []
|
||||
|
||||
# Define a recursive function to traverse the tree
|
||||
def traverse(node: marko.block.BlockElement):
|
||||
# Check if the node has a "children" attribute
|
||||
if hasattr(node, "children"):
|
||||
# If "children" is a list, continue traversal
|
||||
if isinstance(node.children, list):
|
||||
for child in node.children:
|
||||
traverse(child)
|
||||
# If "children" is text, add it to header text
|
||||
elif isinstance(node.children, str):
|
||||
strings.append(node.children)
|
||||
|
||||
traverse(element)
|
||||
snippet_text = "".join(strings)
|
||||
snippet_text = extract_text_recursive(list(element.children))
|
||||
if len(snippet_text) > 0:
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
@ -237,19 +243,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
first_child = element.children[0]
|
||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
||||
text_children = [
|
||||
child
|
||||
for child in element.children
|
||||
if not isinstance(child, marko.block.List)
|
||||
]
|
||||
|
||||
snippet_text = extract_text_recursive(text_children)
|
||||
is_numbered = False
|
||||
if (
|
||||
parent_item is not None
|
||||
and isinstance(parent_item, DocItem)
|
||||
and isinstance(parent_item, GroupItem)
|
||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
||||
):
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
||||
)
|
||||
visited.add(first_child)
|
||||
if len(snippet_text) > 0:
|
||||
marker = f"{index + 1}." if is_numbered else "-"
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered,
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
marker=marker,
|
||||
)
|
||||
for child in text_children:
|
||||
visited.add(child)
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self._close_table(doc)
|
||||
@ -335,13 +352,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if hasattr(element, "children") and not isinstance(
|
||||
element, processed_block_types
|
||||
):
|
||||
for child in element.children:
|
||||
for index, child in enumerate(element.children):
|
||||
self._iterate_elements(
|
||||
element=child,
|
||||
depth=depth + 1,
|
||||
doc=doc,
|
||||
visited=visited,
|
||||
parent_item=parent_item,
|
||||
index=index,
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
|
@ -6,7 +6,7 @@ Empty unordered list:
|
||||
|
||||
Ordered list:
|
||||
|
||||
- bar
|
||||
1. bar
|
||||
|
||||
Empty ordered list:
|
||||
|
||||
|
24
tests/data/groundtruth/docling_v2/lists.md.md
Normal file
24
tests/data/groundtruth/docling_v2/lists.md.md
Normal file
@ -0,0 +1,24 @@
|
||||
# Contributing
|
||||
|
||||
this is awesome stuff
|
||||
|
||||
1. Pull the repository
|
||||
- Make sure to clone the full repository
|
||||
2. Create your feature branch (git checkout -b feature/AmazingFeature)
|
||||
3. Commit your changes (git commit -m 'Add some AmazingFeature')
|
||||
4. Push to the branch (git push origin feature/AmazingFeature)
|
||||
5. Open a Pull Request
|
||||
|
||||
# Example
|
||||
|
||||
1. Item 1 bold: Item 1 description
|
||||
2. Item 2 bold: Item 2 description
|
||||
3. Item 3 bold: Item 3 description
|
||||
|
||||
# Images
|
||||
|
||||
- <img src="image-01.png" alt="Image 1"> Image 1
|
||||
- <img src="image-02.png" alt="Image 2"> Image 2
|
||||
- <img src="image-03.png" alt="Image 3"> Image 3
|
||||
|
||||
# Empty
|
26
tests/data/md/lists.md
Normal file
26
tests/data/md/lists.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Contributing
|
||||
|
||||
> this is *awesome* stuff
|
||||
|
||||
1. Pull the repository
|
||||
* Make sure to clone the full repository
|
||||
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
||||
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
||||
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
||||
5. Open a Pull Request
|
||||
|
||||
# Example
|
||||
|
||||
1. **Item 1 bold**: Item 1 description
|
||||
2. **Item 2 bold**: Item 2 description
|
||||
3. **Item 3 bold**: Item 3 description
|
||||
|
||||
# Images
|
||||
* <img src="image-01.png" alt="Image 1"> Image 1
|
||||
* <img src="image-02.png" alt="Image 2"> Image 2
|
||||
* <img src="image-03.png" alt="Image 3"> Image 3
|
||||
|
||||
# Empty
|
||||
* <img src="about:blank" alt="blank">
|
||||
* <img src="about:blank" alt="blank">
|
||||
* <img src="about:blank" alt="blank">
|
Loading…
Reference in New Issue
Block a user