mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
fix: Improve markdown list parser
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
parent
d8a81c3168
commit
c075d8d765
@ -11,10 +11,10 @@ import marko.ext
|
|||||||
import marko.ext.gfm
|
import marko.ext.gfm
|
||||||
import marko.inline
|
import marko.inline
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
|
GroupItem,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
NodeItem,
|
NodeItem,
|
||||||
TableCell,
|
TableCell,
|
||||||
@ -35,6 +35,27 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|||||||
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_recursive(children: "list[marko.element.Element]") -> str:
    """Concatenate every text fragment found beneath ``children``.

    The nodes are walked depth-first, in document order. For each node the
    ``children`` attribute decides what happens:

    * a ``str`` is a text leaf and is appended to the result;
    * a ``list`` or ``tuple`` is a nested level and is walked recursively;
    * anything else (attribute missing, ``None``, ...) contributes nothing.

    Args:
        children: The nodes to walk. Any iterable of nodes works; nodes only
            need an optional ``children`` attribute.

    Returns:
        All collected text fragments joined without a separator, or ``""``
        when no text was found.
    """
    fragments: list[str] = []

    def collect(nodes) -> None:
        for node in nodes:
            payload = getattr(node, "children", None)
            if isinstance(payload, str):
                # Text leaf: keep the raw string.
                fragments.append(payload)
            elif isinstance(payload, (list, tuple)):
                # Nested elements: tuples are accepted as well as lists so
                # that text is not silently dropped when a node stores its
                # children in an immutable sequence.
                collect(payload)

    collect(children)
    return "".join(fragments)
|
||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||||
# This regex will match any sequence of underscores
|
# This regex will match any sequence of underscores
|
||||||
@ -175,8 +196,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
visited: Set[marko.element.Element],
|
visited: Set[marko.element.Element],
|
||||||
parent_item: Optional[NodeItem] = None,
|
parent_item: Optional[NodeItem] = None,
|
||||||
|
index: int = 0,
|
||||||
):
|
):
|
||||||
|
|
||||||
if element in visited:
|
if element in visited:
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -195,22 +216,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||||
# hence we need to traverse the tree to get full text of a header
|
# hence we need to traverse the tree to get full text of a header
|
||||||
strings: List[str] = []
|
snippet_text = extract_text_recursive(list(element.children))
|
||||||
|
|
||||||
# Define a recursive function to traverse the tree
|
|
||||||
def traverse(node: marko.block.BlockElement):
|
|
||||||
# Check if the node has a "children" attribute
|
|
||||||
if hasattr(node, "children"):
|
|
||||||
# If "children" is a list, continue traversal
|
|
||||||
if isinstance(node.children, list):
|
|
||||||
for child in node.children:
|
|
||||||
traverse(child)
|
|
||||||
# If "children" is text, add it to header text
|
|
||||||
elif isinstance(node.children, str):
|
|
||||||
strings.append(node.children)
|
|
||||||
|
|
||||||
traverse(element)
|
|
||||||
snippet_text = "".join(strings)
|
|
||||||
if len(snippet_text) > 0:
|
if len(snippet_text) > 0:
|
||||||
parent_item = doc.add_text(
|
parent_item = doc.add_text(
|
||||||
label=doc_label, parent=parent_item, text=snippet_text
|
label=doc_label, parent=parent_item, text=snippet_text
|
||||||
@ -237,19 +243,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._process_inline_text(parent_item, doc)
|
self._process_inline_text(parent_item, doc)
|
||||||
_log.debug(" - List item")
|
_log.debug(" - List item")
|
||||||
|
|
||||||
first_child = element.children[0]
|
text_children = [
|
||||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
child
|
||||||
|
for child in element.children
|
||||||
|
if not isinstance(child, marko.block.List)
|
||||||
|
]
|
||||||
|
|
||||||
|
snippet_text = extract_text_recursive(text_children)
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
if (
|
if (
|
||||||
parent_item is not None
|
parent_item is not None
|
||||||
and isinstance(parent_item, DocItem)
|
and isinstance(parent_item, GroupItem)
|
||||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
and parent_item.label == GroupLabel.ORDERED_LIST
|
||||||
):
|
):
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
doc.add_list_item(
|
if len(snippet_text) > 0:
|
||||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
marker = f"{index + 1}." if is_numbered else "-"
|
||||||
)
|
doc.add_list_item(
|
||||||
visited.add(first_child)
|
enumerated=is_numbered,
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
marker=marker,
|
||||||
|
)
|
||||||
|
for child in text_children:
|
||||||
|
visited.add(child)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
@ -335,13 +352,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if hasattr(element, "children") and not isinstance(
|
if hasattr(element, "children") and not isinstance(
|
||||||
element, processed_block_types
|
element, processed_block_types
|
||||||
):
|
):
|
||||||
for child in element.children:
|
for index, child in enumerate(element.children):
|
||||||
self._iterate_elements(
|
self._iterate_elements(
|
||||||
element=child,
|
element=child,
|
||||||
depth=depth + 1,
|
depth=depth + 1,
|
||||||
doc=doc,
|
doc=doc,
|
||||||
visited=visited,
|
visited=visited,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
|
index=index,
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
|
@ -6,7 +6,7 @@ Empty unordered list:
|
|||||||
|
|
||||||
Ordered list:
|
Ordered list:
|
||||||
|
|
||||||
- bar
|
1. bar
|
||||||
|
|
||||||
Empty ordered list:
|
Empty ordered list:
|
||||||
|
|
||||||
|
24
tests/data/groundtruth/docling_v2/lists.md.md
Normal file
24
tests/data/groundtruth/docling_v2/lists.md.md
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# Contributing
|
||||||
|
|
||||||
|
this is awesome stuff
|
||||||
|
|
||||||
|
1. Pull the repository
|
||||||
|
- Make sure to clone the full repository
|
||||||
|
2. Create your feature branch (git checkout -b feature/AmazingFeature)
|
||||||
|
3. Commit your changes (git commit -m 'Add some AmazingFeature')
|
||||||
|
4. Push to the branch (git push origin feature/AmazingFeature)
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
# Example
|
||||||
|
|
||||||
|
1. Item 1 bold: Item 1 description
|
||||||
|
2. Item 2 bold: Item 2 description
|
||||||
|
3. Item 3 bold: Item 3 description
|
||||||
|
|
||||||
|
# Images
|
||||||
|
|
||||||
|
- <img src="image-01.png" alt="Image 1"> Image 1
|
||||||
|
- <img src="image-02.png" alt="Image 2"> Image 2
|
||||||
|
- <img src="image-03.png" alt="Image 3"> Image 3
|
||||||
|
|
||||||
|
# Empty
|
26
tests/data/md/lists.md
Normal file
26
tests/data/md/lists.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Contributing
|
||||||
|
|
||||||
|
> this is *awesome* stuff
|
||||||
|
|
||||||
|
1. Pull the repository
|
||||||
|
* Make sure to clone the full repository
|
||||||
|
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
||||||
|
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
||||||
|
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
# Example
|
||||||
|
|
||||||
|
1. **Item 1 bold**: Item 1 description
|
||||||
|
2. **Item 2 bold**: Item 2 description
|
||||||
|
3. **Item 3 bold**: Item 3 description
|
||||||
|
|
||||||
|
# Images
|
||||||
|
* <img src="image-01.png" alt="Image 1"> Image 1
|
||||||
|
* <img src="image-02.png" alt="Image 2"> Image 2
|
||||||
|
* <img src="image-03.png" alt="Image 3"> Image 3
|
||||||
|
|
||||||
|
# Empty
|
||||||
|
* <img src="about:blank" alt="blank">
|
||||||
|
* <img src="about:blank" alt="blank">
|
||||||
|
* <img src="about:blank" alt="blank">
|
Loading…
Reference in New Issue
Block a user