mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fix(markdown): ensure correct parsing of nested lists (#1995)
* fix(markdown): ensure correct parsing of nested lists

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update dependencies in uv.lock file

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
1985841a19
commit
aec29a7315
@ -5,7 +5,7 @@ from copy import deepcopy
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Set, Union
|
||||
from typing import Literal, Optional, Union, cast
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
@ -14,6 +14,7 @@ from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
ListItem,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
_log.debug("MD INIT!!!")
|
||||
_log.debug("Starting MarkdownDocumentBackend...")
|
||||
|
||||
# Markdown file:
|
||||
self.path_or_stream = path_or_stream
|
||||
@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
for md_table_row in self.md_table_buffer:
|
||||
_log.debug(md_table_row)
|
||||
_log.debug("=== TABLE END ===")
|
||||
tcells: List[TableCell] = []
|
||||
tcells: list[TableCell] = []
|
||||
result_table = []
|
||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||
data = []
|
||||
@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
visited: Set[marko.element.Element],
|
||||
visited: set[marko.element.Element],
|
||||
creation_stack: list[
|
||||
_CreationPayload
|
||||
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
||||
list_ordered_flag_by_ref: dict[str, bool],
|
||||
list_last_item_by_ref: dict[str, ListItem],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and len(element.children) == 1
|
||||
and len(element.children) > 0
|
||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||
and len(child.children) > 0
|
||||
):
|
||||
@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
if len(child.children) > 1: # inline group will be created further down
|
||||
non_list_children: list[marko.element.Element] = [
|
||||
item
|
||||
for item in child.children
|
||||
if not isinstance(item, marko.block.ListItem)
|
||||
]
|
||||
if len(non_list_children) > 1: # inline group will be created further down
|
||||
parent_ref: Optional[str] = (
|
||||
parent_item.self_ref if parent_item else None
|
||||
)
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if parent_ref:
|
||||
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
|
||||
else:
|
||||
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
||||
|
||||
@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element.dest
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
snippet_text = element.children.strip()
|
||||
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
|
||||
_log.debug(f" - RawText/Literal: {element.children}")
|
||||
snippet_text = (
|
||||
element.children.strip() if isinstance(element.children, str) else ""
|
||||
)
|
||||
# Detect start of the table:
|
||||
if "|" in snippet_text or self.in_table:
|
||||
# most likely part of the markdown table
|
||||
@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
parent_ref = parent_item.self_ref if parent_item else None
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if parent_ref:
|
||||
list_last_item_by_ref[parent_ref] = cast(
|
||||
ListItem, parent_item
|
||||
)
|
||||
|
||||
elif isinstance(to_create, _HeadingCreationPayload):
|
||||
# not keeping as parent_item as logic for correctly tracking
|
||||
# that not implemented yet (section components not captured
|
||||
@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element, processed_block_types
|
||||
):
|
||||
for child in element.children:
|
||||
if (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and isinstance(child, marko.block.List)
|
||||
and parent_item
|
||||
and list_last_item_by_ref.get(parent_item.self_ref, None)
|
||||
):
|
||||
_log.debug(
|
||||
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
|
||||
)
|
||||
parent_item = list_last_item_by_ref[parent_item.self_ref]
|
||||
|
||||
self._iterate_elements(
|
||||
element=child,
|
||||
depth=depth + 1,
|
||||
@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited=visited,
|
||||
creation_stack=creation_stack,
|
||||
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
||||
list_last_item_by_ref=list_last_item_by_ref,
|
||||
parent_item=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.MD}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited=set(),
|
||||
creation_stack=[],
|
||||
list_ordered_flag_by_ref={},
|
||||
list_last_item_by_ref={},
|
||||
)
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
]:
|
||||
html_str = _restore_original_html(txt=html_str, regex=regex)
|
||||
self._html_blocks = 0
|
||||
|
||||
# delegate to HTML backend
|
||||
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
||||
in_doc = InputDocument(
|
||||
|
@ -3,6 +3,6 @@
|
||||
- A. first
|
||||
- subitem
|
||||
- B. second
|
||||
1. strange
|
||||
- 2 . strange
|
||||
|
||||
The end!
|
||||
|
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
@ -0,0 +1,139 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/2'
|
||||
content_layer: body
|
||||
label: section
|
||||
name: header-1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/0'
|
||||
- children:
|
||||
- $ref: '#/texts/3'
|
||||
- $ref: '#/texts/5'
|
||||
- $ref: '#/texts/6'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/texts/2'
|
||||
self_ref: '#/groups/1'
|
||||
- children:
|
||||
- $ref: '#/texts/4'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/texts/3'
|
||||
self_ref: '#/groups/2'
|
||||
key_value_items: []
|
||||
name: mixed_without_h1
|
||||
origin:
|
||||
binary_hash: 7394721163373597328
|
||||
filename: mixed_without_h1.md
|
||||
mimetype: text/html
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables: []
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: furniture
|
||||
label: title
|
||||
orig: mixed_without_h1
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: mixed_without_h1
|
||||
- children: []
|
||||
content_layer: furniture
|
||||
label: text
|
||||
orig: Content before first heading
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: Content before first heading
|
||||
- children:
|
||||
- $ref: '#/groups/1'
|
||||
- $ref: '#/texts/7'
|
||||
content_layer: body
|
||||
label: section_header
|
||||
level: 1
|
||||
orig: Some heading
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: Some heading
|
||||
- children:
|
||||
- $ref: '#/groups/2'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: A. first
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: A. first
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: subitem
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/4'
|
||||
text: subitem
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: B. second
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/5'
|
||||
text: B. second
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: 2 . strange
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/6'
|
||||
text: 2 . strange
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: The end!
|
||||
parent:
|
||||
$ref: '#/texts/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/7'
|
||||
text: The end!
|
||||
version: 1.5.0
|
2
tests/data/md/mixed_without_h1.md
vendored
2
tests/data/md/mixed_without_h1.md
vendored
@ -7,6 +7,6 @@ Content before first heading
|
||||
- A. first
|
||||
- subitem
|
||||
- B. second
|
||||
- 2. strange
|
||||
- 2\. strange
|
||||
|
||||
The end!
|
||||
|
@ -16,7 +16,7 @@ def test_convert_valid():
|
||||
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||
assert len(relevant_paths) > 0
|
||||
|
||||
yaml_filter = ["inline_and_formatting"]
|
||||
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
|
||||
|
||||
for in_path in relevant_paths:
|
||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
@ -41,7 +41,6 @@ def test_convert_valid():
|
||||
f.write(f"{act_data}\n")
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
||||
act_doc.save_as_yaml(
|
||||
yaml_gt_path,
|
||||
coord_precision=COORD_PREC,
|
||||
@ -54,4 +53,4 @@ def test_convert_valid():
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||
assert act_doc == exp_doc
|
||||
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
|
||||
|
Loading…
Reference in New Issue
Block a user