mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix(markdown): ensure correct parsing of nested lists (#1995)
* fix(markdown): ensure correct parsing of nested lists

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update dependencies in uv.lock file

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
1985841a19
commit
aec29a7315
@ -5,7 +5,7 @@ from copy import deepcopy
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Literal, Optional, Set, Union
|
from typing import Literal, Optional, Union, cast
|
||||||
|
|
||||||
import marko
|
import marko
|
||||||
import marko.element
|
import marko.element
|
||||||
@ -14,6 +14,7 @@ from docling_core.types.doc import (
|
|||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
|
ListItem,
|
||||||
NodeItem,
|
NodeItem,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
_log.debug("MD INIT!!!")
|
_log.debug("Starting MarkdownDocumentBackend...")
|
||||||
|
|
||||||
# Markdown file:
|
# Markdown file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for md_table_row in self.md_table_buffer:
|
for md_table_row in self.md_table_buffer:
|
||||||
_log.debug(md_table_row)
|
_log.debug(md_table_row)
|
||||||
_log.debug("=== TABLE END ===")
|
_log.debug("=== TABLE END ===")
|
||||||
tcells: List[TableCell] = []
|
tcells: list[TableCell] = []
|
||||||
result_table = []
|
result_table = []
|
||||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||||
data = []
|
data = []
|
||||||
@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
element: marko.element.Element,
|
element: marko.element.Element,
|
||||||
depth: int,
|
depth: int,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
visited: Set[marko.element.Element],
|
visited: set[marko.element.Element],
|
||||||
creation_stack: list[
|
creation_stack: list[
|
||||||
_CreationPayload
|
_CreationPayload
|
||||||
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
||||||
list_ordered_flag_by_ref: dict[str, bool],
|
list_ordered_flag_by_ref: dict[str, bool],
|
||||||
|
list_last_item_by_ref: dict[str, ListItem],
|
||||||
parent_item: Optional[NodeItem] = None,
|
parent_item: Optional[NodeItem] = None,
|
||||||
formatting: Optional[Formatting] = None,
|
formatting: Optional[Formatting] = None,
|
||||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||||
@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif (
|
elif (
|
||||||
isinstance(element, marko.block.ListItem)
|
isinstance(element, marko.block.ListItem)
|
||||||
and len(element.children) == 1
|
and len(element.children) > 0
|
||||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||||
and len(child.children) > 0
|
and len(child.children) > 0
|
||||||
):
|
):
|
||||||
@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if parent_item
|
if parent_item
|
||||||
else False
|
else False
|
||||||
)
|
)
|
||||||
if len(child.children) > 1: # inline group will be created further down
|
non_list_children: list[marko.element.Element] = [
|
||||||
|
item
|
||||||
|
for item in child.children
|
||||||
|
if not isinstance(item, marko.block.ListItem)
|
||||||
|
]
|
||||||
|
if len(non_list_children) > 1: # inline group will be created further down
|
||||||
|
parent_ref: Optional[str] = (
|
||||||
|
parent_item.self_ref if parent_item else None
|
||||||
|
)
|
||||||
parent_item = self._create_list_item(
|
parent_item = self._create_list_item(
|
||||||
doc=doc,
|
doc=doc,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
formatting=formatting,
|
formatting=formatting,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
if parent_ref:
|
||||||
|
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
|
||||||
else:
|
else:
|
||||||
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
||||||
|
|
||||||
@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
element.dest
|
element.dest
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
|
||||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - RawText/Literal: {element.children}")
|
||||||
snippet_text = element.children.strip()
|
snippet_text = (
|
||||||
|
element.children.strip() if isinstance(element.children, str) else ""
|
||||||
|
)
|
||||||
# Detect start of the table:
|
# Detect start of the table:
|
||||||
if "|" in snippet_text or self.in_table:
|
if "|" in snippet_text or self.in_table:
|
||||||
# most likely part of the markdown table
|
# most likely part of the markdown table
|
||||||
@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if parent_item
|
if parent_item
|
||||||
else False
|
else False
|
||||||
)
|
)
|
||||||
|
parent_ref = parent_item.self_ref if parent_item else None
|
||||||
parent_item = self._create_list_item(
|
parent_item = self._create_list_item(
|
||||||
doc=doc,
|
doc=doc,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
formatting=formatting,
|
formatting=formatting,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
if parent_ref:
|
||||||
|
list_last_item_by_ref[parent_ref] = cast(
|
||||||
|
ListItem, parent_item
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(to_create, _HeadingCreationPayload):
|
elif isinstance(to_create, _HeadingCreationPayload):
|
||||||
# not keeping as parent_item as logic for correctly tracking
|
# not keeping as parent_item as logic for correctly tracking
|
||||||
# that not implemented yet (section components not captured
|
# that not implemented yet (section components not captured
|
||||||
@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
element, processed_block_types
|
element, processed_block_types
|
||||||
):
|
):
|
||||||
for child in element.children:
|
for child in element.children:
|
||||||
|
if (
|
||||||
|
isinstance(element, marko.block.ListItem)
|
||||||
|
and isinstance(child, marko.block.List)
|
||||||
|
and parent_item
|
||||||
|
and list_last_item_by_ref.get(parent_item.self_ref, None)
|
||||||
|
):
|
||||||
|
_log.debug(
|
||||||
|
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
|
||||||
|
)
|
||||||
|
parent_item = list_last_item_by_ref[parent_item.self_ref]
|
||||||
|
|
||||||
self._iterate_elements(
|
self._iterate_elements(
|
||||||
element=child,
|
element=child,
|
||||||
depth=depth + 1,
|
depth=depth + 1,
|
||||||
@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
visited=visited,
|
visited=visited,
|
||||||
creation_stack=creation_stack,
|
creation_stack=creation_stack,
|
||||||
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
||||||
|
list_last_item_by_ref=list_last_item_by_ref,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
formatting=formatting,
|
formatting=formatting,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
def supported_formats(cls) -> set[InputFormat]:
|
||||||
return {InputFormat.MD}
|
return {InputFormat.MD}
|
||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
visited=set(),
|
visited=set(),
|
||||||
creation_stack=[],
|
creation_stack=[],
|
||||||
list_ordered_flag_by_ref={},
|
list_ordered_flag_by_ref={},
|
||||||
|
list_last_item_by_ref={},
|
||||||
)
|
)
|
||||||
self._close_table(doc=doc) # handle any last hanging table
|
self._close_table(doc=doc) # handle any last hanging table
|
||||||
|
|
||||||
@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
]:
|
]:
|
||||||
html_str = _restore_original_html(txt=html_str, regex=regex)
|
html_str = _restore_original_html(txt=html_str, regex=regex)
|
||||||
self._html_blocks = 0
|
self._html_blocks = 0
|
||||||
|
|
||||||
# delegate to HTML backend
|
# delegate to HTML backend
|
||||||
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
|
@ -3,6 +3,6 @@
|
|||||||
- A. first
|
- A. first
|
||||||
- subitem
|
- subitem
|
||||||
- B. second
|
- B. second
|
||||||
1. strange
|
- 2 . strange
|
||||||
|
|
||||||
The end!
|
The end!
|
||||||
|
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
139
tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
vendored
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
body:
|
||||||
|
children:
|
||||||
|
- $ref: '#/texts/0'
|
||||||
|
- $ref: '#/texts/1'
|
||||||
|
- $ref: '#/groups/0'
|
||||||
|
content_layer: body
|
||||||
|
label: unspecified
|
||||||
|
name: _root_
|
||||||
|
self_ref: '#/body'
|
||||||
|
form_items: []
|
||||||
|
furniture:
|
||||||
|
children: []
|
||||||
|
content_layer: furniture
|
||||||
|
label: unspecified
|
||||||
|
name: _root_
|
||||||
|
self_ref: '#/furniture'
|
||||||
|
groups:
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/2'
|
||||||
|
content_layer: body
|
||||||
|
label: section
|
||||||
|
name: header-1
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
self_ref: '#/groups/0'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/3'
|
||||||
|
- $ref: '#/texts/5'
|
||||||
|
- $ref: '#/texts/6'
|
||||||
|
content_layer: body
|
||||||
|
label: list
|
||||||
|
name: list
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/2'
|
||||||
|
self_ref: '#/groups/1'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/4'
|
||||||
|
content_layer: body
|
||||||
|
label: list
|
||||||
|
name: list
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/3'
|
||||||
|
self_ref: '#/groups/2'
|
||||||
|
key_value_items: []
|
||||||
|
name: mixed_without_h1
|
||||||
|
origin:
|
||||||
|
binary_hash: 7394721163373597328
|
||||||
|
filename: mixed_without_h1.md
|
||||||
|
mimetype: text/html
|
||||||
|
pages: {}
|
||||||
|
pictures: []
|
||||||
|
schema_name: DoclingDocument
|
||||||
|
tables: []
|
||||||
|
texts:
|
||||||
|
- children: []
|
||||||
|
content_layer: furniture
|
||||||
|
label: title
|
||||||
|
orig: mixed_without_h1
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/0'
|
||||||
|
text: mixed_without_h1
|
||||||
|
- children: []
|
||||||
|
content_layer: furniture
|
||||||
|
label: text
|
||||||
|
orig: Content before first heading
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/1'
|
||||||
|
text: Content before first heading
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/1'
|
||||||
|
- $ref: '#/texts/7'
|
||||||
|
content_layer: body
|
||||||
|
label: section_header
|
||||||
|
level: 1
|
||||||
|
orig: Some heading
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/2'
|
||||||
|
text: Some heading
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/2'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: ''
|
||||||
|
orig: A. first
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/3'
|
||||||
|
text: A. first
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: ''
|
||||||
|
orig: subitem
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/4'
|
||||||
|
text: subitem
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: ''
|
||||||
|
orig: B. second
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/5'
|
||||||
|
text: B. second
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: ''
|
||||||
|
orig: 2 . strange
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/6'
|
||||||
|
text: 2 . strange
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: The end!
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/7'
|
||||||
|
text: The end!
|
||||||
|
version: 1.5.0
|
2
tests/data/md/mixed_without_h1.md
vendored
2
tests/data/md/mixed_without_h1.md
vendored
@ -7,6 +7,6 @@ Content before first heading
|
|||||||
- A. first
|
- A. first
|
||||||
- subitem
|
- subitem
|
||||||
- B. second
|
- B. second
|
||||||
- 2. strange
|
- 2\. strange
|
||||||
|
|
||||||
The end!
|
The end!
|
||||||
|
@ -16,7 +16,7 @@ def test_convert_valid():
|
|||||||
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||||
assert len(relevant_paths) > 0
|
assert len(relevant_paths) > 0
|
||||||
|
|
||||||
yaml_filter = ["inline_and_formatting"]
|
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
|
||||||
|
|
||||||
for in_path in relevant_paths:
|
for in_path in relevant_paths:
|
||||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||||
@ -41,12 +41,11 @@ def test_convert_valid():
|
|||||||
f.write(f"{act_data}\n")
|
f.write(f"{act_data}\n")
|
||||||
|
|
||||||
if in_path.stem in yaml_filter:
|
if in_path.stem in yaml_filter:
|
||||||
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
act_doc.save_as_yaml(
|
||||||
act_doc.save_as_yaml(
|
yaml_gt_path,
|
||||||
yaml_gt_path,
|
coord_precision=COORD_PREC,
|
||||||
coord_precision=COORD_PREC,
|
confid_precision=CONFID_PREC,
|
||||||
confid_precision=CONFID_PREC,
|
)
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
with open(md_gt_path, encoding="utf-8") as f:
|
with open(md_gt_path, encoding="utf-8") as f:
|
||||||
exp_data = f.read().rstrip()
|
exp_data = f.read().rstrip()
|
||||||
@ -54,4 +53,4 @@ def test_convert_valid():
|
|||||||
|
|
||||||
if in_path.stem in yaml_filter:
|
if in_path.stem in yaml_filter:
|
||||||
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||||
assert act_doc == exp_doc
|
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
|
||||||
|
Loading…
Reference in New Issue
Block a user