feat: leverage new list modeling, capture default markers (#1856)

* chore: update docling-core & regenerate test data

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update backends to leverage new list modeling

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* repin docling-core

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* ensure availability of latest docling-core API

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-06-27 16:37:15 +02:00
committed by GitHub
parent e79e4f0ab6
commit 0533da1923
90 changed files with 2252 additions and 2240 deletions

View File

@@ -17,6 +17,7 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
from pydantic import BaseModel
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
]
# Parsing state kept by HTMLDocumentBackend while building a document.
# A fresh instance is assigned to `self.ctx` in `__init__` and again right
# before the parsed content is walked, so entries do not carry over.
class _Context(BaseModel):
# Keyed by a list group's `self_ref`; True when the group was created for an
# <ol> element, False otherwise (written in handle_list).
list_ordered_flag_by_ref: dict[str, bool] = {}
# Keyed by a list group's `self_ref`; holds the integer value of the <ol>
# `start` attribute. Only populated for ordered lists whose `start` attribute
# is a numeric string; list-item handling reads it to build the marker.
list_start_by_ref: dict[str, int] = {}
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
self.ctx = _Context()
for i in range(self.max_levels):
self.parents[i] = None
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.ctx = _Context() # reset context
self.walk(content, doc)
else:
raise RuntimeError(
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
start: Optional[int] = None
if is_ordered := element.name == "ol":
start_attr = element.get("start")
start: int = (
int(start_attr)
if isinstance(start_attr, str) and start_attr.isnumeric()
else 1
)
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer,
)
if isinstance(start_attr, str) and start_attr.isnumeric():
start = int(start_attr)
name = "ordered list" + (f" start {start}" if start is not None else "")
else:
name = "list"
# create a list group
list_group = doc.add_list_group(
name=name,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
self.parents[self.level + 1] = list_group
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
if is_ordered and start is not None:
self.ctx.list_start_by_ref[list_group.self_ref] = start
self.level += 1
self.walk(element, doc)
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if parent is None:
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if (
parent_label == GroupLabel.ORDERED_LIST
and isinstance(parent, GroupItem)
and parent.name
):
start_in_list: str = parent.name.split(" ")[-1]
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
index_in_list += start - 1
enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
marker = f"{start + len(parent.children)}."
else:
marker = ""
if nested_list:
# Text in list item can be hidden within hierarchy, hence
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
if len(text) > 0:
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif element.text.strip():
text = element.text.strip()
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{index_in_list!s}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,

View File

@@ -14,13 +14,12 @@ from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docling_core.types.doc.document import Formatting
from marko import Markdown
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
from typing_extensions import Annotated
@@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel):
class _ListItemCreationPayload(BaseModel):
kind: Literal["list_item"] = "list_item"
enumerated: bool
_CreationPayload = Annotated[
@@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
parent_item: Optional[NodeItem],
text: str,
enumerated: bool,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
if not isinstance(parent_item, (OrderedList, UnorderedList)):
_log.warning("ListItem would have not had a list parent, adding one.")
parent_item = doc.add_unordered_list(parent=parent_item)
item = doc.add_list_item(
text=text,
enumerated=(isinstance(parent_item, OrderedList)),
enumerated=enumerated,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
@@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
creation_stack: list[
_CreationPayload
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
list_ordered_flag_by_ref: dict[str, bool],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._close_table(doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
label=label, name="list", parent=parent_item
)
parent_item = doc.add_list_group(name="list", parent=parent_item)
list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
elif (
isinstance(element, marko.block.ListItem)
@@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._close_table(doc)
_log.debug(" - List item")
enumerated = (
list_ordered_flag_by_ref.get(parent_item.self_ref, False)
if parent_item
else False
)
if len(child.children) > 1: # inline group will be created further down
parent_item = self._create_list_item(
doc=doc,
parent_item=parent_item,
text="",
enumerated=enumerated,
formatting=formatting,
hyperlink=hyperlink,
)
else:
creation_stack.append(_ListItemCreationPayload())
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
@@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
while len(creation_stack) > 0:
to_create = creation_stack.pop()
if isinstance(to_create, _ListItemCreationPayload):
enumerated = (
list_ordered_flag_by_ref.get(
parent_item.self_ref, False
)
if parent_item
else False
)
parent_item = self._create_list_item(
doc=doc,
parent_item=parent_item,
text=snippet_text,
enumerated=enumerated,
formatting=formatting,
hyperlink=hyperlink,
)
@@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc,
visited=visited,
creation_stack=creation_stack,
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
@@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item=None,
visited=set(),
creation_stack=[],
list_ordered_flag_by_ref={},
)
self._close_table(doc=doc) # handle any last hanging table

View File

@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
def handle_text_elements(
self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
):
is_list_group_created = False
enum_list_item_value = 0
new_list = None
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enumerated = bullet_type == "Numbered"
if not is_list_group_created:
new_list = doc.add_group(
label=GroupLabel.ORDERED_LIST
if enumerated
else GroupLabel.LIST,
new_list = doc.add_list_group(
name="list",
parent=parent_slide,
)

View File

@@ -10,11 +10,12 @@ from docling_core.types.doc import (
DocumentOrigin,
GroupLabel,
ImageRef,
ListGroup,
NodeItem,
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docling_core.types.doc.document import Formatting
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements: list,
) -> Optional[NodeItem]:
return (
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
doc.add_inline_group(parent=prev_parent)
if len(paragraph_elements) > 1
else prev_parent
)
@@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
# Inline equation
level = self._get_level()
inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
)
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
text_tmp = text
for eq in equations:
if len(text_tmp) == 0:
@@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level: int,
) -> None:
# This should not happen by construction
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
if not isinstance(self.parents[level], ListGroup):
return
if not elements:
return
if len(elements) == 1:
text, format, hyperlink = elements[0]
doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text=text,
formatting=format,
hyperlink=hyperlink,
)
if text:
doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text=text,
formatting=format,
hyperlink=hyperlink,
)
else:
new_item = doc.add_list_item(
marker=marker,
@@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level],
text="",
)
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
new_parent = doc.add_inline_group(parent=new_item)
for text, format, hyperlink in elements:
doc.add_text(
label=DocItemLabel.TEXT,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
if text:
doc.add_text(
label=DocItemLabel.TEXT,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
def _add_list_item(
self,
@@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if self._prev_numid() is None: # Open new list
self.level_at_new_list = level
self.parents[level] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
self.parents[level] = doc.add_list_group(
name="list", parent=self.parents[level - 1]
)
# Set marker and enumerated arguments if this is an enumeration element.
@@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
if is_numbered:
self.parents[i] = doc.add_group(
label=GroupLabel.ORDERED_LIST,
name="list",
parent=self.parents[i - 1],
)
else:
self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
)
self.parents[i] = doc.add_list_group(
name="list", parent=self.parents[i - 1]
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1