mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 06:08:09 +00:00
feat: leverage new list modeling, capture default markers (#1856)
* chore: update docling-core & regenerate test data

  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update backends to leverage new list modeling

  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* repin docling-core

  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* ensure availability of latest docling-core API

  Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
||||
]
|
||||
|
||||
|
||||
class _Context(BaseModel):
|
||||
list_ordered_flag_by_ref: dict[str, bool] = {}
|
||||
list_start_by_ref: dict[str, int] = {}
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
@override
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.max_levels = 10
|
||||
self.level = 0
|
||||
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
||||
self.ctx = _Context()
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.content_layer = (
|
||||
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
||||
)
|
||||
self.ctx = _Context() # reset context
|
||||
self.walk(content, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
|
||||
if element.name == "ul":
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level],
|
||||
name="list",
|
||||
label=GroupLabel.LIST,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
elif element.name == "ol":
|
||||
start: Optional[int] = None
|
||||
if is_ordered := element.name == "ol":
|
||||
start_attr = element.get("start")
|
||||
start: int = (
|
||||
int(start_attr)
|
||||
if isinstance(start_attr, str) and start_attr.isnumeric()
|
||||
else 1
|
||||
)
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level],
|
||||
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
if isinstance(start_attr, str) and start_attr.isnumeric():
|
||||
start = int(start_attr)
|
||||
name = "ordered list" + (f" start {start}" if start is not None else "")
|
||||
else:
|
||||
name = "list"
|
||||
# create a list group
|
||||
list_group = doc.add_list_group(
|
||||
name=name,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.parents[self.level + 1] = list_group
|
||||
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
|
||||
if is_ordered and start is not None:
|
||||
self.ctx.list_start_by_ref[list_group.self_ref] = start
|
||||
|
||||
self.level += 1
|
||||
|
||||
self.walk(element, doc)
|
||||
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if parent is None:
|
||||
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
|
||||
return
|
||||
parent_label: str = parent.label
|
||||
index_in_list = len(parent.children) + 1
|
||||
if (
|
||||
parent_label == GroupLabel.ORDERED_LIST
|
||||
and isinstance(parent, GroupItem)
|
||||
and parent.name
|
||||
):
|
||||
start_in_list: str = parent.name.split(" ")[-1]
|
||||
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
|
||||
index_in_list += start - 1
|
||||
enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
|
||||
if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
|
||||
marker = f"{start + len(parent.children)}."
|
||||
else:
|
||||
marker = ""
|
||||
|
||||
if nested_list:
|
||||
# Text in list item can be hidden within hierarchy, hence
|
||||
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text = text.replace("\n", "").replace("\r", "")
|
||||
text = " ".join(text.split()).strip()
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = str(index_in_list)
|
||||
enumerated = True
|
||||
|
||||
if len(text) > 0:
|
||||
# create a list-item
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif element.text.strip():
|
||||
text = element.text.strip()
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = f"{index_in_list!s}."
|
||||
enumerated = True
|
||||
doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
|
||||
@@ -14,13 +14,12 @@ from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
||||
from typing_extensions import Annotated
|
||||
@@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel):
|
||||
|
||||
class _ListItemCreationPayload(BaseModel):
|
||||
kind: Literal["list_item"] = "list_item"
|
||||
enumerated: bool
|
||||
|
||||
|
||||
_CreationPayload = Annotated[
|
||||
@@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
parent_item: Optional[NodeItem],
|
||||
text: str,
|
||||
enumerated: bool,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
):
|
||||
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
||||
_log.warning("ListItem would have not had a list parent, adding one.")
|
||||
parent_item = doc.add_unordered_list(parent=parent_item)
|
||||
item = doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=(isinstance(parent_item, OrderedList)),
|
||||
enumerated=enumerated,
|
||||
parent=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
@@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
creation_stack: list[
|
||||
_CreationPayload
|
||||
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
||||
list_ordered_flag_by_ref: dict[str, bool],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
@@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._close_table(doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
parent_item = doc.add_group(
|
||||
label=label, name="list", parent=parent_item
|
||||
)
|
||||
parent_item = doc.add_list_group(name="list", parent=parent_item)
|
||||
list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
|
||||
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
@@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._close_table(doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
enumerated = (
|
||||
list_ordered_flag_by_ref.get(parent_item.self_ref, False)
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
if len(child.children) > 1: # inline group will be created further down
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
text="",
|
||||
enumerated=enumerated,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
creation_stack.append(_ListItemCreationPayload())
|
||||
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self._close_table(doc)
|
||||
@@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
while len(creation_stack) > 0:
|
||||
to_create = creation_stack.pop()
|
||||
if isinstance(to_create, _ListItemCreationPayload):
|
||||
enumerated = (
|
||||
list_ordered_flag_by_ref.get(
|
||||
parent_item.self_ref, False
|
||||
)
|
||||
if parent_item
|
||||
else False
|
||||
)
|
||||
parent_item = self._create_list_item(
|
||||
doc=doc,
|
||||
parent_item=parent_item,
|
||||
text=snippet_text,
|
||||
enumerated=enumerated,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
@@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc=doc,
|
||||
visited=visited,
|
||||
creation_stack=creation_stack,
|
||||
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
||||
parent_item=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
@@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent_item=None,
|
||||
visited=set(),
|
||||
creation_stack=[],
|
||||
list_ordered_flag_by_ref={},
|
||||
)
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
|
||||
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
return prov
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
def handle_text_elements(
|
||||
self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
|
||||
):
|
||||
is_list_group_created = False
|
||||
enum_list_item_value = 0
|
||||
new_list = None
|
||||
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
enumerated = bullet_type == "Numbered"
|
||||
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=GroupLabel.ORDERED_LIST
|
||||
if enumerated
|
||||
else GroupLabel.LIST,
|
||||
new_list = doc.add_list_group(
|
||||
name="list",
|
||||
parent=parent_slide,
|
||||
)
|
||||
|
||||
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
ListGroup,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
@@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
paragraph_elements: list,
|
||||
) -> Optional[NodeItem]:
|
||||
return (
|
||||
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
|
||||
doc.add_inline_group(parent=prev_parent)
|
||||
if len(paragraph_elements) > 1
|
||||
else prev_parent
|
||||
)
|
||||
@@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
# Inline equation
|
||||
level = self._get_level()
|
||||
inline_equation = doc.add_group(
|
||||
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||
)
|
||||
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
||||
text_tmp = text
|
||||
for eq in equations:
|
||||
if len(text_tmp) == 0:
|
||||
@@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
level: int,
|
||||
) -> None:
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||
if not isinstance(self.parents[level], ListGroup):
|
||||
return
|
||||
if not elements:
|
||||
return
|
||||
|
||||
if len(elements) == 1:
|
||||
text, format, hyperlink = elements[0]
|
||||
doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if text:
|
||||
doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
new_item = doc.add_list_item(
|
||||
marker=marker,
|
||||
@@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[level],
|
||||
text="",
|
||||
)
|
||||
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||
new_parent = doc.add_inline_group(parent=new_item)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
if text:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
@@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if self._prev_numid() is None: # Open new list
|
||||
self.level_at_new_list = level
|
||||
|
||||
self.parents[level] = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||
self.parents[level] = doc.add_list_group(
|
||||
name="list", parent=self.parents[level - 1]
|
||||
)
|
||||
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
@@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level_at_new_list + prev_indent + 1,
|
||||
self.level_at_new_list + ilevel + 1,
|
||||
):
|
||||
# Determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
self.listIter = 0
|
||||
if is_numbered:
|
||||
self.parents[i] = doc.add_group(
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
name="list",
|
||||
parent=self.parents[i - 1],
|
||||
)
|
||||
else:
|
||||
self.parents[i] = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
||||
)
|
||||
self.parents[i] = doc.add_list_group(
|
||||
name="list", parent=self.parents[i - 1]
|
||||
)
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
|
||||
Reference in New Issue
Block a user