fix(docx): ensure list items have a list parent (#1827)

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-06-20 14:47:25 +02:00
committed by GitHub
parent 1350a8d3e5
commit d26dac61a8
4 changed files with 200 additions and 173 deletions

View File

@@ -14,7 +14,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
@@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def _str_to_int(
@@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
all_paragraphs = []
# Sort paragraphs within each container, then process containers
for container_id, paragraphs in container_paragraphs.items():
for paragraphs in container_paragraphs.values():
# Sort by vertical position within each container
sorted_container_paragraphs = sorted(
paragraphs,
@@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
@@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def _add_formatted_list_item(
self,
doc: DoclingDocument,
elements: list,
marker: str,
enumerated: bool,
level: int,
) -> None:
# This should not happen by construction
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
return
if len(elements) == 1:
text, format, hyperlink = elements[0]
doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text=text,
formatting=format,
hyperlink=hyperlink,
)
else:
new_item = doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text="",
)
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
for text, format, hyperlink in elements:
doc.add_text(
label=DocItemLabel.TEXT,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
def _add_list_item(
self,
*,
@@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elements: list,
is_numbered: bool = False,
) -> None:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return None
enum_marker = ""
level = self._get_level()
@@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
@@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items():
for k in self.parents:
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
@@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0
elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return
def _handle_tables(