mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 06:08:09 +00:00
fix(docx): ensure list items have a list parent (#1827)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1350a8d3e5
commit
d26dac61a8
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
@@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
|
||||
return doc
|
||||
|
||||
def _str_to_int(
|
||||
@@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
all_paragraphs = []
|
||||
|
||||
# Sort paragraphs within each container, then process containers
|
||||
for container_id, paragraphs in container_paragraphs.items():
|
||||
for paragraphs in container_paragraphs.values():
|
||||
# Sort by vertical position within each container
|
||||
sorted_container_paragraphs = sorted(
|
||||
paragraphs,
|
||||
@@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=element, text=paragraph.text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _add_formatted_list_item(
    self,
    doc: DoclingDocument,
    elements: list,
    marker: str,
    enumerated: bool,
    level: int,
) -> None:
    """Add one list item, built from formatted text runs, to the list at ``level``.

    A paragraph made of a single run becomes a plain list item carrying that
    run's text, formatting and hyperlink. A paragraph made of several runs
    becomes an empty list item holding an inline group, with one text item
    per run, so that per-run formatting is preserved.

    Args:
        doc: Document that receives the new item(s).
        elements: ``(text, formatting, hyperlink)`` triples of the paragraph.
        marker: Marker passed to the list item (e.g. ``"1."``); may be empty.
        enumerated: Whether the list item is flagged as enumerated.
        level: Key into ``self.parents`` of the list group to append to.
    """
    # Bind the parent once: avoids a KeyError on an unknown level and lets the
    # isinstance check below narrow the type used in the calls that follow.
    list_parent = self.parents.get(level)

    # This should not happen by construction. If it does, the item is skipped
    # (a list item must have a list parent), but leave a trace in the log
    # instead of dropping content silently.
    if not isinstance(list_parent, (OrderedList, UnorderedList)):
        _log.warning(
            "Skipping DOCX list item: parent at level %s is not a list group",
            level,
        )
        return

    if len(elements) == 1:
        text, formatting, hyperlink = elements[0]
        doc.add_list_item(
            marker=marker,
            enumerated=enumerated,
            parent=list_parent,
            text=text,
            formatting=formatting,
            hyperlink=hyperlink,
        )
        return

    # Several runs: the list item itself stays empty and the runs are attached
    # as text children of an inline group under it.
    new_item = doc.add_list_item(
        marker=marker,
        enumerated=enumerated,
        parent=list_parent,
        text="",
    )
    inline_group = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
    for text, formatting, hyperlink in elements:
        doc.add_text(
            label=DocItemLabel.TEXT,
            parent=inline_group,
            text=text,
            formatting=formatting,
            hyperlink=hyperlink,
        )
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
*,
|
||||
@@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return None
|
||||
enum_marker = ""
|
||||
|
||||
level = self._get_level()
|
||||
@@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
@@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
and prev_indent is not None
|
||||
and ilevel < prev_indent
|
||||
): # Close list
|
||||
for k, v in self.parents.items():
|
||||
for k in self.parents:
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
@@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||
@@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level - 1],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level - 1
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
# Add the list item to the parent group
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
def _handle_tables(
|
||||
|
||||
Reference in New Issue
Block a user