mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
Fixes for lists handling in docx
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
d687f93d52
commit
115435a835
@ -88,7 +88,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
)
|
)
|
||||||
doc = DoclingDocument(
|
doc = DoclingDocument(
|
||||||
description=DescriptionItem(), name="name_without_extension", origin=origin
|
description=DescriptionItem(), name="name_without_extension", origin=origin
|
||||||
) # TODO must add origin information
|
) # must add origin information
|
||||||
doc = self.walk_linear(self.pptx_obj, doc)
|
doc = self.walk_linear(self.pptx_obj, doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
@ -177,7 +177,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
e_is_a_list_item = False
|
e_is_a_list_item = False
|
||||||
|
|
||||||
if e_is_a_list_item:
|
if e_is_a_list_item:
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# Set marker and enumerated arguments if this is an enumeration element.
|
||||||
enum_marker = str(enum_list_item_value) + "."
|
enum_marker = str(enum_list_item_value) + "."
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
|
@ -44,6 +44,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
self.level = 0
|
self.level = 0
|
||||||
|
self.listIter = 0
|
||||||
|
|
||||||
self.history = {
|
self.history = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
@ -115,6 +116,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
|
||||||
|
# Check for Inline Images (drawings or blip elements)
|
||||||
found_drawing = etree.ElementBase.xpath(
|
found_drawing = etree.ElementBase.xpath(
|
||||||
element, ".//w:drawing", namespaces=self.xml_namespaces
|
element, ".//w:drawing", namespaces=self.xml_namespaces
|
||||||
)
|
)
|
||||||
@ -128,8 +130,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.handle_tables(element, docx_obj, doc)
|
self.handle_tables(element, docx_obj, doc)
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.error("could not parse a table, broken docx table")
|
_log.error("could not parse a table, broken docx table")
|
||||||
# Check for Inline Images (drawings or blip elements)
|
|
||||||
# elif element.xpath(".//w:drawing", namespaces = self.xml_namespaces) or element.xpath(".//w:pict", namespaces = self.xml_namespaces):
|
|
||||||
elif found_drawing or found_pict:
|
elif found_drawing or found_pict:
|
||||||
self.handle_pictures(element, docx_obj, doc)
|
self.handle_pictures(element, docx_obj, doc)
|
||||||
# Check for Text
|
# Check for Text
|
||||||
@ -157,7 +158,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Get the numId element and extract the value
|
# Get the numId element and extract the value
|
||||||
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
|
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
|
||||||
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
|
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
|
||||||
|
|
||||||
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
||||||
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
||||||
|
|
||||||
@ -206,15 +206,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
||||||
|
|
||||||
|
# Common styles for bullet and numbered lists.
|
||||||
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
|
# TODO: reliably identify whether the list is a numbered list or not
|
||||||
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||||
|
is_numbered = False
|
||||||
|
|
||||||
p_style_name, p_level = self.get_label_and_level(paragraph)
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
||||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||||
|
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
||||||
|
|
||||||
# print("paragraph.text: {} | numid: {} | ilevel: {}".format(paragraph.text, numid, ilevel))
|
if numid == 0:
|
||||||
|
numid = None
|
||||||
|
|
||||||
# Handle lists
|
# Handle lists
|
||||||
if numid is not None and ilevel is not None:
|
if numid is not None and ilevel is not None:
|
||||||
self.add_listitem(
|
self.add_listitem(
|
||||||
element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text
|
element,
|
||||||
|
docx_obj,
|
||||||
|
doc,
|
||||||
|
p_style_name,
|
||||||
|
p_level,
|
||||||
|
numid,
|
||||||
|
ilevel,
|
||||||
|
text,
|
||||||
|
is_numbered,
|
||||||
)
|
)
|
||||||
self.update_history(p_style_name, p_level, numid, ilevel)
|
self.update_history(p_style_name, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
@ -224,14 +240,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.level = self.level_at_new_list - 1
|
self.level = self.level_at_new_list - 1
|
||||||
self.level_at_new_list = None
|
self.level_at_new_list = None
|
||||||
|
|
||||||
if p_style_name in ["Title"]:
|
if p_style_name in ["Title"]:
|
||||||
for key, val in self.parents.items():
|
for key, val in self.parents.items():
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.parents[0] = doc.add_text(
|
self.parents[0] = doc.add_text(
|
||||||
parent=None, label=DocItemLabel.TITLE, text=text
|
parent=None, label=DocItemLabel.TITLE, text=text
|
||||||
)
|
)
|
||||||
|
|
||||||
elif "Heading" in p_style_name:
|
elif "Heading" in p_style_name:
|
||||||
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
|
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
|
||||||
|
|
||||||
@ -303,8 +317,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def add_listitem(
|
def add_listitem(
|
||||||
self, element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text: str
|
self,
|
||||||
|
element,
|
||||||
|
docx_obj,
|
||||||
|
doc,
|
||||||
|
p_style_name,
|
||||||
|
p_level,
|
||||||
|
numid,
|
||||||
|
ilevel,
|
||||||
|
text: str,
|
||||||
|
is_numbered=False,
|
||||||
):
|
):
|
||||||
|
# is_numbered = is_numbered
|
||||||
|
enum_marker = ""
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
if self.prev_numid() is None: # Open new list
|
if self.prev_numid() is None: # Open new list
|
||||||
self.level_at_new_list = level
|
self.level_at_new_list = level
|
||||||
@ -314,7 +340,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
doc.add_list_item(parent=self.parents[level], text=text)
|
self.listIter += 1
|
||||||
|
if is_numbered:
|
||||||
|
enum_marker = str(self.listIter) + "."
|
||||||
|
is_numbered = True
|
||||||
|
doc.add_list_item(
|
||||||
|
marker=enum_marker,
|
||||||
|
enumerated=is_numbered,
|
||||||
|
parent=self.parents[level],
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self.prev_numid() == numid and self.prev_indent() < ilevel
|
self.prev_numid() == numid and self.prev_indent() < ilevel
|
||||||
@ -325,12 +360,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
):
|
):
|
||||||
# TODO: determine if this is an unordered list or an ordered list.
|
# TODO: determine if this is an unordered list or an ordered list.
|
||||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||||
self.parents[i] = doc.add_group(
|
self.listIter = 0
|
||||||
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
if is_numbered:
|
||||||
)
|
self.parents[i] = doc.add_group(
|
||||||
|
label=GroupLabel.ORDERED_LIST,
|
||||||
|
name="list",
|
||||||
|
parent=self.parents[i - 1],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.parents[i] = doc.add_group(
|
||||||
|
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
|
self.listIter += 1
|
||||||
|
if is_numbered:
|
||||||
|
enum_marker = str(self.listIter) + "."
|
||||||
|
is_numbered = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
|
marker=enum_marker,
|
||||||
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
@ -341,14 +390,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
|
self.listIter += 1
|
||||||
|
if is_numbered:
|
||||||
|
enum_marker = str(self.listIter) + "."
|
||||||
|
is_numbered = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
|
marker=enum_marker,
|
||||||
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
|
self.listIter = 0
|
||||||
|
|
||||||
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
doc.add_list_item(parent=self.parents[level - 1], text=text)
|
self.listIter += 1
|
||||||
|
if is_numbered:
|
||||||
|
enum_marker = str(self.listIter) + "."
|
||||||
|
is_numbered = True
|
||||||
|
doc.add_list_item(
|
||||||
|
marker=enum_marker,
|
||||||
|
enumerated=is_numbered,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_tables(self, element, docx_obj, doc):
|
def handle_tables(self, element, docx_obj, doc):
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user