diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 0544cc9c..1f939f2e 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -113,9 +113,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB def handle_text_elements(self, shape, parent_slide, slide_ind, doc): is_a_list = False enum_list_item_value = 0 + new_list = None + bullet_type = "None" + list_text = "" + list_label = GroupLabel.LIST + prov = self.generate_prov(shape, slide_ind, shape.text.strip()) + + # Identify if shape contains lists for paragraph in shape.text_frame.paragraphs: - enum_list_item_value += 1 - bullet_type = "None" # Check if paragraph is a bullet point using the `element` XML p = paragraph._element if ( @@ -136,29 +141,26 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB if paragraph.level > 0: # Most likely a sub-list is_a_list = True - list_text = paragraph.text.strip() - - prov = self.generate_prov(shape, slide_ind, shape.text.strip()) if is_a_list: # Determine if this is an unordered list or an ordered list. # Set GroupLabel.ORDERED_LIST when it fits. - list_label = GroupLabel.LIST if bullet_type == "Numbered": list_label = GroupLabel.ORDERED_LIST - new_list = doc.add_group( - label=list_label, name=f"list", parent=parent_slide - ) - else: - new_list = None - if is_a_list: _log.debug("LIST DETECTED!") else: _log.debug("No List") - # for e in p.iter(): + # Iterate through paragraphs to build up text + for paragraph in shape.text_frame.paragraphs: + # p_text = paragraph.text.strip() + p = paragraph._element + enum_list_item_value += 1 + inline_paragraph_text = "" + inline_list_item_text = "" + for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}): if len(e.text.strip()) > 0: e_is_a_list_item = False @@ -180,15 +182,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB e_is_a_list_item = False if e_is_a_list_item: + if len(inline_paragraph_text) > 0: + # output accumulated inline text: + doc.add_text( + label=doc_label, + parent=parent_slide, + text=inline_paragraph_text, + prov=prov, + ) # Set marker and enumerated arguments if this is an enumeration element. - enum_marker = str(enum_list_item_value) + "." - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_list, - text=list_text, - prov=prov, - ) + inline_list_item_text += e.text + # print(e.text) else: # Assign proper label to the text, depending if it's a Title or Section Header # For other types of text, assign - PARAGRAPH @@ -203,15 +207,35 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB doc_label = DocItemLabel.TITLE elif placeholder_type == PP_PLACEHOLDER.SUBTITLE: DocItemLabel.SECTION_HEADER - enum_list_item_value = 0 + inline_paragraph_text += e.text - doc.add_text( - label=doc_label, - parent=parent_slide, - text=list_text, - prov=prov, - ) + if len(inline_paragraph_text) > 0: + # output accumulated inline text: + doc.add_text( + label=doc_label, + parent=parent_slide, + text=inline_paragraph_text, + prov=prov, + ) + + # If there is a list inside of the shape, create a new docling list to assign list items to + if is_a_list: + new_list = doc.add_group( + label=list_label, name=f"list", parent=parent_slide + ) + + if len(inline_list_item_text) > 0: + enum_marker = "" + if is_numbered: + enum_marker = str(enum_list_item_value) + "." + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=new_list, + text=inline_list_item_text, + prov=prov, + ) return def handle_title(self, shape, parent_slide, slide_ind, doc): diff --git a/tests/data/powerpoint_sample.pptx b/tests/data/powerpoint_sample.pptx index 6e33aafc..acabf415 100644 Binary files a/tests/data/powerpoint_sample.pptx and b/tests/data/powerpoint_sample.pptx differ