Fixed issues with duplicated paragraphs and incorrect lists in pptx

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
2025-07-30 14:04:27 +00:00 · 2024-10-23 15:00:47 +02:00 · 2024-10-23 15:00:47 +02:00 · 82126e3871
commit 82126e3871
parent 0f81ffda74
2 changed files with 52 additions and 28 deletions
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -113,9 +113,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
        is_a_list = False
        enum_list_item_value = 0
+        new_list = None
+        bullet_type = "None"
+        list_text = ""
+        list_label = GroupLabel.LIST
+        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+
+        # Identify if shape contains lists
        for paragraph in shape.text_frame.paragraphs:
-            enum_list_item_value += 1
-            bullet_type = "None"
            # Check if paragraph is a bullet point using the `element` XML
            p = paragraph._element
            if (
@ -136,29 +141,26 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
            if paragraph.level > 0:
                # Most likely a sub-list
                is_a_list = True
-            list_text = paragraph.text.strip()
-
-            prov = self.generate_prov(shape, slide_ind, shape.text.strip())

            if is_a_list:
                # Determine if this is an unordered list or an ordered list.
                # Set GroupLabel.ORDERED_LIST when it fits.
-                list_label = GroupLabel.LIST
                if bullet_type == "Numbered":
                    list_label = GroupLabel.ORDERED_LIST

-                new_list = doc.add_group(
-                    label=list_label, name=f"list", parent=parent_slide
-                )
-            else:
-                new_list = None
-
            if is_a_list:
                _log.debug("LIST DETECTED!")
            else:
                _log.debug("No List")

-            # for e in p.iter():
+        # Iterate through paragraphs to build up text
+        for paragraph in shape.text_frame.paragraphs:
+            # p_text = paragraph.text.strip()
+            p = paragraph._element
+            enum_list_item_value += 1
+            inline_paragraph_text = ""
+            inline_list_item_text = ""
+
            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
                if len(e.text.strip()) > 0:
                    e_is_a_list_item = False
@ -180,15 +182,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                        e_is_a_list_item = False

                    if e_is_a_list_item:
+                        if len(inline_paragraph_text) > 0:
+                            # output accumulated inline text:
+                            doc.add_text(
+                                label=doc_label,
+                                parent=parent_slide,
+                                text=inline_paragraph_text,
+                                prov=prov,
+                            )
                        # Set marker and enumerated arguments if this is an enumeration element.
-                        enum_marker = str(enum_list_item_value) + "."
-                        doc.add_list_item(
-                            marker=enum_marker,
-                            enumerated=is_numbered,
-                            parent=new_list,
-                            text=list_text,
-                            prov=prov,
-                        )
+                        inline_list_item_text += e.text
+                        # print(e.text)
                    else:
                        # Assign proper label to the text, depending if it's a Title or Section Header
                        # For other types of text, assign - PARAGRAPH
@ -203,15 +207,35 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                                doc_label = DocItemLabel.TITLE
                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                                DocItemLabel.SECTION_HEADER
-
                        enum_list_item_value = 0
+                        inline_paragraph_text += e.text

-                        doc.add_text(
-                            label=doc_label,
-                            parent=parent_slide,
-                            text=list_text,
-                            prov=prov,
-                        )
+            if len(inline_paragraph_text) > 0:
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=inline_paragraph_text,
+                    prov=prov,
+                )
+
+            # If there is a list inside of the shape, create a new docling list to assign list items to
+            if is_a_list:
+                new_list = doc.add_group(
+                    label=list_label, name=f"list", parent=parent_slide
+                )
+
+            if len(inline_list_item_text) > 0:
+                enum_marker = ""
+                if is_numbered:
+                    enum_marker = str(enum_list_item_value) + "."
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=is_numbered,
+                    parent=new_list,
+                    text=inline_list_item_text,
+                    prov=prov,
+                )
        return

    def handle_title(self, shape, parent_slide, slide_ind, doc):
--- a/tests/data/powerpoint_sample.pptx
+++ b/tests/data/powerpoint_sample.pptx