Merge branch 'main' of github.com:DS4SD/docling into cau/dp4-test-diff

2025-07-25 19:44:34 +00:00 · 2025-06-20 16:51:35 +02:00 · 2025-06-20 16:51:35 +02:00 · a6efb2eb3d
commit a6efb2eb3d
parent 0e63cb09e6 d26dac61a8
8 changed files with 562 additions and 174 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -14,7 +14,7 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            self.valid = True
        except Exception as e:
            raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @override
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    self._handle_tables(element, docx_obj, doc)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
-
+            # Check for Image
            elif drawing_blip:
                self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self._handle_text_elements(element, docx_obj, doc)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+
        return doc

    def _str_to_int(
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        all_paragraphs = []

        # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
            # Sort by vertical position within each container
            sorted_container_paragraphs = sorted(
                paragraphs,
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc: DoclingDocument,
    ) -> None:
        paragraph = Paragraph(element, docx_obj)
-
+        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
            element=element, text=paragraph.text
        )

        if text is None:
            return
-        paragraph_elements = self._get_paragraph_elements(paragraph)
        text = text.strip()

        # Common styles for bullet and numbered lists.
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        )
        return

+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        """Add one list item built from formatted runs under ``self.parents[level]``.
+
+        A single run becomes the list item itself, carrying its own text,
+        formatting and hyperlink. Several runs produce a list item with empty
+        text whose child is an inline group holding one text item per run, so
+        that per-run formatting and hyperlinks are kept.
+
+        Args:
+            doc: Document that receives the new items.
+            elements: ``(text, formatting, hyperlink)`` triples, one per run.
+            marker: Marker string passed to the list item.
+            enumerated: Value of the list item's ``enumerated`` flag.
+            level: Key into ``self.parents`` selecting the parent list group.
+        """
+        # No runs means nothing to emit; without this guard the multi-run branch
+        # would add an empty list item with an empty inline group.
+        if not elements:
+            return
+
+        parent = self.parents[level]
+        # This should not happen by construction; leave a trace instead of
+        # dropping the content silently.
+        if not isinstance(parent, (OrderedList, UnorderedList)):
+            _log.debug(
+                f"Skipping list item: parent at level {level} is not a list group"
+            )
+            return
+
+        if len(elements) == 1:
+            text, formatting, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=parent,
+                text=text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
+            return
+
+        # Several runs: the list item is an empty container for an inline group.
+        new_item = doc.add_list_item(
+            marker=marker,
+            enumerated=enumerated,
+            parent=parent,
+            text="",
+        )
+        inline_group = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+        for text, formatting, hyperlink in elements:
+            doc.add_text(
+                label=DocItemLabel.TEXT,
+                parent=inline_group,
+                text=text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
+
    def _add_list_item(
        self,
        *,
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        elements: list,
        is_numbered: bool = False,
    ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
        enum_marker = ""

        level = self._get_level()
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
-
        elif (
            self._prev_numid() == numid
            and self.level_at_new_list is not None
@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
        elif (
            self._prev_numid() == numid
            and self.level_at_new_list is not None
            and prev_indent is not None
            and ilevel < prev_indent
        ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                if k > self.level_at_new_list + ilevel:
                    self.parents[k] = None

@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
            self.listIter = 0

        elif self._prev_numid() == numid or prev_indent == ilevel:
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
            )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
+
        return

    def _handle_tables(
--- a/tests/data/docx/word_image_anchors.docx
+++ b/tests/data/docx/word_image_anchors.docx
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
    item-16 at level 2: list_item: Italic bullet 1
    item-17 at level 2: list_item: Bold bullet 2
    item-18 at level 2: list_item: Underline bullet 3
-    item-19 at level 2: inline: group group
-      item-20 at level 3: list_item: Some
-      item-21 at level 3: list_item: italic
-      item-22 at level 3: list_item: bold
-      item-23 at level 3: list_item: underline
-    item-24 at level 2: list: group list
-      item-25 at level 3: inline: group group
-        item-26 at level 4: list_item: Nested
-        item-27 at level 4: list_item: italic
-        item-28 at level 4: list_item: bold
-  item-29 at level 1: paragraph: 
+    item-19 at level 2: list_item: 
+      item-20 at level 3: inline: group group
+        item-21 at level 4: text: Some
+        item-22 at level 4: text: italic
+        item-23 at level 4: text: bold
+        item-24 at level 4: text: underline
+    item-25 at level 2: list: group list
+      item-26 at level 3: list_item: 
+        item-27 at level 4: inline: group group
+          item-28 at level 5: text: Nested
+          item-29 at level 5: text: italic
+          item-30 at level 5: text: bold
+  item-31 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
@ -42,7 +42,7 @@
        "$ref": "#/groups/1"
      },
      {
-        "$ref": "#/texts/23"
+        "$ref": "#/texts/25"
      }
    ],
    "content_layer": "body",
@ -98,7 +98,7 @@
          "$ref": "#/texts/15"
        },
        {
-          "$ref": "#/groups/2"
+          "$ref": "#/texts/16"
        },
        {
          "$ref": "#/groups/3"
@ -111,12 +111,9 @@
    {
      "self_ref": "#/groups/2",
      "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/texts/16"
      },
      "children": [
-        {
-          "$ref": "#/texts/16"
-        },
        {
          "$ref": "#/texts/17"
        },
@ -125,6 +122,9 @@
        },
        {
          "$ref": "#/texts/19"
+        },
+        {
+          "$ref": "#/texts/20"
        }
      ],
      "content_layer": "body",
@ -138,7 +138,7 @@
      },
      "children": [
        {
-          "$ref": "#/groups/4"
+          "$ref": "#/texts/21"
        }
      ],
      "content_layer": "body",
@ -148,17 +148,17 @@
    {
      "self_ref": "#/groups/4",
      "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/texts/21"
      },
      "children": [
-        {
-          "$ref": "#/texts/20"
-        },
-        {
-          "$ref": "#/texts/21"
-        },
        {
          "$ref": "#/texts/22"
+        },
+        {
+          "$ref": "#/texts/23"
+        },
+        {
+          "$ref": "#/texts/24"
        }
      ],
      "content_layer": "body",
@ -461,20 +461,18 @@
    {
      "self_ref": "#/texts/16",
      "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/1"
      },
-      "children": [],
+      "children": [
+        {
+          "$ref": "#/groups/2"
+        }
+      ],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
-      "orig": "Some",
-      "text": "Some",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
+      "orig": "",
+      "text": "",
      "enumerated": false,
      "marker": "-"
    },
@ -485,18 +483,16 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
      "prov": [],
-      "orig": "italic",
-      "text": "italic",
+      "orig": "Some",
+      "text": "Some",
      "formatting": {
        "bold": false,
-        "italic": true,
+        "italic": false,
        "underline": false,
        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
+      }
    },
    {
      "self_ref": "#/texts/18",
@ -505,67 +501,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "bold",
-      "text": "bold",
-      "formatting": {
-        "bold": true,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "underline",
-      "text": "underline",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": true,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "Nested",
-      "text": "Nested",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
      "prov": [],
      "orig": "italic",
      "text": "italic",
@ -574,7 +510,59 @@
        "italic": true,
        "underline": false,
        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/19",
+      "parent": {
+        "$ref": "#/groups/2"
      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "bold",
+      "text": "bold",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/20",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "underline",
+      "text": "underline",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": true,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/21",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/4"
+        }
+      ],
+      "content_layer": "body",
+      "label": "list_item",
+      "prov": [],
+      "orig": "",
+      "text": "",
      "enumerated": false,
      "marker": "-"
    },
@ -585,7 +573,43 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
+      "prov": [],
+      "orig": "Nested",
+      "text": "Nested",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/23",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "italic",
+      "text": "italic",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/24",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
      "prov": [],
      "orig": "bold",
      "text": "bold",
@ -594,12 +618,10 @@
        "italic": false,
        "underline": false,
        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
+      }
    },
    {
-      "self_ref": "#/texts/23",
+      "self_ref": "#/texts/25",
      "parent": {
        "$ref": "#/body"
      },
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
@ -0,0 +1,16 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: paragraph: Transcript
+  item-2 at level 1: paragraph: February 20, 2025, 8:32PM
+  item-3 at level 1: picture
+  item-4 at level 1: inline: group group
+    item-5 at level 2: paragraph: This is test 1
+    item-6 at level 2: paragraph: 0:08
+Correct, he is not.
+  item-7 at level 1: paragraph: 
+  item-8 at level 1: picture
+  item-9 at level 1: inline: group group
+    item-10 at level 2: paragraph: This is test 2
+    item-11 at level 2: paragraph: 0:16
+Yeah, exactly.
+  item-12 at level 1: paragraph: 
+  item-13 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
@ -0,0 +1,286 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.3.0",
+  "name": "word_image_anchors",
+  "origin": {
+    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "binary_hash": 2428692234257307633,
+    "filename": "word_image_anchors.docx"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/pictures/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/pictures/1"
+      },
+      {
+        "$ref": "#/groups/1"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "Transcript",
+      "text": "Transcript",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "February 20, 2025, 8:32PM",
+      "text": "February 20, 2025, 8:32PM",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "This is test 1",
+      "text": "This is test 1",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "0:08\nCorrect, he is not.",
+      "text": "0:08\nCorrect, he is not.",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "This is test 2",
+      "text": "This is test 2",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "0:16\nYeah, exactly.",
+      "text": "0:16\nYeah, exactly.",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    }
+  ],
+  "pictures": [
+    {
+      "self_ref": "#/pictures/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "picture",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "image": {
+        "mimetype": "image/png",
+        "dpi": 72,
+        "size": {
+          "width": 100.0,
+          "height": 100.0
+        },
+        "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
+      },
+      "annotations": []
+    },
+    {
+      "self_ref": "#/pictures/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "picture",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "image": {
+        "mimetype": "image/png",
+        "dpi": 72,
+        "size": {
+          "width": 100.0,
+          "height": 100.0
+        },
+        "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxo
dtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
+      },
+      "annotations": []
+    }
+  ],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
@ -0,0 +1,13 @@
+**Transcript**
+
+February 20, 2025, 8:32PM
+
+<!-- image -->
+
+**This is test 1** 0:08
+Correct, he is not.
+
+<!-- image -->
+
+**This is test 2** 0:16
+Yeah, exactly.
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@ -9,6 +9,7 @@ from docling.datamodel.document import (
    DoclingDocument,
    InputDocument,
    SectionHeaderItem,
+    TextItem,
 )
 from docling.document_converter import DocumentConverter

@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
-            "export to md"
+            f"export to markdown failed on {docx_path}"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
-            "export to indented-text"
+            f"export to indented-text failed on {docx_path}"
        )

        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
-            "document document"
+            f"DoclingDocument verification failed on {docx_path}"
        )

        if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
                pred_text=pred_html,
                gtfile=str(gt_path) + ".html",
                generate=GENERATE,
-            ), "export to html"
+            ), f"export to html failed on {docx_path}"


 flaky_path = Path("tests/data/docx/textbox.docx")
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
@pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
    _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
+
+
+def test_text_after_image_anchors():
+    """Check that text following image anchors is parsed into the document.
+
+    Converts ``word_image_anchors.docx`` with the MS Word backend and verifies
+    that every expected text appears as the text of some ``TextItem``.
+    """
+    in_path = Path("tests/data/docx/word_image_anchors.docx")
+    in_doc = InputDocument(
+        path_or_stream=in_path,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+    )
+    backend = MsWordDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=in_path,
+    )
+    doc = backend.convert()
+
+    expected_texts = {
+        "This is test 1",
+        "0:08\nCorrect, he is not.",
+        "This is test 2",
+        "0:16\nYeah, exactly.",
+    }
+    found_texts = {
+        item.text for item, _ in doc.iterate_items() if isinstance(item, TextItem)
+    }
+
+    # Report exactly which texts were lost so a failure is easy to diagnose.
+    missing_texts = expected_texts - found_texts
+    assert not missing_texts, (
+        f"text after image anchors missing in {in_path}: {sorted(missing_texts)}"
+    )