fix(msword_backend): Identify text in the same line after an image #1425 (#1610)

* fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com> * test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com> * test: added groundtruth test files for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com> * fix: extraneous empty paragraphs for test files Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com> --------- Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com> Co-authored-by: Michael Krissgau <michael.krissgau@ibm.com>
2025-12-11 06:08:09 +00:00 · 2025-06-20 10:55:30 +02:00
parent 64ac043786
commit 1350a8d3e5
6 changed files with 362 additions and 1 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    self._handle_tables(element, docx_obj, doc)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
-
+            # Check for Image
            elif drawing_blip:
                self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)