From af4aaa28af83fd6e829448d75d0ac2b14bd18dcf Mon Sep 17 00:00:00 2001
From: Michael Krissgau
Date: Thu, 22 May 2025 17:45:15 +0200
Subject: [PATCH] fix(msword_backend): Identify text in the same line after an image / image anchor #1425

Signed-off-by: Michael Krissgau
---
 docling/backend/msword_backend.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 6cfa0860..9a4d0396 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -253,9 +253,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self._handle_tables(element, docx_obj, doc)
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
-
+            # Check for Image
             elif drawing_blip:
                 self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    or element.find(".//w:p", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)