From 387dd659c18c93e392c0a48e488cac290ffc7eb1 Mon Sep 17 00:00:00 2001 From: Manuel030 Date: Mon, 28 Apr 2025 13:46:08 +0200 Subject: [PATCH] fix: find paragraphs in elements with images in docx Signed-off-by: Manuel030 --- docling/backend/msword_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f1362220..be1c8223 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -123,6 +123,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None + doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) return doc else: @@ -188,6 +189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)