From 508bbed8f807b933ebd8106dcba14246b998fafd Mon Sep 17 00:00:00 2001
From: Maksym Lysak
Date: Mon, 25 Nov 2024 16:42:48 +0100
Subject: [PATCH] fixes for referencing drawing blip in wordx

Signed-off-by: Maksym Lysak
---
 docling/backend/msword_backend.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 089e94c2..a8b7e9d6 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
     TableData,
 )
 from lxml import etree
+from lxml.etree import XPath
 from PIL import Image
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
+
             # Check for Inline Images (blip elements)
-            drawing_blip = element.xpath(".//a:blip")
+            namespaces = {
+                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+                "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+            }
+            xpath_expr = XPath(".//a:blip", namespaces=namespaces)
+            drawing_blip = xpath_expr(element)
 
             # Check for Tables
             if element.tag.endswith("tbl"):