mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix: find paragraphs in elements with images in docx
Signed-off-by: Manuel030 <manuelenrique.plank@gmail.com>
This commit is contained in:
parent
d8959c6b19
commit
387dd659c1
@@ -123,6 +123,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
|
|
||||||
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
return doc
|
return doc
|
||||||
else:
|
else:
|
||||||
@@ -188,6 +189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif drawing_blip:
|
elif drawing_blip:
|
||||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||||
|
self._handle_text_elements(element, docx_obj, doc)
|
||||||
# Check for the sdt containers, like table of contents
|
# Check for the sdt containers, like table of contents
|
||||||
elif tag_name in ["sdt"]:
|
elif tag_name in ["sdt"]:
|
||||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||||
|
Loading…
Reference in New Issue
Block a user