From a54f583415a3a8dfb783bb10ca109b20e223edb4 Mon Sep 17 00:00:00 2001
From: Maksym Lysak
Date: Fri, 15 Nov 2024 14:35:25 +0100
Subject: [PATCH] Added picture data for pptx pictures

Signed-off-by: Maksym Lysak
---
 docling/backend/mspowerpoint_backend.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index b71cd859..fc59adb3 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -10,11 +10,13 @@ from docling_core.types.doc import (
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    ImageRef,
     ProvenanceItem,
     Size,
     TableCell,
     TableData,
 )
+from PIL import Image
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
@@ -268,9 +270,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return
 
     def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+        # Get the image bytes
+        image = shape.image
+        image_bytes = image.blob
+        # Open it with PIL
+        pil_image = Image.open(BytesIO(image_bytes))
+
         # shape has picture
         prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(parent=parent_slide, caption=None, prov=prov)
+        doc.add_picture(
+            parent=parent_slide,
+            image=ImageRef.from_pil(image=pil_image, dpi=72),
+            caption=None,
+            prov=prov,
+        )
         return
 
     def handle_tables(self, shape, parent_slide, slide_ind, doc):