fix: capture PPTX images using the same method as the DOCX backend, by extracting the drawing blip

Signed-off-by: Benichou <fbenichou@deloitte.ca>
This commit is contained in:
Benichou 2025-04-08 00:53:24 -04:00
parent 95e49705e8
commit b0553e8812

View File

@ -2,6 +2,8 @@ import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from lxml import etree
from lxml.etree import XPath
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox, BoundingBox,
@ -278,20 +280,31 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
) )
return return
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size): def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide):
def get_pptx_image(drawing_blip):
rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
rel = slide.part.rels.get(rId)
image_part = rel.target_part
image_data = image_part.blob
return image_data
# Open it with PIL # Open it with PIL
try: try:
# Get the image bytes # Get the image bytes
image = shape.image # Open it with PIL
image_bytes = image.blob image_data = get_pptx_image(drawing_blip)
im_dpi, _ = image.dpi image_bytes = BytesIO(image_data)
pil_image = Image.open(BytesIO(image_bytes)) pil_image = Image.open(image_bytes)
im_dpi, _ = pil_image.info.get("dpi", (72, 72))
# shape has picture # shape has picture
prov = self.generate_prov(shape, slide_ind, "", slide_size) prov = self.generate_prov(shape, slide_ind, "", slide_size)
doc.add_picture( doc.add_picture(
parent=parent_slide, parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), image=ImageRef.from_pil(image=pil_image, dpi=int(im_dpi)),
caption=None, caption=None,
prov=prov, prov=prov,
) )
@ -392,10 +405,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size) self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Pictures # Handle Pictures
if hasattr(shape, "image"): drawing_blip = self.xpath_expr(shape.element)
self.handle_pictures( if drawing_blip:
shape, parent_slide, slide_ind, doc, slide_size self.handle_pictures(shape, parent_slide, slide_ind, doc, drawing_blip, slide)
)
# If shape doesn't have any text, move on to the next shape # If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"): if not hasattr(shape, "text"):
return return