This commit is contained in:
benichou 2025-07-11 12:13:33 +02:00 committed by GitHub
commit b6765b0c09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -17,6 +17,8 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer from docling_core.types.doc.document import ContentLayer
from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from pptx import Presentation from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@ -45,6 +47,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.pptx_obj = None self.pptx_obj = None
self.valid = False self.valid = False
self.xpath_expr = etree.XPath(
".//a:blip",
namespaces={
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
},
)
self.xpath_expr = etree.XPath(".//a:blip", namespaces={
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
})
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
self.pptx_obj = Presentation(self.path_or_stream) self.pptx_obj = Presentation(self.path_or_stream)
@ -232,20 +245,35 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
) )
return return
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size): def handle_pictures(
self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide
):
def get_pptx_image(drawing_blip):
rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
rel = slide.part.rels.get(rId)
image_part = rel.target_part
image_data = image_part.blob
return image_data
# Open it with PIL # Open it with PIL
try: try:
# Get the image bytes # Get the image bytes
image = shape.image # Open it with PIL
image_bytes = image.blob image_data = get_pptx_image(drawing_blip)
im_dpi, _ = image.dpi image_bytes = BytesIO(image_data)
pil_image = Image.open(BytesIO(image_bytes)) pil_image = Image.open(image_bytes)
im_dpi, _ = pil_image.info.get("dpi", (72, 72))
# shape has picture # shape has picture
prov = self.generate_prov(shape, slide_ind, "", slide_size) prov = self.generate_prov(shape, slide_ind, "", slide_size)
doc.add_picture( doc.add_picture(
parent=parent_slide, parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), image=ImageRef.from_pil(image=pil_image, dpi=int(im_dpi)),
image=ImageRef.from_pil(image=pil_image, dpi=int(im_dpi)),
caption=None, caption=None,
prov=prov, prov=prov,
) )
@ -344,9 +372,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size) self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Pictures # Handle Pictures
if hasattr(shape, "image"): drawing_blip = self.xpath_expr(shape.element)
<<<<<<< HEAD
if drawing_blip: #ensure there is a drwaing blip
=======
if drawing_blip: # ensure there is a drawing blip
>>>>>>> 4e8bf2c (fix/adding the missing slide size argument in the handle pictures in the mspowerpoint_backend.py file and adding generate=True in the verify export method in the pytest for pptx to ensure the pytest passes appropriately Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr)
self.handle_pictures( self.handle_pictures(
shape, parent_slide, slide_ind, doc, slide_size shape,
parent_slide,
slide_ind,
doc,
slide_size,
drawing_blip,
slide,
) )
# If shape doesn't have any text, move on to the next shape # If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"): if not hasattr(shape, "text"):