mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
fix/implementing the capture of pptx_image with the same method from docx backend by extracting the drawing blip
Signed-off-by: Benichou <fbenichou@deloitte.ca>
This commit is contained in:
parent
95e49705e8
commit
b0553e8812
@ -2,6 +2,8 @@ import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
@ -278,20 +280,31 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
)
|
||||
return
|
||||
|
||||
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide):
|
||||
|
||||
def get_pptx_image(drawing_blip):
|
||||
rId = drawing_blip[0].get(
|
||||
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
||||
)
|
||||
rel = slide.part.rels.get(rId)
|
||||
image_part = rel.target_part
|
||||
image_data = image_part.blob
|
||||
|
||||
return image_data
|
||||
# Open it with PIL
|
||||
try:
|
||||
# Get the image bytes
|
||||
image = shape.image
|
||||
image_bytes = image.blob
|
||||
im_dpi, _ = image.dpi
|
||||
pil_image = Image.open(BytesIO(image_bytes))
|
||||
|
||||
# Open it with PIL
|
||||
image_data = get_pptx_image(drawing_blip)
|
||||
image_bytes = BytesIO(image_data)
|
||||
pil_image = Image.open(image_bytes)
|
||||
im_dpi, _ = pil_image.info.get("dpi", (72, 72))
|
||||
|
||||
# shape has picture
|
||||
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
||||
doc.add_picture(
|
||||
parent=parent_slide,
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=int(im_dpi)),
|
||||
caption=None,
|
||||
prov=prov,
|
||||
)
|
||||
@ -392,10 +405,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||
# Handle Pictures
|
||||
if hasattr(shape, "image"):
|
||||
self.handle_pictures(
|
||||
shape, parent_slide, slide_ind, doc, slide_size
|
||||
)
|
||||
drawing_blip = self.xpath_expr(shape.element)
|
||||
if drawing_blip:
|
||||
self.handle_pictures(shape, parent_slide, slide_ind, doc, drawing_blip, slide)
|
||||
# If shape doesn't have any text, move on to the next shape
|
||||
if not hasattr(shape, "text"):
|
||||
return
|
||||
|
Loading…
Reference in New Issue
Block a user