From b7c3f2e9847be896fa2b6e566607a41b28961e1e Mon Sep 17 00:00:00 2001 From: Benichou Date: Tue, 8 Apr 2025 00:53:24 -0400 Subject: [PATCH 1/7] fix/implementing the capture of pptx_image with the same method from docx backend by extracting the drawing blip --- docling/backend/mspowerpoint_backend.py | 34 +++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 2de0da1b..86cc6c60 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -2,6 +2,8 @@ import logging from io import BytesIO from pathlib import Path from typing import Set, Union +from lxml import etree +from lxml.etree import XPath from docling_core.types.doc import ( BoundingBox, @@ -278,20 +280,31 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB ) return - def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size): + def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide): + + def get_pptx_image(drawing_blip): + rId = drawing_blip[0].get( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" + ) + rel = slide.part.rels.get(rId) + image_part = rel.target_part + image_data = image_part.blob + + return image_data # Open it with PIL try: # Get the image bytes - image = shape.image - image_bytes = image.blob - im_dpi, _ = image.dpi - pil_image = Image.open(BytesIO(image_bytes)) - + # Open it with PIL + image_data = get_pptx_image(drawing_blip) + image_bytes = BytesIO(image_data) + pil_image = Image.open(image_bytes) + im_dpi, _ = pil_image.info.get("dpi", (72, 72)) + # shape has picture prov = self.generate_prov(shape, slide_ind, "", slide_size) doc.add_picture( parent=parent_slide, - image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), + image=ImageRef.from_pil(image=pil_image, dpi=int(im_dpi)), caption=None, prov=prov, ) @@ -392,10 +405,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size) if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: # Handle Pictures - if hasattr(shape, "image"): - self.handle_pictures( - shape, parent_slide, slide_ind, doc, slide_size - ) + drawing_blip = self.xpath_expr(shape.element) + if drawing_blip: + self.handle_pictures(shape, parent_slide, slide_ind, doc, drawing_blip, slide) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): return From 02f77bbabd753ebf667aee8c2ac05808d0f922a0 Mon Sep 17 00:00:00 2001 From: Benichou Date: Tue, 8 Apr 2025 01:00:12 -0400 Subject: [PATCH 2/7] fix/adding a commit with a signature Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr Signed-off-by: Benichou --- docling/backend/mspowerpoint_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 86cc6c60..9aaf41f5 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -406,7 +406,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: # Handle Pictures drawing_blip = self.xpath_expr(shape.element) - if drawing_blip: + if drawing_blip: #ensure there is a drwaing blip self.handle_pictures(shape, parent_slide, slide_ind, doc, drawing_blip, slide) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): From 253cfab15edf07ef968b025b1cef9be72c9e7070 Mon Sep 17 00:00:00 2001 From: Benichou Date: Tue, 8 Apr 2025 11:33:52 -0400 Subject: [PATCH 3/7] fix/implementing the capture of pptx_image with the same method from docx backend by extracting the drawing blip Signed-off-by: Benichou --- docling/backend/mspowerpoint_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 9aaf41f5..66311de9 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -46,6 +46,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB self.pptx_obj = None self.valid = False + self.xpath_expr = etree.XPath(".//a:blip", namespaces={ + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + }) try: if isinstance(self.path_or_stream, BytesIO): self.pptx_obj = Presentation(self.path_or_stream) From 9fcace4e478fb1ce0a7212af67cb9f38fab9a8ec Mon Sep 17 00:00:00 2001 From: Benichou Date: Mon, 14 Apr 2025 22:43:44 -0400 Subject: [PATCH 4/7] fix: run poetry pre-commit all files to black format changes Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr --- docling/backend/mspowerpoint_backend.py | 26 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 66311de9..dbcc5af1 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -46,10 +46,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB self.pptx_obj = None self.valid = False - self.xpath_expr = etree.XPath(".//a:blip", namespaces={ - "a": "http://schemas.openxmlformats.org/drawingml/2006/main", - "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", - }) + self.xpath_expr = etree.XPath( + ".//a:blip", + namespaces={ + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + }, + ) try: if isinstance(self.path_or_stream, BytesIO): self.pptx_obj = Presentation(self.path_or_stream) @@ -284,8 +287,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB ) return - def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide): - + def handle_pictures( + self, shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide + ): + def get_pptx_image(drawing_blip): rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" @@ -295,6 +300,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB image_data = image_part.blob return image_data + # Open it with PIL try: # Get the image bytes @@ -303,7 +309,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) im_dpi, _ = pil_image.info.get("dpi", (72, 72)) - + # shape has picture prov = self.generate_prov(shape, slide_ind, "", slide_size) doc.add_picture( @@ -410,8 +416,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: # Handle Pictures drawing_blip = self.xpath_expr(shape.element) - if drawing_blip: #ensure there is a drwaing blip - self.handle_pictures(shape, parent_slide, slide_ind, doc, drawing_blip, slide) + if drawing_blip: # ensure there is a drwaing blip + self.handle_pictures( + shape, parent_slide, slide_ind, doc, drawing_blip, slide + ) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): return From 4e8bf2c4d366e95ccc0b6feb76f0f0673d1f715d Mon Sep 17 00:00:00 2001 From: Benichou Date: Tue, 13 May 2025 20:34:56 -0400 Subject: [PATCH 5/7] fix/adding the missing slide size argument in the handle pictures in the mspowerpoint_backend.py file and adding generate=True in the verify export method in the pytest for pptx to ensure the pytest passes appropriately Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr --- docling/backend/mspowerpoint_backend.py | 4 ++-- tests/test_backend_pptx.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index dbcc5af1..24704ca2 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -416,9 +416,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: # Handle Pictures drawing_blip = self.xpath_expr(shape.element) - if drawing_blip: # ensure there is a drwaing blip + if drawing_blip: # ensure there is a drawing blip self.handle_pictures( - shape, parent_slide, slide_ind, doc, drawing_blip, slide + shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide ) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 947e9e6b..72001bdc 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -45,13 +45,13 @@ def test_e2e_pptx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + assert verify_export(pred_md, str(gt_path) + ".md", generate=True), "export to md" pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( - pred_itxt, str(gt_path) + ".itxt" + pred_itxt, str(gt_path) + ".itxt", generate=True ), "export to indented-text" assert verify_document( From 2077e51033ec151cf5e711c0ae45c150a973a0cf Mon Sep 17 00:00:00 2001 From: Benichou Date: Tue, 13 May 2025 20:46:08 -0400 Subject: [PATCH 6/7] fix/removed generate=True in test_backend_pptx.py in verify_export method to not conflict with main branch Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr --- tests/test_backend_pptx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 72001bdc..947e9e6b 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -45,13 +45,13 @@ def test_e2e_pptx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md", generate=True), "export to md" + assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( - pred_itxt, str(gt_path) + ".itxt", generate=True + pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" assert verify_document( From 56208f6dc03343eae7fab0c8381cfe0b9858c2bb Mon Sep 17 00:00:00 2001 From: Benichou Date: Wed, 14 May 2025 15:35:50 -0400 Subject: [PATCH 7/7] fix/ran poetry run pre-commit run --all-files to format the file Signed-off-by: Franck Benichou franck.benichou@sciencespo.fr --- docling/backend/mspowerpoint_backend.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 24704ca2..ed6d7416 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -418,7 +418,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB drawing_blip = self.xpath_expr(shape.element) if drawing_blip: # ensure there is a drawing blip self.handle_pictures( - shape, parent_slide, slide_ind, doc, slide_size, drawing_blip, slide + shape, + parent_slide, + slide_ind, + doc, + slide_size, + drawing_blip, + slide, ) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"):