feat: Add PPTX notes slides (#474)

* feat: Add PPTX notes slides

Presenter notes may have useful information and should also be extracted.

Signed-off-by: Maciej Wieczorek <maciej@wieczorek.co>

* feat: Move presenter notes into furniture

Signed-off-by: Maciej Wieczorek <maciej@wieczorek.co>

---------

Signed-off-by: Maciej Wieczorek <maciej@wieczorek.co>
This commit is contained in:
Maciej Wieczorek
2025-03-19 14:52:09 +01:00
committed by GitHub
parent f5adfb9724
commit b454aa1551
3 changed files with 111 additions and 33 deletions

View File

@@ -16,6 +16,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from PIL import Image, UnidentifiedImageError
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
# Handle notes slide
if slide.has_notes_slide:
notes_slide = slide.notes_slide
notes_text = notes_slide.notes_text_frame.text.strip()
if notes_text:
bbox = BoundingBox(l=0, t=0, r=0, b=0)
prov = ProvenanceItem(
page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
)
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_slide,
text=notes_text,
prov=prov,
content_layer=ContentLayer.FURNITURE,
)
return doc