diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 231d6224..9681e1d0 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -421,4 +421,23 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB for shape in slide.shapes: handle_shapes(shape, parent_slide, slide_ind, doc, slide_size) + # Handle notes slide: add non-empty notes as a text item under the slide group + if slide.has_notes_slide: + notes_slide = slide.notes_slide + # notes_text_frame is None when the notes slide has no notes placeholder + text_frame = notes_slide.notes_text_frame + notes_text = text_frame.text.strip() if text_frame is not None else "" + if notes_text: + # Placeholder (all-zero) bounding box for the notes text + bbox = BoundingBox(l=0, t=0, r=0, b=0) + prov = ProvenanceItem( + page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox + ) + doc.add_text( + label=DocItemLabel.TEXT, + parent=parent_slide, + text=notes_text, + prov=prov, + ) + return doc diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt index ba86c3ba..1456647e 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt @@ -10,26 +10,29 @@ item-0 at level 0: unspecified: group _root_ item-9 at level 2: paragraph: Bar item-10 at level 2: paragraph: And baz things item-11 at level 2: paragraph: A rectangle shape with this text inside. - item-12 at level 1: chapter: group slide-2 - item-13 at level 2: ordered_list: group list - item-14 at level 3: list_item: List item4 - item-15 at level 3: list_item: List item5 - item-16 at level 3: list_item: List item6 - item-17 at level 2: list: group list - item-18 at level 3: list_item: I1 - item-19 at level 3: list_item: I2 - item-20 at level 3: list_item: I3 - item-21 at level 3: list_item: I4 - item-22 at level 2: paragraph: Some info: - item-23 at level 2: list: group list - item-24 at level 3: list_item: Item A - item-25 at level 3: list_item: Item B - item-26 at level 2: paragraph: Maybe a list? 
- item-27 at level 2: ordered_list: group list - item-28 at level 3: list_item: List1 - item-29 at level 3: list_item: List2 - item-30 at level 3: list_item: List3 - item-31 at level 2: list: group list - item-32 at level 3: list_item: l1 - item-33 at level 3: list_item: l2 - item-34 at level 3: list_item: l3 \ No newline at end of file + item-12 at level 2: text: Some notes on the second slide. + item-13 at level 1: chapter: group slide-2 + item-14 at level 2: ordered_list: group list + item-15 at level 3: list_item: List item4 + item-16 at level 3: list_item: List item5 + item-17 at level 3: list_item: List item6 + item-18 at level 2: list: group list + item-19 at level 3: list_item: I1 + item-20 at level 3: list_item: I2 + item-21 at level 3: list_item: I3 + item-22 at level 3: list_item: I4 + item-23 at level 2: paragraph: Some info: + item-24 at level 2: list: group list + item-25 at level 3: list_item: Item A + item-26 at level 3: list_item: Item B + item-27 at level 2: paragraph: Maybe a list? + item-28 at level 2: ordered_list: group list + item-29 at level 3: list_item: List1 + item-30 at level 3: list_item: List2 + item-31 at level 3: list_item: List3 + item-32 at level 2: list: group list + item-33 at level 3: list_item: l1 + item-34 at level 3: list_item: l2 + item-35 at level 3: list_item: l3 + item-36 at level 2: text: Final notes on the third slide. +Second line of notes. 
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index b24c46ed..3ada4834 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -4,7 +4,7 @@ "name": "powerpoint_sample", "origin": { "mimetype": "application/vnd.ms-powerpoint", - "binary_hash": 1640759611026400292, + "binary_hash": 15572290240354948364, "filename": "powerpoint_sample.pptx" }, "furniture": { @@ -75,6 +75,9 @@ }, { "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" } ], "content_layer": "body", @@ -94,19 +97,22 @@ "$ref": "#/groups/4" }, { - "$ref": "#/texts/15" + "$ref": "#/texts/16" }, { "$ref": "#/groups/5" }, { - "$ref": "#/texts/18" + "$ref": "#/texts/19" }, { "$ref": "#/groups/6" }, { "$ref": "#/groups/7" + }, + { + "$ref": "#/texts/26" } ], "content_layer": "body", @@ -119,14 +125,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/8" - }, { "$ref": "#/texts/9" }, { "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" } ], "content_layer": "body", @@ -139,9 +145,6 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/11" - }, { "$ref": "#/texts/12" }, @@ -150,6 +153,9 @@ }, { "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" } ], "content_layer": "body", @@ -163,10 +169,10 @@ }, "children": [ { - "$ref": "#/texts/16" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -179,14 +185,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/19" - }, { "$ref": "#/texts/20" }, { "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" } ], "content_layer": "body", @@ -199,14 +205,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/22" - }, { "$ref": "#/texts/23" }, { "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -433,6 +439,32 @@ }, { "self_ref": 
"#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "text", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 31 + ] + } + ], + "orig": "Some notes on the second slide.", + "text": "Some notes on the second slide." + }, + { + "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/3" }, @@ -461,7 +493,7 @@ "marker": "1." }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/3" }, @@ -490,7 +522,7 @@ "marker": "2." }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/11", "parent": { "$ref": "#/groups/3" }, @@ -519,7 +551,7 @@ "marker": "3." }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/12", "parent": { "$ref": "#/groups/4" }, @@ -548,7 +580,7 @@ "marker": "-" }, { - "self_ref": "#/texts/12", + "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/4" }, @@ -577,7 +609,7 @@ "marker": "-" }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/14", "parent": { "$ref": "#/groups/4" }, @@ -606,7 +638,7 @@ "marker": "-" }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/4" }, @@ -635,7 +667,7 @@ "marker": "-" }, { - "self_ref": "#/texts/15", + "self_ref": "#/texts/16", "parent": { "$ref": "#/groups/2" }, @@ -662,7 +694,7 @@ "text": "Some info:" }, { - "self_ref": "#/texts/16", + "self_ref": "#/texts/17", "parent": { "$ref": "#/groups/5" }, @@ -691,7 +723,7 @@ "marker": "-" }, { - "self_ref": "#/texts/17", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -720,7 +752,7 @@ "marker": "-" }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/19", "parent": { "$ref": "#/groups/2" }, @@ -747,7 +779,7 @@ "text": "Maybe a list?" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/20", "parent": { "$ref": "#/groups/6" }, @@ -776,7 +808,7 @@ "marker": "1." 
}, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/21", "parent": { "$ref": "#/groups/6" }, @@ -805,7 +837,7 @@ "marker": "2." }, { - "self_ref": "#/texts/21", + "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -834,7 +866,7 @@ "marker": "3." }, { - "self_ref": "#/texts/22", + "self_ref": "#/texts/23", "parent": { "$ref": "#/groups/7" }, @@ -863,7 +895,7 @@ "marker": "-" }, { - "self_ref": "#/texts/23", + "self_ref": "#/texts/24", "parent": { "$ref": "#/groups/7" }, @@ -892,7 +924,7 @@ "marker": "-" }, { - "self_ref": "#/texts/24", + "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -919,6 +951,32 @@ "text": "l3", "enumerated": false, "marker": "-" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "text", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 53 + ] + } + ], + "orig": "Final notes on the third slide.\nSecond line of notes.", + "text": "Final notes on the third slide.\nSecond line of notes." } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md index ec26faa4..f2342578 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md @@ -25,6 +25,8 @@ And baz things A rectangle shape with this text inside. +Some notes on the second slide. + 1. List item4 2. List item5 3. List item6 @@ -47,4 +49,7 @@ Maybe a list? - l1 - l2 -- l3 \ No newline at end of file +- l3 + +Final notes on the third slide. +Second line of notes. \ No newline at end of file diff --git a/tests/data/pptx/powerpoint_sample.pptx b/tests/data/pptx/powerpoint_sample.pptx index acabf415..0818f283 100644 Binary files a/tests/data/pptx/powerpoint_sample.pptx and b/tests/data/pptx/powerpoint_sample.pptx differ