From 4c5b0f7894811f92d7f2358380ca5dace81c0433 Mon Sep 17 00:00:00 2001 From: Maciej Wieczorek Date: Wed, 19 Mar 2025 11:59:55 +0100 Subject: [PATCH] feat: Move presenter notes into furniture Signed-off-by: Maciej Wieczorek --- docling/backend/mspowerpoint_backend.py | 2 + .../docling_v2/powerpoint_sample.pptx.itxt | 49 +++++++++---------- .../docling_v2/powerpoint_sample.pptx.json | 2 + .../docling_v2/powerpoint_sample.pptx.md | 7 +-- 4 files changed, 28 insertions(+), 32 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 9681e1d0..a752e8dc 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -16,6 +16,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer from PIL import Image, UnidentifiedImageError from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER @@ -435,6 +436,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB parent=parent_slide, text=notes_text, prov=prov, + content_layer=ContentLayer.FURNITURE, ) return doc diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt index 1456647e..ba86c3ba 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt @@ -10,29 +10,26 @@ item-0 at level 0: unspecified: group _root_ item-9 at level 2: paragraph: Bar item-10 at level 2: paragraph: And baz things item-11 at level 2: paragraph: A rectangle shape with this text inside. - item-12 at level 2: text: Some notes on the second slide. - item-13 at level 1: chapter: group slide-2 - item-14 at level 2: ordered_list: group list - item-15 at level 3: list_item: List item4 - item-16 at level 3: list_item: List item5 - item-17 at level 3: list_item: List item6 - item-18 at level 2: list: group list - item-19 at level 3: list_item: I1 - item-20 at level 3: list_item: I2 - item-21 at level 3: list_item: I3 - item-22 at level 3: list_item: I4 - item-23 at level 2: paragraph: Some info: - item-24 at level 2: list: group list - item-25 at level 3: list_item: Item A - item-26 at level 3: list_item: Item B - item-27 at level 2: paragraph: Maybe a list? - item-28 at level 2: ordered_list: group list - item-29 at level 3: list_item: List1 - item-30 at level 3: list_item: List2 - item-31 at level 3: list_item: List3 - item-32 at level 2: list: group list - item-33 at level 3: list_item: l1 - item-34 at level 3: list_item: l2 - item-35 at level 3: list_item: l3 - item-36 at level 2: text: Final notes on the third slide. -Second line of notes. \ No newline at end of file + item-12 at level 1: chapter: group slide-2 + item-13 at level 2: ordered_list: group list + item-14 at level 3: list_item: List item4 + item-15 at level 3: list_item: List item5 + item-16 at level 3: list_item: List item6 + item-17 at level 2: list: group list + item-18 at level 3: list_item: I1 + item-19 at level 3: list_item: I2 + item-20 at level 3: list_item: I3 + item-21 at level 3: list_item: I4 + item-22 at level 2: paragraph: Some info: + item-23 at level 2: list: group list + item-24 at level 3: list_item: Item A + item-25 at level 3: list_item: Item B + item-26 at level 2: paragraph: Maybe a list? + item-27 at level 2: ordered_list: group list + item-28 at level 3: list_item: List1 + item-29 at level 3: list_item: List2 + item-30 at level 3: list_item: List3 + item-31 at level 2: list: group list + item-32 at level 3: list_item: l1 + item-33 at level 3: list_item: l2 + item-34 at level 3: list_item: l3 \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index 3ada4834..fb441563 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -443,6 +443,7 @@ "$ref": "#/groups/1" }, "children": [], + "content_layer": "furniture", "label": "text", "prov": [ { @@ -958,6 +959,7 @@ "$ref": "#/groups/2" }, "children": [], + "content_layer": "furniture", "label": "text", "prov": [ { diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md index f2342578..ec26faa4 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.md @@ -25,8 +25,6 @@ And baz things A rectangle shape with this text inside. -Some notes on the second slide. - 1. List item4 2. List item5 3. List item6 @@ -49,7 +47,4 @@ Maybe a list? - l1 - l2 -- l3 - -Final notes on the third slide. -Second line of notes. \ No newline at end of file +- l3 \ No newline at end of file