mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
fix: pptx shape order
Signed-off-by: Martin Wind <martin.wind@im-c.at>
This commit is contained in:
parent
2579d89510
commit
c75b75e8af
@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
from pptx.util import Mm
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
@ -416,8 +417,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
groupedshape, parent_slide, slide_ind, doc, slide_size
|
||||
)
|
||||
|
||||
# Loop through each shape in the slide
|
||||
for shape in slide.shapes:
|
||||
# Generate sort keys for shapes based on their top (cluster 3mm) and left positions.
|
||||
# Manually positioned boxes with a deviation of less than 3mm in their top position
|
||||
# will be sorted on the same line.
|
||||
def gen_sort_keys(shapes, max_top_distance=Mm(3)):
    """Yield ``(shape, (row_top, left))`` pairs used to put shapes in reading order.

    Shapes are visited by ascending ``top``. A shape whose ``top`` lies within
    ``max_top_distance`` of the first shape of the current row shares that
    row's ``row_top``; otherwise it starts a new row. Sorting the yielded
    pairs by their second element therefore orders shapes row by row and,
    inside a row, from left to right.

    Args:
        shapes: Iterable of objects exposing ``top`` and ``left`` attributes.
        max_top_distance: Largest ``top`` difference (same unit as ``top``)
            for which a shape still belongs to the current row.

    Yields:
        Tuples ``(shape, (row_top, left))``.
    """

    def _coord(value):
        # NOTE(review): python-pptx appears to report None for shapes without
        # an explicit position; comparing None with ints would make sorting
        # raise TypeError, so such shapes are treated as sitting at offset 0.
        return 0 if value is None else value

    row_top = None
    for shape in sorted(shapes, key=lambda s: _coord(s.top)):
        shape_top = _coord(shape.top)
        # The distance is measured against the row's first shape, not the
        # previous shape, so a row cannot drift downwards indefinitely.
        if row_top is None or abs(row_top - shape_top) > max_top_distance:
            row_top = shape_top
        yield (shape, (row_top, _coord(shape.left)))
|
||||
|
||||
# Process every shape on the slide in reading order: by top cluster first,
# then by left position within the same cluster.
ordered_shapes = sorted(gen_sort_keys(slide.shapes), key=lambda pair: pair[1])
for shape, _sort_key in ordered_shapes:
    handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
||||
|
||||
# Handle notes slide
|
||||
|
@ -1,15 +1,15 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: chapter: group slide-0
|
||||
item-2 at level 2: title: Test Table Slide
|
||||
item-3 at level 2: paragraph: With footnote
|
||||
item-4 at level 2: table with [9x7]
|
||||
item-3 at level 2: table with [9x7]
|
||||
item-4 at level 2: paragraph: With footnote
|
||||
item-5 at level 1: chapter: group slide-1
|
||||
item-6 at level 2: title: Second slide title
|
||||
item-7 at level 2: paragraph: Let’s introduce a list
|
||||
item-8 at level 2: paragraph: With foo
|
||||
item-9 at level 2: paragraph: Bar
|
||||
item-10 at level 2: paragraph: And baz things
|
||||
item-11 at level 2: paragraph: A rectangle shape with this text inside.
|
||||
item-7 at level 2: paragraph: A rectangle shape with this text inside.
|
||||
item-8 at level 2: paragraph: Let’s introduce a list
|
||||
item-9 at level 2: paragraph: With foo
|
||||
item-10 at level 2: paragraph: Bar
|
||||
item-11 at level 2: paragraph: And baz things
|
||||
item-12 at level 1: chapter: group slide-2
|
||||
item-13 at level 2: ordered_list: group list
|
||||
item-14 at level 3: list_item: List item4
|
||||
|
@ -42,10 +42,10 @@
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
"$ref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -310,6 +310,33 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 6180463.0,
|
||||
"t": 5221995.0,
|
||||
"r": 10256704.0,
|
||||
"b": 1344058.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
40
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "A rectangle shape with this text inside.",
|
||||
"text": "A rectangle shape with this text inside."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
@ -330,7 +357,7 @@
|
||||
"text": "Let’s introduce a list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
@ -357,7 +384,7 @@
|
||||
"text": "With foo"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
@ -384,7 +411,7 @@
|
||||
"text": "Bar"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
@ -410,33 +437,6 @@
|
||||
"orig": "And baz things",
|
||||
"text": "And baz things"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 6180463.0,
|
||||
"t": 5221995.0,
|
||||
"r": 10256704.0,
|
||||
"b": 1344058.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
40
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "A rectangle shape with this text inside.",
|
||||
"text": "A rectangle shape with this text inside."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
|
@ -1,7 +1,5 @@
|
||||
# Test Table Slide
|
||||
|
||||
With footnote
|
||||
|
||||
| | Class1 | Class1 | Class1 | Class2 | Class2 | Class2 |
|
||||
|----|-----------------|-----------------|----------|----------|----------|----------|
|
||||
| | A merged with B | A merged with B | C | A | B | C |
|
||||
@ -13,8 +11,12 @@ With footnote
|
||||
| R4 | | True | | True | False | False |
|
||||
| R4 | True | False | True | False | True | False |
|
||||
|
||||
With footnote
|
||||
|
||||
# Second slide title
|
||||
|
||||
A rectangle shape with this text inside.
|
||||
|
||||
Let’s introduce a list
|
||||
|
||||
With foo
|
||||
@ -23,8 +25,6 @@ Bar
|
||||
|
||||
And baz things
|
||||
|
||||
A rectangle shape with this text inside.
|
||||
|
||||
1. List item4
|
||||
2. List item5
|
||||
3. List item6
|
||||
|
@ -0,0 +1,35 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: chapter: group slide-0
|
||||
item-2 at level 2: title: Test Table Slide
|
||||
item-3 at level 2: table with [9x7]
|
||||
item-4 at level 2: paragraph: With footnote
|
||||
item-5 at level 1: chapter: group slide-1
|
||||
item-6 at level 2: title: Second slide title
|
||||
item-7 at level 2: paragraph: A rectangle shape with this text inside.
|
||||
item-8 at level 2: paragraph: Let’s introduce a list
|
||||
item-9 at level 2: paragraph: With foo
|
||||
item-10 at level 2: paragraph: Bar
|
||||
item-11 at level 2: paragraph: And baz things
|
||||
item-12 at level 1: chapter: group slide-2
|
||||
item-13 at level 2: ordered_list: group list
|
||||
item-14 at level 3: list_item: List item4
|
||||
item-15 at level 3: list_item: List item5
|
||||
item-16 at level 3: list_item: List item6
|
||||
item-17 at level 2: list: group list
|
||||
item-18 at level 3: list_item: I1
|
||||
item-19 at level 3: list_item: I2
|
||||
item-20 at level 3: list_item: I3
|
||||
item-21 at level 3: list_item: I4
|
||||
item-22 at level 2: paragraph: Some info:
|
||||
item-23 at level 2: list: group list
|
||||
item-24 at level 3: list_item: Item A
|
||||
item-25 at level 3: list_item: Item B
|
||||
item-26 at level 2: paragraph: Maybe a list?
|
||||
item-27 at level 2: ordered_list: group list
|
||||
item-28 at level 3: list_item: List1
|
||||
item-29 at level 3: list_item: List2
|
||||
item-30 at level 3: list_item: List3
|
||||
item-31 at level 2: list: group list
|
||||
item-32 at level 3: list_item: l1
|
||||
item-33 at level 3: list_item: l2
|
||||
item-34 at level 3: list_item: l3
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,50 @@
|
||||
# Test Table Slide
|
||||
|
||||
| | Class1 | Class1 | Class1 | Class2 | Class2 | Class2 |
|
||||
|----|-----------------|-----------------|----------|----------|----------|----------|
|
||||
| | A merged with B | A merged with B | C | A | B | C |
|
||||
| R1 | True | False | | False | True | True |
|
||||
| R2 | | | True | False | | |
|
||||
| R3 | False | | | | False | |
|
||||
| R3 | | True | | True | | |
|
||||
| R4 | | | False | | False | |
|
||||
| R4 | | True | | True | False | False |
|
||||
| R4 | True | False | True | False | True | False |
|
||||
|
||||
With footnote
|
||||
|
||||
# Second slide title
|
||||
|
||||
A rectangle shape with this text inside.
|
||||
|
||||
Let’s introduce a list
|
||||
|
||||
With foo
|
||||
|
||||
Bar
|
||||
|
||||
And baz things
|
||||
|
||||
1. List item4
|
||||
2. List item5
|
||||
3. List item6
|
||||
|
||||
- I1
|
||||
- I2
|
||||
- I3
|
||||
- I4
|
||||
|
||||
Some info:
|
||||
|
||||
- Item A
|
||||
- Item B
|
||||
|
||||
Maybe a list?
|
||||
|
||||
1. List1
|
||||
2. List2
|
||||
3. List3
|
||||
|
||||
- l1
|
||||
- l2
|
||||
- l3
|
BIN
tests/data/pptx/powerpoint_sample_unordered.pptx
Normal file
BIN
tests/data/pptx/powerpoint_sample_unordered.pptx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user