mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix: pptx shape order
Signed-off-by: Martin Wind <martin.wind@im-c.at>
This commit is contained in:
parent
2579d89510
commit
c75b75e8af
@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
|
|||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||||
|
from pptx.util import Mm
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend,
|
||||||
@ -416,8 +417,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
groupedshape, parent_slide, slide_ind, doc, slide_size
|
groupedshape, parent_slide, slide_ind, doc, slide_size
|
||||||
)
|
)
|
||||||
|
|
||||||
# Loop through each shape in the slide
|
# Generate sort keys for shapes based on their top (cluster 3mm) and left positions.
|
||||||
for shape in slide.shapes:
|
# Manually positioned boxes with a deviation of less than 3mm in their top position
|
||||||
|
# will be sorted on the same line.
|
||||||
|
def gen_sort_keys(shapes, max_top_distance=Mm(3)):
    """Yield ``(shape, (cluster_top, left))`` pairs used to order shapes.

    Shapes are visited in ascending ``top`` order. A shape whose ``top``
    is within ``max_top_distance`` of the first shape of the current
    cluster gets that cluster's ``top`` as the first key component, so
    boxes that were positioned by hand on (almost) the same line sort
    left-to-right instead of by tiny vertical offsets.

    Args:
        shapes: Iterable of objects exposing ``top`` and ``left``.
        max_top_distance: Maximum deviation from the cluster's first
            ``top`` for a shape to still count as being on that line.

    Yields:
        Tuples ``(shape, (cluster_top, left))``; sort on the second
        element to obtain reading order.
    """

    def _coord(value):
        # A shape without an own position can report ``None`` for
        # ``top``/``left``; comparing ``None`` with an int would raise
        # TypeError while sorting. Treating it as 0 places such shapes
        # first -- NOTE(review): confirm this is the desired placement.
        return 0 if value is None else value

    cluster_top = None
    for shape in sorted(shapes, key=lambda s: _coord(s.top)):
        shape_top = _coord(shape.top)
        # Start a new cluster once the shape is further than the
        # threshold from the cluster's first (smallest) top value.
        if cluster_top is None or abs(cluster_top - shape_top) > max_top_distance:
            cluster_top = shape_top
        yield (shape, (cluster_top, _coord(shape.left)))
|
||||||
|
|
||||||
|
# Loop through each shape on the slide, ordered by top cluster and then by left position
|
||||||
|
for shape, sort in sorted(
|
||||||
|
gen_sort_keys(slide.shapes),
|
||||||
|
key=lambda s: s[1],
|
||||||
|
):
|
||||||
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
||||||
|
|
||||||
# Handle notes slide
|
# Handle notes slide
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
item-0 at level 0: unspecified: group _root_
|
item-0 at level 0: unspecified: group _root_
|
||||||
item-1 at level 1: chapter: group slide-0
|
item-1 at level 1: chapter: group slide-0
|
||||||
item-2 at level 2: title: Test Table Slide
|
item-2 at level 2: title: Test Table Slide
|
||||||
item-3 at level 2: paragraph: With footnote
|
item-3 at level 2: table with [9x7]
|
||||||
item-4 at level 2: table with [9x7]
|
item-4 at level 2: paragraph: With footnote
|
||||||
item-5 at level 1: chapter: group slide-1
|
item-5 at level 1: chapter: group slide-1
|
||||||
item-6 at level 2: title: Second slide title
|
item-6 at level 2: title: Second slide title
|
||||||
item-7 at level 2: paragraph: Let’s introduce a list
|
item-7 at level 2: paragraph: A rectangle shape with this text inside.
|
||||||
item-8 at level 2: paragraph: With foo
|
item-8 at level 2: paragraph: Let’s introduce a list
|
||||||
item-9 at level 2: paragraph: Bar
|
item-9 at level 2: paragraph: With foo
|
||||||
item-10 at level 2: paragraph: And baz things
|
item-10 at level 2: paragraph: Bar
|
||||||
item-11 at level 2: paragraph: A rectangle shape with this text inside.
|
item-11 at level 2: paragraph: And baz things
|
||||||
item-12 at level 1: chapter: group slide-2
|
item-12 at level 1: chapter: group slide-2
|
||||||
item-13 at level 2: ordered_list: group list
|
item-13 at level 2: ordered_list: group list
|
||||||
item-14 at level 3: list_item: List item4
|
item-14 at level 3: list_item: List item4
|
||||||
|
@ -42,10 +42,10 @@
|
|||||||
"$ref": "#/texts/0"
|
"$ref": "#/texts/0"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/1"
|
"$ref": "#/tables/0"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/tables/0"
|
"$ref": "#/texts/1"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -310,6 +310,33 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 6180463.0,
|
||||||
|
"t": 5221995.0,
|
||||||
|
"r": 10256704.0,
|
||||||
|
"b": 1344058.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
40
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"orig": "A rectangle shape with this text inside.",
|
||||||
|
"text": "A rectangle shape with this text inside."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
"page_no": 2,
|
"page_no": 2,
|
||||||
@ -330,7 +357,7 @@
|
|||||||
"text": "Let’s introduce a list"
|
"text": "Let’s introduce a list"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/5",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/1"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
@ -357,7 +384,7 @@
|
|||||||
"text": "With foo"
|
"text": "With foo"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/5",
|
"self_ref": "#/texts/6",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/1"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
@ -384,7 +411,7 @@
|
|||||||
"text": "Bar"
|
"text": "Bar"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/7",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/1"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
@ -410,33 +437,6 @@
|
|||||||
"orig": "And baz things",
|
"orig": "And baz things",
|
||||||
"text": "And baz things"
|
"text": "And baz things"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"self_ref": "#/texts/7",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/1"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "paragraph",
|
|
||||||
"prov": [
|
|
||||||
{
|
|
||||||
"page_no": 2,
|
|
||||||
"bbox": {
|
|
||||||
"l": 6180463.0,
|
|
||||||
"t": 5221995.0,
|
|
||||||
"r": 10256704.0,
|
|
||||||
"b": 1344058.0,
|
|
||||||
"coord_origin": "BOTTOMLEFT"
|
|
||||||
},
|
|
||||||
"charspan": [
|
|
||||||
0,
|
|
||||||
40
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"orig": "A rectangle shape with this text inside.",
|
|
||||||
"text": "A rectangle shape with this text inside."
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/8",
|
"self_ref": "#/texts/8",
|
||||||
"parent": {
|
"parent": {
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
# Test Table Slide
|
# Test Table Slide
|
||||||
|
|
||||||
With footnote
|
|
||||||
|
|
||||||
| | Class1 | Class1 | Class1 | Class2 | Class2 | Class2 |
|
| | Class1 | Class1 | Class1 | Class2 | Class2 | Class2 |
|
||||||
|----|-----------------|-----------------|----------|----------|----------|----------|
|
|----|-----------------|-----------------|----------|----------|----------|----------|
|
||||||
| | A merged with B | A merged with B | C | A | B | C |
|
| | A merged with B | A merged with B | C | A | B | C |
|
||||||
@ -13,8 +11,12 @@ With footnote
|
|||||||
| R4 | | True | | True | False | False |
|
| R4 | | True | | True | False | False |
|
||||||
| R4 | True | False | True | False | True | False |
|
| R4 | True | False | True | False | True | False |
|
||||||
|
|
||||||
|
With footnote
|
||||||
|
|
||||||
# Second slide title
|
# Second slide title
|
||||||
|
|
||||||
|
A rectangle shape with this text inside.
|
||||||
|
|
||||||
Let’s introduce a list
|
Let’s introduce a list
|
||||||
|
|
||||||
With foo
|
With foo
|
||||||
@ -23,8 +25,6 @@ Bar
|
|||||||
|
|
||||||
And baz things
|
And baz things
|
||||||
|
|
||||||
A rectangle shape with this text inside.
|
|
||||||
|
|
||||||
1. List item4
|
1. List item4
|
||||||
2. List item5
|
2. List item5
|
||||||
3. List item6
|
3. List item6
|
||||||
|
@ -0,0 +1,35 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: chapter: group slide-0
|
||||||
|
item-2 at level 2: title: Test Table Slide
|
||||||
|
item-3 at level 2: table with [9x7]
|
||||||
|
item-4 at level 2: paragraph: With footnote
|
||||||
|
item-5 at level 1: chapter: group slide-1
|
||||||
|
item-6 at level 2: title: Second slide title
|
||||||
|
item-7 at level 2: paragraph: A rectangle shape with this text inside.
|
||||||
|
item-8 at level 2: paragraph: Let’s introduce a list
|
||||||
|
item-9 at level 2: paragraph: With foo
|
||||||
|
item-10 at level 2: paragraph: Bar
|
||||||
|
item-11 at level 2: paragraph: And baz things
|
||||||
|
item-12 at level 1: chapter: group slide-2
|
||||||
|
item-13 at level 2: ordered_list: group list
|
||||||
|
item-14 at level 3: list_item: List item4
|
||||||
|
item-15 at level 3: list_item: List item5
|
||||||
|
item-16 at level 3: list_item: List item6
|
||||||
|
item-17 at level 2: list: group list
|
||||||
|
item-18 at level 3: list_item: I1
|
||||||
|
item-19 at level 3: list_item: I2
|
||||||
|
item-20 at level 3: list_item: I3
|
||||||
|
item-21 at level 3: list_item: I4
|
||||||
|
item-22 at level 2: paragraph: Some info:
|
||||||
|
item-23 at level 2: list: group list
|
||||||
|
item-24 at level 3: list_item: Item A
|
||||||
|
item-25 at level 3: list_item: Item B
|
||||||
|
item-26 at level 2: paragraph: Maybe a list?
|
||||||
|
item-27 at level 2: ordered_list: group list
|
||||||
|
item-28 at level 3: list_item: List1
|
||||||
|
item-29 at level 3: list_item: List2
|
||||||
|
item-30 at level 3: list_item: List3
|
||||||
|
item-31 at level 2: list: group list
|
||||||
|
item-32 at level 3: list_item: l1
|
||||||
|
item-33 at level 3: list_item: l2
|
||||||
|
item-34 at level 3: list_item: l3
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,50 @@
|
|||||||
|
# Test Table Slide
|
||||||
|
|
||||||
|
| | Class1 | Class1 | Class1 | Class2 | Class2 | Class2 |
|
||||||
|
|----|-----------------|-----------------|----------|----------|----------|----------|
|
||||||
|
| | A merged with B | A merged with B | C | A | B | C |
|
||||||
|
| R1 | True | False | | False | True | True |
|
||||||
|
| R2 | | | True | False | | |
|
||||||
|
| R3 | False | | | | False | |
|
||||||
|
| R3 | | True | | True | | |
|
||||||
|
| R4 | | | False | | False | |
|
||||||
|
| R4 | | True | | True | False | False |
|
||||||
|
| R4 | True | False | True | False | True | False |
|
||||||
|
|
||||||
|
With footnote
|
||||||
|
|
||||||
|
# Second slide title
|
||||||
|
|
||||||
|
A rectangle shape with this text inside.
|
||||||
|
|
||||||
|
Let’s introduce a list
|
||||||
|
|
||||||
|
With foo
|
||||||
|
|
||||||
|
Bar
|
||||||
|
|
||||||
|
And baz things
|
||||||
|
|
||||||
|
1. List item4
|
||||||
|
2. List item5
|
||||||
|
3. List item6
|
||||||
|
|
||||||
|
- I1
|
||||||
|
- I2
|
||||||
|
- I3
|
||||||
|
- I4
|
||||||
|
|
||||||
|
Some info:
|
||||||
|
|
||||||
|
- Item A
|
||||||
|
- Item B
|
||||||
|
|
||||||
|
Maybe a list?
|
||||||
|
|
||||||
|
1. List1
|
||||||
|
2. List2
|
||||||
|
3. List3
|
||||||
|
|
||||||
|
- l1
|
||||||
|
- l2
|
||||||
|
- l3
|
BIN
tests/data/pptx/powerpoint_sample_unordered.pptx
Normal file
BIN
tests/data/pptx/powerpoint_sample_unordered.pptx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user