add test file

Signed-off-by: Manuel030 <manuelenrique.plank@gmail.com>
This commit is contained in:
Manuel030 2025-04-29 16:18:09 +02:00
parent 387dd659c1
commit 50c108c6d3
8 changed files with 328 additions and 85 deletions

Binary file not shown.

View File

@ -0,0 +1,9 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transkript
item-2 at level 1: paragraph: 5. März 2025, 01:35PM
item-3 at level 1: paragraph:
item-4 at level 1: picture
item-5 at level 1: inline: group group
item-6 at level 2: paragraph: User
item-7 at level 2: paragraph: 0:08
Ein beispielhafter Paragraph.

View File

@ -0,0 +1,162 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "paragraph_in_image",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 15839552996279065250,
"filename": "paragraph_in_image.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Transkript",
"text": "Transkript",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "5. M\u00e4rz 2025, 01:35PM",
"text": "5. M\u00e4rz 2025, 01:35PM"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "User",
"text": "User",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:08\nEin beispielhafter Paragraph.",
"text": "0:08\nEin beispielhafter Paragraph."
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": ""
},
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,8 @@
**Transkript**
5. März 2025, 01:35PM
<!-- image -->
**User** 0:08
Ein beispielhafter Paragraph.

View File

@ -2,7 +2,10 @@ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Test with three images in unusual formats item-1 at level 1: paragraph: Test with three images in unusual formats
item-2 at level 1: paragraph: Raster in emf: item-2 at level 1: paragraph: Raster in emf:
item-3 at level 1: picture item-3 at level 1: picture
item-4 at level 1: paragraph: Vector in emf: item-4 at level 1: paragraph:
item-5 at level 1: picture item-5 at level 1: paragraph: Vector in emf:
item-6 at level 1: paragraph: Raster in webp: item-6 at level 1: picture
item-7 at level 1: picture item-7 at level 1: paragraph:
item-8 at level 1: paragraph: Raster in webp:
item-9 at level 1: picture
item-10 at level 1: paragraph:

View File

@ -29,14 +29,23 @@
{ {
"$ref": "#/texts/2" "$ref": "#/texts/2"
}, },
{
"$ref": "#/pictures/1"
},
{ {
"$ref": "#/texts/3" "$ref": "#/texts/3"
}, },
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
},
{ {
"$ref": "#/pictures/2" "$ref": "#/pictures/2"
},
{
"$ref": "#/texts/6"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -78,8 +87,8 @@
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "Vector in emf:", "orig": "",
"text": "Vector in emf:" "text": ""
}, },
{ {
"self_ref": "#/texts/3", "self_ref": "#/texts/3",
@ -90,8 +99,44 @@
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "Vector in emf:",
"text": "Vector in emf:"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Raster in webp:", "orig": "Raster in webp:",
"text": "Raster in webp:" "text": "Raster in webp:"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
} }
], ],
"pictures": [ "pictures": [

View File

@ -3,27 +3,28 @@ item-0 at level 0: unspecified: group _root_
item-2 at level 1: title: Swimming in the lake item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck item-3 at level 2: paragraph: Duck
item-4 at level 2: picture item-4 at level 2: picture
item-5 at level 2: paragraph: Figure 1: This is a cute duckling item-5 at level 2: paragraph:
item-6 at level 2: section_header: Lets swim! item-6 at level 2: paragraph: Figure 1: This is a cute duckling
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: item-7 at level 2: section_header: Lets swim!
item-8 at level 3: list: group list item-8 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
item-9 at level 4: list_item: You can relax and look around item-9 at level 3: list: group list
item-10 at level 4: list_item: Paddle about item-10 at level 4: list_item: You can relax and look around
item-11 at level 4: list_item: Enjoy summer warmth item-11 at level 4: list_item: Paddle about
item-12 at level 3: paragraph: Also, dont forget: item-12 at level 4: list_item: Enjoy summer warmth
item-13 at level 3: list: group list item-13 at level 3: paragraph: Also, dont forget:
item-14 at level 4: list_item: Wear sunglasses item-14 at level 3: list: group list
item-15 at level 4: list_item: Dont forget to drink water item-15 at level 4: list_item: Wear sunglasses
item-16 at level 4: list_item: Use sun cream item-16 at level 4: list_item: Dont forget to drink water
item-17 at level 3: paragraph: Hmm, what else… item-17 at level 4: list_item: Use sun cream
item-18 at level 3: section_header: Lets eat item-18 at level 3: paragraph: Hmm, what else…
item-19 at level 4: paragraph: After we had a good day of swimm ... , its important to eat something nice item-19 at level 3: section_header: Lets eat
item-20 at level 4: paragraph: I like to eat leaves item-20 at level 4: paragraph: After we had a good day of swimm ... , its important to eat something nice
item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: item-21 at level 4: paragraph: I like to eat leaves
item-22 at level 4: table with [4x3] item-22 at level 4: paragraph: Here are some interesting things a respectful duck could eat:
item-23 at level 4: paragraph: item-23 at level 4: table with [4x3]
item-24 at level 4: paragraph: And lets add another list in the end: item-24 at level 4: paragraph:
item-25 at level 4: list: group list item-25 at level 4: paragraph: And lets add another list in the end:
item-26 at level 5: list_item: Leaves item-26 at level 4: list: group list
item-27 at level 5: list_item: Berries item-27 at level 5: list_item: Leaves
item-28 at level 5: list_item: Grain item-28 at level 5: list_item: Berries
item-29 at level 5: list_item: Grain

View File

@ -32,17 +32,17 @@
{ {
"self_ref": "#/groups/0", "self_ref": "#/groups/0",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [ "children": [
{
"$ref": "#/texts/6"
},
{ {
"$ref": "#/texts/7" "$ref": "#/texts/7"
}, },
{ {
"$ref": "#/texts/8" "$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -52,17 +52,17 @@
{ {
"self_ref": "#/groups/1", "self_ref": "#/groups/1",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [ "children": [
{
"$ref": "#/texts/10"
},
{ {
"$ref": "#/texts/11" "$ref": "#/texts/11"
}, },
{ {
"$ref": "#/texts/12" "$ref": "#/texts/12"
},
{
"$ref": "#/texts/13"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -72,17 +72,17 @@
{ {
"self_ref": "#/groups/2", "self_ref": "#/groups/2",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [ "children": [
{
"$ref": "#/texts/20"
},
{ {
"$ref": "#/texts/21" "$ref": "#/texts/21"
}, },
{ {
"$ref": "#/texts/22" "$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -120,6 +120,9 @@
}, },
{ {
"$ref": "#/texts/4" "$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -149,32 +152,44 @@
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "Figure 1: This is a cute duckling", "orig": "",
"text": "Figure 1: This is a cute duckling" "text": ""
}, },
{ {
"self_ref": "#/texts/4", "self_ref": "#/texts/4",
"parent": { "parent": {
"$ref": "#/texts/1" "$ref": "#/texts/1"
}, },
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Figure 1: This is a cute duckling",
"text": "Figure 1: This is a cute duckling"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/texts/1"
},
"children": [ "children": [
{ {
"$ref": "#/texts/5" "$ref": "#/texts/6"
}, },
{ {
"$ref": "#/groups/0" "$ref": "#/groups/0"
}, },
{ {
"$ref": "#/texts/9" "$ref": "#/texts/10"
}, },
{ {
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
{ {
"$ref": "#/texts/13" "$ref": "#/texts/14"
}, },
{ {
"$ref": "#/texts/14" "$ref": "#/texts/15"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -185,9 +200,9 @@
"level": 1 "level": 1
}, },
{ {
"self_ref": "#/texts/5", "self_ref": "#/texts/6",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -197,7 +212,7 @@
"text": "To get started with swimming, first lay down in a water and try not to drown:" "text": "To get started with swimming, first lay down in a water and try not to drown:"
}, },
{ {
"self_ref": "#/texts/6", "self_ref": "#/texts/7",
"parent": { "parent": {
"$ref": "#/groups/0" "$ref": "#/groups/0"
}, },
@ -211,7 +226,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/7", "self_ref": "#/texts/8",
"parent": { "parent": {
"$ref": "#/groups/0" "$ref": "#/groups/0"
}, },
@ -225,7 +240,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/8", "self_ref": "#/texts/9",
"parent": { "parent": {
"$ref": "#/groups/0" "$ref": "#/groups/0"
}, },
@ -239,9 +254,9 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/9", "self_ref": "#/texts/10",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -251,7 +266,7 @@
"text": "Also, don\u2019t forget:" "text": "Also, don\u2019t forget:"
}, },
{ {
"self_ref": "#/texts/10", "self_ref": "#/texts/11",
"parent": { "parent": {
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
@ -265,7 +280,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/11", "self_ref": "#/texts/12",
"parent": { "parent": {
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
@ -279,7 +294,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/12", "self_ref": "#/texts/13",
"parent": { "parent": {
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
@ -293,9 +308,9 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/13", "self_ref": "#/texts/14",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -305,29 +320,29 @@
"text": "Hmm, what else\u2026" "text": "Hmm, what else\u2026"
}, },
{ {
"self_ref": "#/texts/14", "self_ref": "#/texts/15",
"parent": { "parent": {
"$ref": "#/texts/4" "$ref": "#/texts/5"
}, },
"children": [ "children": [
{
"$ref": "#/texts/15"
},
{ {
"$ref": "#/texts/16" "$ref": "#/texts/16"
}, },
{ {
"$ref": "#/texts/17" "$ref": "#/texts/17"
}, },
{
"$ref": "#/tables/0"
},
{ {
"$ref": "#/texts/18" "$ref": "#/texts/18"
}, },
{
"$ref": "#/tables/0"
},
{ {
"$ref": "#/texts/19" "$ref": "#/texts/19"
}, },
{
"$ref": "#/texts/20"
},
{ {
"$ref": "#/groups/2" "$ref": "#/groups/2"
} }
@ -340,9 +355,9 @@
"level": 2 "level": 2
}, },
{ {
"self_ref": "#/texts/15", "self_ref": "#/texts/16",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -352,9 +367,9 @@
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice" "text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
}, },
{ {
"self_ref": "#/texts/16", "self_ref": "#/texts/17",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -364,9 +379,9 @@
"text": "I like to eat leaves" "text": "I like to eat leaves"
}, },
{ {
"self_ref": "#/texts/17", "self_ref": "#/texts/18",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -376,9 +391,9 @@
"text": "Here are some interesting things a respectful duck could eat:" "text": "Here are some interesting things a respectful duck could eat:"
}, },
{ {
"self_ref": "#/texts/18", "self_ref": "#/texts/19",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -388,9 +403,9 @@
"text": "" "text": ""
}, },
{ {
"self_ref": "#/texts/19", "self_ref": "#/texts/20",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -400,7 +415,7 @@
"text": "And let\u2019s add another list in the end:" "text": "And let\u2019s add another list in the end:"
}, },
{ {
"self_ref": "#/texts/20", "self_ref": "#/texts/21",
"parent": { "parent": {
"$ref": "#/groups/2" "$ref": "#/groups/2"
}, },
@ -414,7 +429,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/21", "self_ref": "#/texts/22",
"parent": { "parent": {
"$ref": "#/groups/2" "$ref": "#/groups/2"
}, },
@ -428,7 +443,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/22", "self_ref": "#/texts/23",
"parent": { "parent": {
"$ref": "#/groups/2" "$ref": "#/groups/2"
}, },
@ -471,7 +486,7 @@
{ {
"self_ref": "#/tables/0", "self_ref": "#/tables/0",
"parent": { "parent": {
"$ref": "#/texts/14" "$ref": "#/texts/15"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",