diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 6cfa0860..ee43caa9 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -127,6 +127,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None + doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) return doc else: @@ -256,6 +257,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) diff --git a/tests/data/docx/paragraph_in_image.docx b/tests/data/docx/paragraph_in_image.docx new file mode 100644 index 00000000..8795e408 Binary files /dev/null and b/tests/data/docx/paragraph_in_image.docx differ diff --git a/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.itxt b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.itxt new file mode 100644 index 00000000..4dd6a246 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.itxt @@ -0,0 +1,9 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Transkript + item-2 at level 1: paragraph: 5. März 2025, 01:35PM + item-3 at level 1: paragraph: + item-4 at level 1: picture + item-5 at level 1: inline: group group + item-6 at level 2: paragraph: User + item-7 at level 2: paragraph: 0:08 +Ein beispielhafter Paragraph. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.json b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.json new file mode 100644 index 00000000..a365244a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.json @@ -0,0 +1,162 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "paragraph_in_image", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 15839552996279065250, + "filename": "paragraph_in_image.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Transkript", + "text": "Transkript", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "5. M\u00e4rz 2025, 01:35PM", + "text": "5. M\u00e4rz 2025, 01:35PM" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "User", + "text": "User", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:08\nEin beispielhafter Paragraph.", + "text": "0:08\nEin beispielhafter Paragraph." + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "" + }, + "annotations": [] + } + ], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.md b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.md new file mode 100644 index 00000000..034f3bfd --- /dev/null +++ b/tests/data/groundtruth/docling_v2/paragraph_in_image.docx.md @@ -0,0 +1,8 @@ +**Transkript** + +5. März 2025, 01:35PM + + + +**User** 0:08 +Ein beispielhafter Paragraph. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt index 220b5533..346093c2 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt @@ -2,7 +2,10 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: paragraph: Test with three images in unusual formats item-2 at level 1: paragraph: Raster in emf: item-3 at level 1: picture - item-4 at level 1: paragraph: Vector in emf: - item-5 at level 1: picture - item-6 at level 1: paragraph: Raster in webp: - item-7 at level 1: picture \ No newline at end of file + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Vector in emf: + item-6 at level 1: picture + item-7 at level 1: paragraph: + item-8 at level 1: paragraph: Raster in webp: + item-9 at level 1: picture + item-10 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json index bb8807f6..98f2abc1 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json @@ -29,14 +29,23 @@ { "$ref": "#/texts/2" }, - { - "$ref": "#/pictures/1" - }, { "$ref": "#/texts/3" }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, { "$ref": "#/pictures/2" + }, + { + "$ref": "#/texts/6" } ], "content_layer": "body", @@ -78,8 +87,8 @@ "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Vector in emf:", - "text": "Vector in emf:" + "orig": "", + "text": "" }, { "self_ref": "#/texts/3", @@ -90,8 +99,44 @@ "content_layer": "body", "label": "paragraph", "prov": [], + "orig": "Vector in emf:", + "text": "Vector in emf:" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], "orig": "Raster in webp:", "text": "Raster in webp:" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt index ce60ad26..b4d98b44 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt @@ -3,27 +3,28 @@ item-0 at level 0: unspecified: group _root_ item-2 at level 1: title: Swimming in the lake item-3 at level 2: paragraph: Duck item-4 at level 2: picture - item-5 at level 2: paragraph: Figure 1: This is a cute duckling - item-6 at level 2: section_header: Let’s swim! - item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: - item-8 at level 3: list: group list - item-9 at level 4: list_item: You can relax and look around - item-10 at level 4: list_item: Paddle about - item-11 at level 4: list_item: Enjoy summer warmth - item-12 at level 3: paragraph: Also, don’t forget: - item-13 at level 3: list: group list - item-14 at level 4: list_item: Wear sunglasses - item-15 at level 4: list_item: Don’t forget to drink water - item-16 at level 4: list_item: Use sun cream - item-17 at level 3: paragraph: Hmm, what else… - item-18 at level 3: section_header: Let’s eat - item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice - item-20 at level 4: paragraph: I like to eat leaves - item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: - item-22 at level 4: table with [4x3] - item-23 at level 4: paragraph: - item-24 at level 4: paragraph: And let’s add another list in the end: - item-25 at level 4: list: group list - item-26 at level 5: list_item: Leaves - item-27 at level 5: list_item: Berries - item-28 at level 5: list_item: Grain \ No newline at end of file + item-5 at level 2: paragraph: + item-6 at level 2: paragraph: Figure 1: This is a cute duckling + item-7 at level 2: section_header: Let’s swim! + item-8 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: + item-9 at level 3: list: group list + item-10 at level 4: list_item: You can relax and look around + item-11 at level 4: list_item: Paddle about + item-12 at level 4: list_item: Enjoy summer warmth + item-13 at level 3: paragraph: Also, don’t forget: + item-14 at level 3: list: group list + item-15 at level 4: list_item: Wear sunglasses + item-16 at level 4: list_item: Don’t forget to drink water + item-17 at level 4: list_item: Use sun cream + item-18 at level 3: paragraph: Hmm, what else… + item-19 at level 3: section_header: Let’s eat + item-20 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice + item-21 at level 4: paragraph: I like to eat leaves + item-22 at level 4: paragraph: Here are some interesting things a respectful duck could eat: + item-23 at level 4: table with [4x3] + item-24 at level 4: paragraph: + item-25 at level 4: paragraph: And let’s add another list in the end: + item-26 at level 4: list: group list + item-27 at level 5: list_item: Leaves + item-28 at level 5: list_item: Berries + item-29 at level 5: list_item: Grain \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 1d305cbc..355ac741 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -32,17 +32,17 @@ { "self_ref": "#/groups/0", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/6" - }, { "$ref": "#/texts/7" }, { "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -52,17 +52,17 @@ { "self_ref": "#/groups/1", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/10" - }, { "$ref": "#/texts/11" }, { "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" } ], "content_layer": "body", @@ -72,17 +72,17 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [ - { - "$ref": "#/texts/20" - }, { "$ref": "#/texts/21" }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" } ], "content_layer": "body", @@ -120,6 +120,9 @@ }, { "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -149,32 +152,44 @@ "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Figure 1: This is a cute duckling", - "text": "Figure 1: This is a cute duckling" + "orig": "", + "text": "" }, { "self_ref": "#/texts/4", "parent": { "$ref": "#/texts/1" }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Figure 1: This is a cute duckling", + "text": "Figure 1: This is a cute duckling" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/1" + }, "children": [ { - "$ref": "#/texts/5" + "$ref": "#/texts/6" }, { "$ref": "#/groups/0" }, { - "$ref": "#/texts/9" + "$ref": "#/texts/10" }, { "$ref": "#/groups/1" }, { - "$ref": "#/texts/13" + "$ref": "#/texts/14" }, { - "$ref": "#/texts/14" + "$ref": "#/texts/15" } ], "content_layer": "body", @@ -185,9 +200,9 @@ "level": 1 }, { - "self_ref": "#/texts/5", + "self_ref": "#/texts/6", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -197,7 +212,7 @@ "text": "To get started with swimming, first lay down in a water and try not to drown:" }, { - "self_ref": "#/texts/6", + "self_ref": "#/texts/7", "parent": { "$ref": "#/groups/0" }, @@ -211,7 +226,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/8", "parent": { "$ref": "#/groups/0" }, @@ -225,7 +240,7 @@ "marker": "-" }, { - "self_ref": "#/texts/8", + "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/0" }, @@ -239,9 +254,9 @@ "marker": "-" }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/10", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -251,7 +266,7 @@ "text": "Also, don\u2019t forget:" }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/11", "parent": { "$ref": "#/groups/1" }, @@ -265,7 +280,7 @@ "marker": "-" }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/12", "parent": { "$ref": "#/groups/1" }, @@ -279,7 +294,7 @@ "marker": "-" }, { - "self_ref": "#/texts/12", + "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/1" }, @@ -293,9 +308,9 @@ "marker": "-" }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/14", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -305,29 +320,29 @@ "text": "Hmm, what else\u2026" }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/15", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/15" - }, { "$ref": "#/texts/16" }, { "$ref": "#/texts/17" }, - { - "$ref": "#/tables/0" - }, { "$ref": "#/texts/18" }, + { + "$ref": "#/tables/0" + }, { "$ref": "#/texts/19" }, + { + "$ref": "#/texts/20" + }, { "$ref": "#/groups/2" } @@ -340,9 +355,9 @@ "level": 2 }, { - "self_ref": "#/texts/15", + "self_ref": "#/texts/16", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -352,9 +367,9 @@ "text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice" }, { - "self_ref": "#/texts/16", + "self_ref": "#/texts/17", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -364,9 +379,9 @@ "text": "I like to eat leaves" }, { - "self_ref": "#/texts/17", + "self_ref": "#/texts/18", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -376,9 +391,9 @@ "text": "Here are some interesting things a respectful duck could eat:" }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/19", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -388,9 +403,9 @@ "text": "" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/20", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -400,7 +415,7 @@ "text": "And let\u2019s add another list in the end:" }, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/21", "parent": { "$ref": "#/groups/2" }, @@ -414,7 +429,7 @@ "marker": "-" }, { - "self_ref": "#/texts/21", + "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/2" }, @@ -428,7 +443,7 @@ "marker": "-" }, { - "self_ref": "#/texts/22", + "self_ref": "#/texts/23", "parent": { "$ref": "#/groups/2" }, @@ -471,7 +486,7 @@ { "self_ref": "#/tables/0", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body",