diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 44a0f2cf..8386082a 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -14,7 +14,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsWordDocumentBackend could not load document with hash {self.document_hash}" ) from e @override @@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") - + # Check for Image elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + # Check for Text after the Image + if ( + tag_name in ["p"] + and element.find(".//w:t", namespaces=namespaces) is not None + ): + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") + return doc def _str_to_int( @@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): all_paragraphs = [] # Sort paragraphs within each container, then process containers - for container_id, paragraphs in container_paragraphs.items(): + for paragraphs in container_paragraphs.values(): # Sort by vertical position within each container sorted_container_paragraphs = sorted( paragraphs, @@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, ) -> None: paragraph = Paragraph(element, docx_obj) - + paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text ) if text is None: return - paragraph_elements = self._get_paragraph_elements(paragraph) text = text.strip() # Common styles for bullet and numbered lists. @@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return + def _add_formatted_list_item( + self, + doc: DoclingDocument, + elements: list, + marker: str, + enumerated: bool, + level: int, + ) -> None: + # This should not happen by construction + if not isinstance(self.parents[level], (OrderedList, UnorderedList)): + return + if len(elements) == 1: + text, format, hyperlink = elements[0] + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) + else: + new_item = doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text="", + ) + new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + for text, format, hyperlink in elements: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) + def _add_list_item( self, *, @@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elements: list, is_numbered: bool = False, ) -> None: + # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed. + if not elements: + return None enum_marker = "" level = self._get_level() @@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) - elif ( self._prev_numid() == numid and self.level_at_new_list is not None @@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) elif ( self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent ): # Close list - for k, v in self.parents.items(): + for k in self.parents: if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) self.listIter = 0 elif self._prev_numid() == numid or prev_indent == ilevel: @@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level - 1], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level - 1 ) - for text, format, hyperlink in elements: - # Add the list item to the parent group - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + return def _handle_tables( diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 00000000..c0b030c3 Binary files /dev/null and b/tests/data/docx/word_image_anchors.docx differ diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index 2860c30b..fccb44c6 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_ item-16 at level 2: list_item: Italic bullet 1 item-17 at level 2: list_item: Bold bullet 2 item-18 at level 2: list_item: Underline bullet 3 - item-19 at level 2: inline: group group - item-20 at level 3: list_item: Some - item-21 at level 3: list_item: italic - item-22 at level 3: list_item: bold - item-23 at level 3: list_item: underline - item-24 at level 2: list: group list - item-25 at level 3: inline: group group - item-26 at level 4: list_item: Nested - item-27 at level 4: list_item: italic - item-28 at level 4: list_item: bold - item-29 at level 1: paragraph: \ No newline at end of file + item-19 at level 2: list_item: + item-20 at level 3: inline: group group + item-21 at level 4: text: Some + item-22 at level 4: text: italic + item-23 at level 4: text: bold + item-24 at level 4: text: underline + item-25 at level 2: list: group list + item-26 at level 3: list_item: + item-27 at level 4: inline: group group + item-28 at level 5: text: Nested + item-29 at level 5: text: italic + item-30 at level 5: text: bold + item-31 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index 8b6ee9db..967aff11 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -42,7 +42,7 @@ "$ref": "#/groups/1" }, { - "$ref": "#/texts/23" + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -98,7 +98,7 @@ "$ref": "#/texts/15" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/16" }, { "$ref": "#/groups/3" @@ -111,12 +111,9 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/groups/1" + "$ref": "#/texts/16" }, "children": [ - { - "$ref": "#/texts/16" - }, { "$ref": "#/texts/17" }, @@ -125,6 +122,9 @@ }, { "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -138,7 +138,7 @@ }, "children": [ { - "$ref": "#/groups/4" + "$ref": "#/texts/21" } ], "content_layer": "body", @@ -148,17 +148,17 @@ { "self_ref": "#/groups/4", "parent": { - "$ref": "#/groups/3" + "$ref": "#/texts/21" }, "children": [ - { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" - }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" } ], "content_layer": "body", @@ -461,20 +461,18 @@ { "self_ref": "#/texts/16", "parent": { - "$ref": "#/groups/2" + "$ref": "#/groups/1" }, - "children": [], + "children": [ + { + "$ref": "#/groups/2" + } + ], "content_layer": "body", "label": "list_item", "prov": [], - "orig": "Some", - "text": "Some", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -485,18 +483,16 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], - "orig": "italic", - "text": "italic", + "orig": "Some", + "text": "Some", "formatting": { "bold": false, - "italic": true, + "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { "self_ref": "#/texts/18", @@ -505,67 +501,7 @@ }, "children": [], "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "bold", - "text": "bold", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "underline", - "text": "underline", - "formatting": { - "bold": false, - "italic": false, - "underline": true, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "Nested", - "text": "Nested", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], "orig": "italic", "text": "italic", @@ -574,7 +510,59 @@ "italic": true, "underline": false, "strikethrough": false + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -585,7 +573,43 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", + "prov": [], + "orig": "Nested", + "text": "Nested", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [], "orig": "bold", "text": "bold", @@ -594,12 +618,10 @@ "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { - "self_ref": "#/texts/23", + "self_ref": "#/texts/25", "parent": { "$ref": "#/body" }, diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt new file mode 100644 index 00000000..ebc5cebf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt @@ -0,0 +1,16 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Transcript + item-2 at level 1: paragraph: February 20, 2025, 8:32PM + item-3 at level 1: picture + item-4 at level 1: inline: group group + item-5 at level 2: paragraph: This is test 1 + item-6 at level 2: paragraph: 0:08 +Correct, he is not. + item-7 at level 1: paragraph: + item-8 at level 1: picture + item-9 at level 1: inline: group group + item-10 at level 2: paragraph: This is test 2 + item-11 at level 2: paragraph: 0:16 +Yeah, exactly. + item-12 at level 1: paragraph: + item-13 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json new file mode 100644 index 00000000..b5433eb1 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json @@ -0,0 +1,286 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "word_image_anchors", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 2428692234257307633, + "filename": "word_image_anchors.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Transcript", + "text": "Transcript", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "February 20, 2025, 8:32PM", + "text": "February 20, 2025, 8:32PM", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 1", + "text": "This is test 1", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:08\nCorrect, he is not.", + "text": "0:08\nCorrect, he is not.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 2", + "text": "This is test 2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:16\nYeah, exactly.", + "text": "0:16\nYeah, exactly.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "" + }, + "annotations": [] + }, + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "" + }, + "annotations": [] + } + ], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md new file mode 100644 index 00000000..a3ce2fc7 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md @@ -0,0 +1,13 @@ +**Transcript** + +February 20, 2025, 8:32PM + + + +**This is test 1** 0:08 +Correct, he is not. + + + +**This is test 2** 0:16 +Yeah, exactly. \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f37b4874..9da0ea25 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -9,6 +9,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - "export to md" + f"export to markdown failed on {docx_path}" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - "export to indented-text" + f"export to indented-text failed on {docx_path}" ) assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - "document document" + f"DoclingDocument verification failed on {docx_path}" ) if docx_path.name == "word_tables.docx": @@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_text=pred_html, gtfile=str(gt_path) + ".html", generate=GENERATE, - ), "export to html" + ), f"export to html failed on {docx_path}" flaky_path = Path("tests/data/docx/textbox.docx") @@ -131,3 +132,42 @@ def test_e2e_docx_conversions(): @pytest.mark.xfail(strict=False) def test_textbox_conversion(): _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + )