diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 00000000..c0b030c3 Binary files /dev/null and b/tests/data/docx/word_image_anchors.docx differ diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 3c1500ef..8cd66b39 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -7,6 +7,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -115,3 +116,42 @@ def test_e2e_docx_conversions(): gtfile=str(gt_path) + ".html", generate=GENERATE, ), "export to html" + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + )