test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
2025-12-11 14:18:30 +00:00 · 2025-05-22 19:02:59 +02:00
parent af4aaa28af
commit fffa865014
2 changed files with 40 additions and 0 deletions
--- a/tests/data/docx/word_image_anchors.docx
+++ b/tests/data/docx/word_image_anchors.docx
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -7,6 +7,7 @@ from docling.datamodel.document import (
    DoclingDocument,
    InputDocument,
    SectionHeaderItem,
    TextItem,
 )
 from docling.document_converter import DocumentConverter
@@ -115,3 +116,42 @@ def test_e2e_docx_conversions():
                gtfile=str(gt_path) + ".html",
                generate=GENERATE,
            ), "export to html"
 def test_text_after_image_anchors():
    """Check that text following an image / image anchor is still parsed.

    Converts ``tests/data/docx/word_image_anchors.docx`` with
    ``MsWordDocumentBackend`` and verifies that each expected string shows up
    as the exact ``text`` of at least one ``TextItem`` in the converted
    document. On failure the assertion message lists the missing strings.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Texts that must each match a TextItem's text exactly.
    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }

    # Only TextItem instances are considered; other item types are ignored.
    parsed_texts = {
        item.text
        for item, _ in doc.iterate_items()
        if isinstance(item, TextItem)
    }

    # Report exactly which expected texts were not found instead of a bare
    # boolean failure.
    missing_texts = expected_texts - parsed_texts
    assert not missing_texts, (
        f"text after image anchors was not parsed: {sorted(missing_texts)!r}"
    )