test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
Michael Krissgau 2025-05-22 19:02:59 +02:00
parent af4aaa28af
commit fffa865014
2 changed files with 40 additions and 0 deletions

Binary file not shown.

View File

@ -7,6 +7,7 @@ from docling.datamodel.document import (
DoclingDocument, DoclingDocument,
InputDocument, InputDocument,
SectionHeaderItem, SectionHeaderItem,
TextItem,
) )
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -115,3 +116,42 @@ def test_e2e_docx_conversions():
gtfile=str(gt_path) + ".html", gtfile=str(gt_path) + ".html",
generate=GENERATE, generate=GENERATE,
), "export to html" ), "export to html"
def test_text_after_image_anchors():
    """Check that text following image anchors is parsed by the DOCX backend.

    Converts ``tests/data/docx/word_image_anchors.docx`` with
    ``MsWordDocumentBackend`` and requires that each expected string is the
    exact ``text`` of at least one ``TextItem`` in the converted document.

    Raises:
        AssertionError: if any expected string is absent; the message lists
            the missing strings.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Strings that must each match a TextItem's text exactly.
    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }
    found_texts = {
        item.text
        for item, _ in doc.iterate_items()
        if isinstance(item, TextItem)
    }

    # Sorted so the failure message is stable from run to run.
    missing = sorted(expected_texts - found_texts)
    assert not missing, f"text after image anchors not found: {missing!r}"