mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425
Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
parent
af4aaa28af
commit
fffa865014
BIN
tests/data/docx/word_image_anchors.docx
Normal file
BIN
tests/data/docx/word_image_anchors.docx
Normal file
Binary file not shown.
@ -7,6 +7,7 @@ from docling.datamodel.document import (
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
@ -115,3 +116,42 @@ def test_e2e_docx_conversions():
|
||||
gtfile=str(gt_path) + ".html",
|
||||
generate=GENERATE,
|
||||
), "export to html"
|
||||
|
||||
|
||||
def test_text_after_image_anchors():
    """Check that text following image anchors in a DOCX file is parsed.

    Converts ``tests/data/docx/word_image_anchors.docx`` with the MS Word
    backend and verifies that every expected string shows up as the text of
    some ``TextItem`` in the converted document.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Exact item texts that must be present in the converted document.
    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }
    parsed_texts = {
        item.text for item, _ in doc.iterate_items() if isinstance(item, TextItem)
    }

    # Report the missing strings so a failure points at the affected anchor
    # instead of a bare "assert False".
    missing = sorted(expected_texts - parsed_texts)
    assert not missing, f"text after image anchors was not parsed: {missing}"
|
||||
|
Loading…
Reference in New Issue
Block a user