mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425
Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
parent
af4aaa28af
commit
fffa865014
BIN
tests/data/docx/word_image_anchors.docx
Normal file
BIN
tests/data/docx/word_image_anchors.docx
Normal file
Binary file not shown.
@@ -7,6 +7,7 @@ from docling.datamodel.document import (
|
|||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
|
TextItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
@@ -115,3 +116,42 @@ def test_e2e_docx_conversions():
|
|||||||
gtfile=str(gt_path) + ".html",
|
gtfile=str(gt_path) + ".html",
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
), "export to html"
|
), "export to html"
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_after_image_anchors():
    """
    Check that text following image anchors is parsed from a DOCX file.

    Converts ``tests/data/docx/word_image_anchors.docx`` with
    ``MsWordDocumentBackend`` and asserts that every expected string occurs
    as the exact ``text`` of at least one ``TextItem`` in the converted
    document. On failure, the assertion message lists the strings that were
    not found.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Exact item texts that must each appear at least once in the output.
    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }

    # Only TextItem instances are considered, matching on their full text.
    found_texts = {
        item.text
        for item, _ in doc.iterate_items()
        if isinstance(item, TextItem)
    }

    # A set difference pinpoints which expected strings are absent, rather
    # than collapsing everything into one opaque boolean.
    missing_texts = expected_texts - found_texts
    assert not missing_texts, (
        f"Text after image anchors was not parsed: {sorted(missing_texts)}"
    )
|
||||||
|
Loading…
Reference in New Issue
Block a user