feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>
This commit is contained in:
Andrew 2025-05-14 14:59:38 +08:00
parent 498fc79392
commit 8d9fbff1a7
2 changed files with 21 additions and 0 deletions

Binary file not shown.

View File

@@ -16,6 +16,27 @@ from .verify_utils import verify_document, verify_export
# NOTE(review): presumably toggles regeneration of reference test data;
# confirm against the definition of GEN_TEST_DATA.
GENERATE = GEN_TEST_DATA
def test_textbox_extraction():
    """Check that text placed inside a DOCX textbox is extracted.

    Converts ``tests/data/docx/textbox.docx`` with ``MsWordDocumentBackend``
    and asserts that at least one item of the converted document has text
    starting with a heading that is expected to come from a textbox.
    """
    expected_prefix = "Suggested Reportable Symptoms:"

    in_path = Path("tests/data/docx/textbox.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Verify that the particular textbox content is extracted.
    # ``getattr`` guards against items that do not expose ``text``
    # (assumed to be possible for non-text items such as tables or
    # pictures -- TODO confirm); such items are skipped instead of
    # aborting the test with an AttributeError.
    # ``startswith`` avoids a slice length that has to be kept in sync
    # with the length of the expected string.
    textbox_found = any(
        (getattr(item, "text", None) or "").startswith(expected_prefix)
        for item, _ in doc.iterate_items()
    )
    assert textbox_found, (
        f"no extracted item starts with {expected_prefix!r}"
    )
def test_heading_levels(): def test_heading_levels():
in_path = Path("tests/data/docx/word_sample.docx") in_path = Path("tests/data/docx/word_sample.docx")
in_doc = InputDocument( in_doc = InputDocument(