feat: add textbox content extraction in msword_backend (#1538)

* feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>

* feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>

* feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>

---------

Signed-off-by: Andrew <tsai247365@gmail.com>
This commit is contained in:
AndrewTsai0406
2025-05-19 21:01:36 +08:00
committed by GitHub
parent 7c4c356e76
commit 12a0e64892
3 changed files with 290 additions and 12 deletions

Binary file not shown.

View File

@@ -16,6 +16,27 @@ from .verify_utils import verify_document, verify_export
# Module-level alias of GEN_TEST_DATA (defined outside this view).
# NOTE(review): presumably toggles regeneration of reference test data — confirm.
GENERATE = GEN_TEST_DATA
def test_textbox_extraction():
    """Check that textbox content survives DOCX conversion.

    Converts ``tests/data/docx/textbox.docx`` with ``MsWordDocumentBackend``
    and asserts that at least one item of the resulting document has text
    starting with a known string taken from a textbox.
    """
    in_path = Path("tests/data/docx/textbox.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # Verify that a particular textbox content is extracted.
    expected_prefix = "Suggested Reportable Symptoms:"
    # Read ``text`` defensively: an item without that attribute (or with an
    # empty/None value) simply does not match instead of aborting the test.
    textbox_found = any(
        (getattr(item, "text", None) or "").startswith(expected_prefix)
        for item, _ in doc.iterate_items()
    )
    assert textbox_found, f"no item text starts with {expected_prefix!r}"
def test_heading_levels():
in_path = Path("tests/data/docx/word_sample.docx")
in_doc = InputDocument(