feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>
This commit is contained in:
Andrew 2025-05-06 17:35:55 +08:00
parent f1658edbad
commit 80832d9f39

View File

@ -2,7 +2,7 @@ import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Any, Optional, Union
from typing import Any, Optional, Set, Union
from docling_core.types.doc import (
DocItemLabel,
@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl
from typing_extensions import override
@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {}
self.equation_bookends: str = "<eq>{EQ}</eq>"
# Track processed textbox elements to avoid duplication
self.processed_textbox_elements: Set[int] = set()
# Track content hash of processed paragraphs to avoid duplicate content
self.processed_paragraph_content: Set[str] = set()
for i in range(-1, self.max_levels):
self.parents[i] = None
@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
"v": "urn:schemas-microsoft-com:vml",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"w10": "urn:schemas-microsoft-com:office:word",
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
}
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)
# Check for textbox content - check multiple textbox formats
# Only process if the element hasn't been processed before
element_id = id(element)
if element_id not in self.processed_textbox_elements:
# Modern Word textboxes
txbx_xpath = etree.XPath(
".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
)
textbox_elements = txbx_xpath(element)
# No modern textboxes found, check for alternate/legacy textbox formats
if not textbox_elements and tag_name in ["drawing", "pict"]:
# Additional checks for textboxes in DrawingML and VML formats
alt_txbx_xpath = etree.XPath(
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
namespaces=namespaces,
)
textbox_elements = alt_txbx_xpath(element)
# Check for shape text that's not in a standard textbox
if not textbox_elements:
shape_text_xpath = etree.XPath(
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
namespaces=namespaces,
)
shape_text_elements = shape_text_xpath(element)
if shape_text_elements:
# Create custom text elements from shape text
text_content = " ".join(
[t.text for t in shape_text_elements if t.text]
)
if text_content.strip():
_log.debug(f"Found shape text: {text_content[:50]}...")
# Create a paragraph-like element to process with standard handler
level = self._get_level()
shape_group = doc.add_group(
label=GroupLabel.SECTION,
parent=self.parents[level - 1],
name="shape-text",
)
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=shape_group,
text=text_content,
)
if textbox_elements:
# Mark the parent element as processed
self.processed_textbox_elements.add(element_id)
# Also mark all found textbox elements as processed
for tb_element in textbox_elements:
self.processed_textbox_elements.add(id(tb_element))
_log.debug(
f"Found textbox content with {len(textbox_elements)} elements"
)
self._handle_textbox_content(textbox_elements, docx_obj, doc)
# Check for Tables
if element.tag.endswith("tbl"):
try:
@ -423,10 +491,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
is_from_textbox: bool = False,
) -> None:
paragraph = Paragraph(element, docx_obj)
# Skip if from a textbox and this exact paragraph content was already processed
raw_text = paragraph.text
if is_from_textbox and raw_text:
# Create a simple hash of content to detect duplicates
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
if content_hash in self.processed_paragraph_content:
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
return
self.processed_paragraph_content.add(content_hash)
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
if text is None:
@ -886,3 +964,64 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
caption=None,
)
return
def _handle_textbox_content(
self,
textbox_elements: list,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
"""Process textbox content and add it to the document structure."""
level = self._get_level()
# Create a textbox group to contain all text from the textbox
textbox_group = doc.add_group(
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
)
# Set this as the current parent to ensure textbox content
# is properly nested in document structure
original_parent = self.parents[level]
self.parents[level] = textbox_group
# Track processed paragraphs within this textbox to avoid duplication
processed_paragraphs = set()
# Process all elements within the textbox
for element in textbox_elements:
element_id = id(element)
# Skip if we've already processed this exact element
if element_id in processed_paragraphs:
continue
tag_name = etree.QName(element).localname
processed_paragraphs.add(element_id)
# Handle paragraphs directly found (VML textboxes)
if tag_name == "p":
# Direct paragraph from v:textbox
self._handle_text_elements(element, docx_obj, doc, is_from_textbox=True)
# Handle txbxContent elements (Word DrawingML textboxes)
elif tag_name == "txbxContent":
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
for p in paragraphs:
p_id = id(p)
if p_id not in processed_paragraphs:
processed_paragraphs.add(p_id)
self._handle_text_elements(
p, docx_obj, doc, is_from_textbox=True
)
else:
# Try to extract any paragraphs from unknown elements
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
for p in paragraphs:
p_id = id(p)
if p_id not in processed_paragraphs:
processed_paragraphs.add(p_id)
self._handle_text_elements(
p, docx_obj, doc, is_from_textbox=True
)
# Restore original parent
self.parents[level] = original_parent
return