feat: add textbox content extraction in msword_backend

Signed-off-by: Andrew <tsai247365@gmail.com>
This commit is contained in:
Andrew 2025-05-06 17:35:55 +08:00
parent 80832d9f39
commit fc5a9492a3

View File

@ -2,7 +2,7 @@ import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Any, Optional, Set, Union
from typing import Any, List, Optional, Union
from docling_core.types.doc import (
DocItemLabel,
@ -59,9 +59,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.numbered_headers: dict[int, int] = {}
self.equation_bookends: str = "<eq>{EQ}</eq>"
# Track processed textbox elements to avoid duplication
self.processed_textbox_elements: Set[int] = set()
self.processed_textbox_elements: List[int] = []
# Track content hash of processed paragraphs to avoid duplicate content
self.processed_paragraph_content: Set[str] = set()
self.processed_paragraph_content: List[str] = []
for i in range(-1, self.max_levels):
self.parents[i] = None
@ -185,8 +185,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"w10": "urn:schemas-microsoft-com:office:word",
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
"v": "urn:schemas-microsoft-com:vml",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"w10": "urn:schemas-microsoft-com:office:word",
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
}
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)
# Check for textbox content - check multiple textbox formats
@ -237,10 +244,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if textbox_elements:
# Mark the parent element as processed
self.processed_textbox_elements.add(element_id)
self.processed_textbox_elements.append(element_id)
# Also mark all found textbox elements as processed
for tb_element in textbox_elements:
self.processed_textbox_elements.add(id(tb_element))
self.processed_textbox_elements.append(id(tb_element))
_log.debug(
f"Found textbox content with {len(textbox_elements)} elements"
@ -423,6 +430,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return paragraph_elements
def _get_paragraph_position(self, paragraph_element):
"""Extract vertical position information from paragraph element."""
# First try to directly get the index from w:p element that has an order-related attribute
if (
hasattr(paragraph_element, "getparent")
and paragraph_element.getparent() is not None
):
parent = paragraph_element.getparent()
# Get all paragraph siblings
paragraphs = [
p for p in parent.getchildren() if etree.QName(p).localname == "p"
]
# Find index of current paragraph within its siblings
try:
paragraph_index = paragraphs.index(paragraph_element)
return paragraph_index # Use index as position for consistent ordering
except ValueError:
pass
# Look for position hints in element attributes and ancestor elements
for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
# Check for direct position attributes
for attr_name in ["y", "top", "positionY", "y-position", "position"]:
value = elem.get(attr_name)
if value:
try:
# Remove any non-numeric characters (like 'pt', 'px', etc.)
clean_value = re.sub(r"[^0-9.]", "", value)
if clean_value:
return float(clean_value)
except (ValueError, TypeError):
pass
# Check for position in transform attribute
transform = elem.get("transform")
if transform:
# Extract translation component from transform matrix
match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
if match:
try:
return float(match.group(1))
except ValueError:
pass
# Check for anchors or relative position indicators in Word format
# 'dist' attributes can indicate relative positioning
for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
if elem.get(attr_name) is not None:
return elem.sourceline # Use the XML source line number as fallback
# For VML shapes, look for specific attributes
for ns_uri in paragraph_element.nsmap.values():
if "vml" in ns_uri:
# Try to extract position from style attribute
style = paragraph_element.get("style")
if style:
match = re.search(r"top:([0-9.]+)pt", style)
if match:
try:
return float(match.group(1))
except ValueError:
pass
# If no better position indicator found, use XML source line number as proxy for order
return (
paragraph_element.sourceline
if hasattr(paragraph_element, "sourceline")
else None
)
def _collect_textbox_paragraphs(self, textbox_elements):
"""Collect and organize paragraphs from textbox elements."""
processed_paragraphs = []
container_paragraphs = {}
for element in textbox_elements:
element_id = id(element)
# Skip if we've already processed this exact element
if element_id in processed_paragraphs:
continue
tag_name = etree.QName(element).localname
processed_paragraphs.append(element_id)
# Handle paragraphs directly found (VML textboxes)
if tag_name == "p":
# Find the containing textbox or shape element
container_id = None
for ancestor in element.iterancestors():
if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
container_id = id(ancestor)
break
if container_id not in container_paragraphs:
container_paragraphs[container_id] = []
container_paragraphs[container_id].append(
(element, self._get_paragraph_position(element))
)
# Handle txbxContent elements (Word DrawingML textboxes)
elif tag_name == "txbxContent":
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
container_id = id(element)
if container_id not in container_paragraphs:
container_paragraphs[container_id] = []
for p in paragraphs:
p_id = id(p)
if p_id not in processed_paragraphs:
processed_paragraphs.append(p_id)
container_paragraphs[container_id].append(
(p, self._get_paragraph_position(p))
)
else:
# Try to extract any paragraphs from unknown elements
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
container_id = id(element)
if container_id not in container_paragraphs:
container_paragraphs[container_id] = []
for p in paragraphs:
p_id = id(p)
if p_id not in processed_paragraphs:
processed_paragraphs.append(p_id)
container_paragraphs[container_id].append(
(p, self._get_paragraph_position(p))
)
return container_paragraphs
def _handle_textbox_content(
    self,
    textbox_elements: list,
    docx_obj: DocxDocument,
    doc: DoclingDocument,
) -> None:
    """Add the text found in textbox elements to ``doc`` under a new group.

    A ``SECTION`` group named ``"textbox"`` is created beneath
    ``self.parents[level - 1]`` and temporarily installed as
    ``self.parents[level]`` so the textbox paragraphs are nested under it.
    Paragraphs are processed container by container, ordered within each
    container by the position computed in ``_collect_textbox_paragraphs``
    (paragraphs without a position come last). The previous value of
    ``self.parents[level]`` is restored afterwards, even if handling a
    paragraph raises.

    Args:
        textbox_elements: Elements that were identified as textbox content.
        docx_obj: The document object passed through to
            ``_handle_text_elements``.
        doc: The document being built; receives the new group and text.
    """
    level = self._get_level()
    textbox_group = doc.add_group(
        label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
    )

    original_parent = self.parents[level]
    self.parents[level] = textbox_group
    try:
        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)

        # Only the grouped paragraphs matter here; the container keys are
        # not needed.
        for paragraphs in container_paragraphs.values():
            ordered = sorted(
                paragraphs,
                # Entries with no position sort after all positioned ones.
                key=lambda item: (
                    item[1] is None,
                    item[1] if item[1] is not None else float("inf"),
                ),
            )
            for paragraph, _ in ordered:
                self._handle_text_elements(
                    paragraph, docx_obj, doc, is_from_textbox=True
                )
    finally:
        # Never leave the textbox group installed as the current parent.
        self.parents[level] = original_parent
def _handle_equations_in_text(self, element, text):
only_texts = []
only_equations = []
@ -492,9 +675,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
docx_obj: DocxDocument,
doc: DoclingDocument,
is_from_textbox: bool = False,
is_from_textbox: bool = False,
) -> None:
paragraph = Paragraph(element, docx_obj)
# Skip if from a textbox and this exact paragraph content was already processed
# Skip if from a textbox and this exact paragraph content was already processed
raw_text = paragraph.text
if is_from_textbox and raw_text:
@ -503,7 +688,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if content_hash in self.processed_paragraph_content:
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
return
self.processed_paragraph_content.add(content_hash)
self.processed_paragraph_content.append(content_hash)
text, equations = self._handle_equations_in_text(element=element, text=raw_text)