chore: add type hinting to docx backend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-01-31 18:30:00 +01:00
parent 40145b59b3
commit e0f89029db

View File

@ -2,23 +2,28 @@ import logging
import re import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Optional, Set, Union from typing import Any, Optional, Union
import docx
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupLabel, GroupLabel,
ImageRef, ImageRef,
NodeItem,
TableCell, TableCell,
TableData, TableData,
) )
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc from docx.oxml.table import CT_Tc
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.table import Table, _Cell from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from lxml import etree from lxml import etree
from lxml.etree import XPath from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -28,7 +33,10 @@ _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend): class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): @override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
self.XML_KEY = ( self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@ -38,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
} }
# self.initialise(path_or_stream) # self.initialise(path_or_stream)
# Word file: # Word file:
self.path_or_stream = path_or_stream self.path_or_stream: Union[BytesIO, Path] = path_or_stream
self.valid = False self.valid: bool = False
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels: int = 10
self.level_at_new_list = None self.level_at_new_list: Optional[int] = None
self.parents = {} # type: ignore self.parents: dict[int, Optional[NodeItem]] = {}
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
self.level = 0 self.level = 0
self.listIter = 0 self.listIter = 0
self.history = { self.history: dict[str, Any] = {
"names": [None], "names": [None],
"levels": [None], "levels": [None],
"numids": [None], "numids": [None],
@ -60,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.docx_obj = None self.docx_obj = None
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
self.docx_obj = docx.Document(self.path_or_stream) self.docx_obj = Document(self.path_or_stream)
elif isinstance(self.path_or_stream, Path): elif isinstance(self.path_or_stream, Path):
self.docx_obj = docx.Document(str(self.path_or_stream)) self.docx_obj = Document(str(self.path_or_stream))
self.valid = True self.valid = True
except Exception as e: except Exception as e:
@ -70,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
@override
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@classmethod @classmethod
@override
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
@override
def unload(self): def unload(self):
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close() self.path_or_stream.close()
@ -84,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None self.path_or_stream = None
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: @override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.DOCX} return {InputFormat.DOCX}
@override
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. """Parses the DOCX into a structured document model.
Returns:
The parsed document.
"""
origin = DocumentOrigin( origin = DocumentOrigin(
filename=self.file.name or "file", filename=self.file.name or "file",
@ -106,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f"Cannot convert doc with {self.document_hash} because the backend failed to init." f"Cannot convert doc with {self.document_hash} because the backend failed to init."
) )
def update_history(self, name, level, numid, ilevel): def update_history(
self,
name: str,
level: Optional[int],
numid: Optional[int],
ilevel: Optional[int],
):
self.history["names"].append(name) self.history["names"].append(name)
self.history["levels"].append(level) self.history["levels"].append(level)
self.history["numids"].append(numid) self.history["numids"].append(numid)
self.history["indents"].append(ilevel) self.history["indents"].append(ilevel)
def prev_name(self): def prev_name(self) -> Optional[str]:
return self.history["names"][-1] return self.history["names"][-1]
def prev_level(self): def prev_level(self) -> Optional[int]:
return self.history["levels"][-1] return self.history["levels"][-1]
def prev_numid(self): def prev_numid(self) -> Optional[int]:
return self.history["numids"][-1] return self.history["numids"][-1]
def prev_indent(self): def prev_indent(self) -> Optional[int]:
return self.history["indents"][-1] return self.history["indents"][-1]
def get_level(self) -> int: def get_level(self) -> int:
@ -132,7 +155,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return k return k
return 0 return 0
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: def walk_linear(
self,
body: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> DoclingDocument:
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements) # Check for Inline Images (blip elements)
@ -152,7 +180,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug("could not parse a table, broken docx table") _log.debug("could not parse a table, broken docx table")
elif drawing_blip: elif drawing_blip:
self.handle_pictures(element, docx_obj, drawing_blip, doc) self.handle_pictures(docx_obj, drawing_blip, doc)
# Check for the sdt containers, like table of contents # Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]: elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -169,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}") _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc return doc
def str_to_int(self, s, default=0): def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
if s is None: if s is None:
return None return None
try: try:
@ -177,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError: except ValueError:
return default return default
def split_text_and_number(self, input_string): def split_text_and_number(self, input_string: str) -> list[str]:
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match: if match:
parts = list(filter(None, match.groups())) parts = list(filter(None, match.groups()))
@ -185,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else: else:
return [input_string] return [input_string]
def get_numId_and_ilvl(self, paragraph): def get_numId_and_ilvl(
self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]:
# Access the XML element of the paragraph # Access the XML element of the paragraph
numPr = paragraph._element.find( numPr = paragraph._element.find(
".//w:numPr", namespaces=paragraph._element.nsmap ".//w:numPr", namespaces=paragraph._element.nsmap
@ -198,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, default=None), self.str_to_int( return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
ilvl, default=None
)
return None, None # If the paragraph is not part of a list return None, None # If the paragraph is not part of a list
def get_label_and_level(self, paragraph): def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
if paragraph.style is None: if paragraph.style is None:
return "Normal", None return "Normal", None
label = paragraph.style.style_id label = paragraph.style.style_id
@ -220,20 +248,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if "Heading" in label and len(parts) == 2: if "Heading" in label and len(parts) == 2:
parts.sort() parts.sort()
label_str = "" label_str: str = ""
label_level = 0 label_level: Optional[int] = 0
if parts[0] == "Heading": if parts[0] == "Heading":
label_str = parts[0] label_str = parts[0]
label_level = self.str_to_int(parts[1], default=None) label_level = self.str_to_int(parts[1], None)
if parts[1] == "Heading": if parts[1] == "Heading":
label_str = parts[1] label_str = parts[1]
label_level = self.str_to_int(parts[0], default=None) label_level = self.str_to_int(parts[0], None)
return label_str, label_level return label_str, label_level
else: else:
return label, None return label, None
def handle_text_elements(self, element, docx_obj, doc): def handle_text_elements(
paragraph = docx.text.paragraph.Paragraph(element, docx_obj) self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
if paragraph.text is None: if paragraph.text is None:
return return
@ -257,11 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and p_style_id not in ["Title", "Heading"] and p_style_id not in ["Title", "Heading"]
): ):
self.add_listitem( self.add_listitem(
element,
docx_obj,
doc, doc,
p_style_id,
p_level,
numid, numid,
ilevel, ilevel,
text, text,
@ -286,13 +315,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level = 0 self.level = 0
if p_style_id in ["Title"]: if p_style_id in ["Title"]:
for key, val in self.parents.items(): for key in range(len(self.parents)):
self.parents[key] = None self.parents[key] = None
self.parents[0] = doc.add_text( self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text parent=None, label=DocItemLabel.TITLE, text=text
) )
elif "Heading" in p_style_id: elif "Heading" in p_style_id:
self.add_header(element, docx_obj, doc, p_style_id, p_level, text) self.add_header(doc, p_level, text)
elif p_style_id in [ elif p_style_id in [
"Paragraph", "Paragraph",
@ -320,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.update_history(p_style_id, p_level, numid, ilevel) self.update_history(p_style_id, p_level, numid, ilevel)
return return
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): def add_header(
self, doc: DoclingDocument, curr_level: Optional[int], text: str
) -> None:
level = self.get_level() level = self.get_level()
if isinstance(curr_level, int): if isinstance(curr_level, int):
if curr_level > level: if curr_level > level:
@ -333,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
elif curr_level < level: elif curr_level < level:
# remove the tail # remove the tail
for key, val in self.parents.items(): for key in range(len(self.parents)):
if key >= curr_level: if key >= curr_level:
self.parents[key] = None self.parents[key] = None
@ -352,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def add_listitem( def add_listitem(
self, self,
element, doc: DoclingDocument,
docx_obj, numid: int,
doc, ilevel: int,
p_style_id,
p_level,
numid,
ilevel,
text: str, text: str,
is_numbered=False, is_numbered: bool = False,
): ) -> None:
# is_numbered = is_numbered
enum_marker = "" enum_marker = ""
level = self.get_level() level = self.get_level()
prev_indent = self.prev_indent()
if self.prev_numid() is None: # Open new list if self.prev_numid() is None: # Open new list
self.level_at_new_list = level # type: ignore self.level_at_new_list = level
self.parents[level] = doc.add_group( self.parents[level] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
@ -386,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
elif ( elif (
self.prev_numid() == numid and self.prev_indent() < ilevel self.prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and prev_indent < ilevel
): # Open indented list ): # Open indented list
for i in range( for i in range(
self.level_at_new_list + self.prev_indent() + 1, self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1, self.level_at_new_list + ilevel + 1,
): ):
# Determine if this is an unordered list or an ordered list. # Determine if this is an unordered list or an ordered list.
@ -418,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text, text=text,
) )
elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list elif (
self.prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items(): for k, v in self.parents.items():
if k > self.level_at_new_list + ilevel: if k > self.level_at_new_list + ilevel:
self.parents[k] = None self.parents[k] = None
@ -436,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
self.listIter = 0 self.listIter = 0
elif self.prev_numid() == numid or self.prev_indent() == ilevel: elif self.prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1 self.listIter += 1
if is_numbered: if is_numbered:
@ -450,7 +485,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
return return
def handle_tables(self, element, docx_obj, doc): def handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
table: Table = Table(element, docx_obj) table: Table = Table(element, docx_obj)
num_rows = len(table.rows) num_rows = len(table.rows)
num_cols = len(table.columns) num_cols = len(table.columns)
@ -509,8 +549,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[level - 1]) doc.add_table(data=data, parent=self.parents[level - 1])
return return
def handle_pictures(self, element, docx_obj, drawing_blip, doc): def handle_pictures(
def get_docx_image(element, drawing_blip): self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
def get_docx_image(drawing_blip):
rId = drawing_blip[0].get( rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
) )
@ -523,7 +565,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level = self.get_level() level = self.get_level()
# Open the BytesIO object with PIL to create an Image # Open the BytesIO object with PIL to create an Image
try: try:
image_data = get_docx_image(element, drawing_blip) image_data = get_docx_image(drawing_blip)
image_bytes = BytesIO(image_data) image_bytes = BytesIO(image_data)
pil_image = Image.open(image_bytes) pil_image = Image.open(image_bytes)
doc.add_picture( doc.add_picture(