diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 18b0418c..4d4026e3 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -2,23 +2,28 @@ import logging import re from io import BytesIO from pathlib import Path -from typing import Optional, Set, Union +from typing import Any, Optional, Union -import docx from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, + NodeItem, TableCell, TableData, ) +from docx import Document +from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc +from docx.oxml.xmlchemy import BaseOxmlElement from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -28,7 +33,10 @@ _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + @override + def __init__( + self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + ) -> None: super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -38,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): } # self.initialise(path_or_stream) # Word file: - self.path_or_stream = path_or_stream - self.valid = False + self.path_or_stream: Union[BytesIO, Path] = path_or_stream + self.valid: bool = False # Initialise the parents for the hierarchy - self.max_levels = 10 - self.level_at_new_list = None - self.parents = {} # type: ignore + self.max_levels: int = 10 + self.level_at_new_list: Optional[int] = None + self.parents: dict[int, Optional[NodeItem]] = {} for i in range(-1, self.max_levels): self.parents[i] = None self.level = 0 self.listIter = 0 - self.history = { + self.history: dict[str, Any] = { "names": [None], "levels": [None], "numids": [None], @@ -60,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.docx_obj = None try: if isinstance(self.path_or_stream, BytesIO): - self.docx_obj = docx.Document(self.path_or_stream) + self.docx_obj = Document(self.path_or_stream) elif isinstance(self.path_or_stream, Path): - self.docx_obj = docx.Document(str(self.path_or_stream)) + self.docx_obj = Document(str(self.path_or_stream)) self.valid = True except Exception as e: @@ -70,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" ) from e + @override def is_valid(self) -> bool: return self.valid @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -84,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.DOCX} + @override def convert(self) -> DoclingDocument: - # Parses the DOCX into a structured document model. + """Parses the DOCX into a structured document model. + + Returns: + The parsed document. + """ origin = DocumentOrigin( filename=self.file.name or "file", @@ -106,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) - def update_history(self, name, level, numid, ilevel): + def update_history( + self, + name: str, + level: Optional[int], + numid: Optional[int], + ilevel: Optional[int], + ): self.history["names"].append(name) self.history["levels"].append(level) self.history["numids"].append(numid) self.history["indents"].append(ilevel) - def prev_name(self): + def prev_name(self) -> Optional[str]: return self.history["names"][-1] - def prev_level(self): + def prev_level(self) -> Optional[int]: return self.history["levels"][-1] - def prev_numid(self): + def prev_numid(self) -> Optional[int]: return self.history["numids"][-1] - def prev_indent(self): + def prev_indent(self) -> Optional[int]: return self.history["indents"][-1] def get_level(self) -> int: @@ -132,7 +155,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return k return 0 - def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: + def walk_linear( + self, + body: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) @@ -152,7 +180,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug("could not parse a table, broken docx table") elif drawing_blip: - self.handle_pictures(element, docx_obj, drawing_blip, doc) + self.handle_pictures(docx_obj, drawing_blip, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -169,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc - def str_to_int(self, s, default=0): + def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: if s is None: return None try: @@ -177,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): except ValueError: return default - def split_text_and_number(self, input_string): + def split_text_and_number(self, input_string: str) -> list[str]: match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) if match: parts = list(filter(None, match.groups())) @@ -185,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return [input_string] - def get_numId_and_ilvl(self, paragraph): + def get_numId_and_ilvl( + self, paragraph: Paragraph + ) -> tuple[Optional[int], Optional[int]]: # Access the XML element of the paragraph numPr = paragraph._element.find( ".//w:numPr", namespaces=paragraph._element.nsmap @@ -198,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None - return self.str_to_int(numId, default=None), self.str_to_int( - ilvl, default=None - ) + return self.str_to_int(numId, None), self.str_to_int(ilvl, None) return None, None # If the paragraph is not part of a list - def get_label_and_level(self, paragraph): + def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None label = paragraph.style.style_id @@ -220,20 +248,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if "Heading" in label and len(parts) == 2: parts.sort() - label_str = "" - label_level = 0 + label_str: str = "" + label_level: Optional[int] = 0 if parts[0] == "Heading": label_str = parts[0] - label_level = self.str_to_int(parts[1], default=None) + label_level = self.str_to_int(parts[1], None) if parts[1] == "Heading": label_str = parts[1] - label_level = self.str_to_int(parts[0], default=None) + label_level = self.str_to_int(parts[0], None) return label_str, label_level else: return label, None - def handle_text_elements(self, element, docx_obj, doc): - paragraph = docx.text.paragraph.Paragraph(element, docx_obj) + def handle_text_elements( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: + paragraph = Paragraph(element, docx_obj) if paragraph.text is None: return @@ -257,11 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and p_style_id not in ["Title", "Heading"] ): self.add_listitem( - element, - docx_obj, doc, - p_style_id, - p_level, numid, ilevel, text, @@ -286,13 +315,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level = 0 if p_style_id in ["Title"]: - for key, val in self.parents.items(): + for key in range(len(self.parents)): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) elif "Heading" in p_style_id: - self.add_header(element, docx_obj, doc, p_style_id, p_level, text) + self.add_header(doc, p_level, text) elif p_style_id in [ "Paragraph", @@ -320,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.update_history(p_style_id, p_level, numid, ilevel) return - def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): + def add_header( + self, doc: DoclingDocument, curr_level: Optional[int], text: str + ) -> None: level = self.get_level() if isinstance(curr_level, int): if curr_level > level: @@ -333,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) elif curr_level < level: # remove the tail - for key, val in self.parents.items(): + for key in range(len(self.parents)): if key >= curr_level: self.parents[key] = None @@ -352,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def add_listitem( self, - element, - docx_obj, - doc, - p_style_id, - p_level, - numid, - ilevel, + doc: DoclingDocument, + numid: int, + ilevel: int, text: str, - is_numbered=False, - ): - # is_numbered = is_numbered + is_numbered: bool = False, + ) -> None: enum_marker = "" level = self.get_level() + prev_indent = self.prev_indent() if self.prev_numid() is None: # Open new list - self.level_at_new_list = level # type: ignore + self.level_at_new_list = level self.parents[level] = doc.add_group( label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] @@ -386,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) elif ( - self.prev_numid() == numid and self.prev_indent() < ilevel + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and prev_indent < ilevel ): # Open indented list for i in range( - self.level_at_new_list + self.prev_indent() + 1, + self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): # Determine if this is an unordered list or an ordered list. @@ -418,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, ) - elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list + elif ( + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and ilevel < prev_indent + ): # Close list for k, v in self.parents.items(): if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -436,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) self.listIter = 0 - elif self.prev_numid() == numid or self.prev_indent() == ilevel: + elif self.prev_numid() == numid or prev_indent == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: @@ -450,7 +485,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return - def handle_tables(self, element, docx_obj, doc): + def handle_tables( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: table: Table = Table(element, docx_obj) num_rows = len(table.rows) num_cols = len(table.columns) @@ -509,8 +549,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[level - 1]) return - def handle_pictures(self, element, docx_obj, drawing_blip, doc): - def get_docx_image(element, drawing_blip): + def handle_pictures( + self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument + ) -> None: + def get_docx_image(drawing_blip): rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) @@ -523,7 +565,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self.get_level() # Open the BytesIO object with PIL to create an Image try: - image_data = get_docx_image(element, drawing_blip) + image_data = get_docx_image(drawing_blip) image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) doc.add_picture(