fix(docx): slow table parsing (#2553)

* chore(docx): remove unnecessary import
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* fix(docx): simplify parsing of simple tables
  Simplify the parsing of tables with just text (no rich cells). Move nested function group_cell_elements out of _handle_tables for readability.
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* chore(docx): reuse method for finding inline pictures
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* chore(docx): format strikethrough text
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* tests(docx): use fixtures to avoid converting same file multiple times
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* fix(docx): remove unnecessary argument docx_obj in functions
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* tests(docx): add test for rich table cells
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* chore(docx): small improvements in backend and its unit tests
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* chore(docx): parse superscript and subscript formatted text
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---------
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-11-06 05:25:53 +01:00
parent 0ba8d5d9e3
commit ef623ffcee
6 changed files with 3366 additions and 218 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Final, Optional, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -17,9 +17,9 @@ from docling_core.types.doc import (
    RichTableCell,
    TableCell,
    TableData,
-    TextItem,
+    TableItem,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, Script
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -36,7 +36,6 @@ from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.docx.drawingml.utils import (
    get_docx_to_pdf_converter,
-    get_libreoffice_cmd,
    get_pil_from_dml_docx,
 )
 from docling.backend.docx.latex.omml import oMath2Latex
@@ -47,6 +46,18 @@ _log = logging.getLogger(__name__)


 class MsWordDocumentBackend(DeclarativeDocumentBackend):
+    _BLIP_NAMESPACES: Final = {
+        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+        "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
+        "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
+        "v": "urn:schemas-microsoft-com:vml",
+        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
+        "w10": "urn:schemas-microsoft-com:office:word",
+        "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
+    }
+
    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@@ -58,6 +69,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.xml_namespaces = {
            "w": "http://schemas.microsoft.com/office/word/2003/wordml"
        }
+        self.blip_xpath_expr = etree.XPath(
+            ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+        )
        # self.initialise(path_or_stream)
        # Word file:
        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
@@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
-            doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
-            # doc, _ = doc_info
+            doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
+
            return doc
        else:
            raise RuntimeError(
@@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _walk_linear(
        self,
        body: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
        # parent:
    ) -> tuple[DoclingDocument, list[RefItem]]:
@@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for element in body:
            tag_name = etree.QName(element).localname
            # Check for Inline Images (blip elements)
-            namespaces = {
-                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
-                "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
-                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
-                "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
-                "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
-                "v": "urn:schemas-microsoft-com:vml",
-                "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
-                "w10": "urn:schemas-microsoft-com:office:word",
-                "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
-            }
-            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
-            drawing_blip = xpath_expr(element)
-            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
+            drawing_blip = self.blip_xpath_expr(element)
+            drawingml_els = element.findall(
+                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+            )

            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if element_id not in self.processed_textbox_elements:
                # Modern Word textboxes
                txbx_xpath = etree.XPath(
-                    ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+                    ".//w:txbxContent|.//v:textbox//w:p",
+                    namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                )
                textbox_elements = txbx_xpath(element)

@@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    # Additional checks for textboxes in DrawingML and VML formats
                    alt_txbx_xpath = etree.XPath(
                        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
-                        namespaces=namespaces,
+                        namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                    )
                    textbox_elements = alt_txbx_xpath(element)

@@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    if not textbox_elements:
                        shape_text_xpath = etree.XPath(
                            ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
-                            namespaces=namespaces,
+                            namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                        )
                        shape_text_elements = shape_text_xpath(element)
                        if shape_text_elements:
@@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    _log.debug(
                        f"Found textbox content with {len(textbox_elements)} elements"
                    )
-                    tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
+                    tbc = self._handle_textbox_content(textbox_elements, doc)
                    added_elements.extend(tbc)

            # Check for Tables
-            if element.tag.endswith("tbl"):
+            if tag_name == "tbl":
                try:
-                    t = self._handle_tables(element, docx_obj, doc)
+                    t = self._handle_tables(element, doc)
                    added_elements.extend(t)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
            # Check for Image
            elif drawing_blip:
-                pics = self._handle_pictures(docx_obj, drawing_blip, doc)
+                pics = self._handle_pictures(drawing_blip, doc)
                added_elements.extend(pics)
                # Check for Text after the Image
                if (
-                    tag_name in ["p"]
-                    and element.find(".//w:t", namespaces=namespaces) is not None
+                    tag_name == "p"
+                    and element.find(
+                        ".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                    )
+                    is not None
                ):
-                    te1 = self._handle_text_elements(element, docx_obj, doc)
+                    te1 = self._handle_text_elements(element, doc)
                    added_elements.extend(te1)
            # Check for DrawingML elements
            elif drawingml_els:
@@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                else:
                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
-            elif tag_name in ["sdt"]:
-                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
+            elif tag_name == "sdt":
+                sdt_content = element.find(
+                    ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                )
                if sdt_content is not None:
                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
-                    paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
+                    paragraphs = sdt_content.findall(
+                        ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                    )
                    for p in paragraphs:
-                        te = self._handle_text_elements(p, docx_obj, doc)
+                        te = self._handle_text_elements(p, doc)
                        added_elements.extend(te)
            # Check for Text
-            elif tag_name in ["p"]:
+            elif tag_name == "p":
                # "tcPr", "sectPr"
-                te = self._handle_text_elements(element, docx_obj, doc)
+                te = self._handle_text_elements(element, doc)
                added_elements.extend(te)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for key in keys_to_reset:
            self.list_counters[key] = 0

-    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+    def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
        """Check if a list is numbered based on its numFmt value."""
        try:
            # Access the numbering part of the document
-            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+            if not hasattr(self.docx_obj, "part") or not hasattr(
+                self.docx_obj.part, "package"
+            ):
                return False

            numbering_part = None
            # Find the numbering part
-            for part in docx_obj.part.package.parts:
+            for part in self.docx_obj.part.package.parts:
                if "numbering" in part.partname:
                    numbering_part = part
                    break
@@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
        # The .bold and .italic properties are booleans, but .underline can be an enum
        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
-        has_bold = run.bold or False
-        has_italic = run.italic or False
+        is_bold = run.bold or False
+        is_italic = run.italic or False
+        is_strikethrough = run.font.strike or False
        # Convert any non-None underline value to True
-        has_underline = bool(run.underline is not None and run.underline)
+        is_underline = bool(run.underline is not None and run.underline)
+        is_sub = run.font.subscript or False
+        is_sup = run.font.superscript or False
+        script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE

        return Formatting(
-            bold=has_bold,
-            italic=has_italic,
-            underline=has_underline,
+            bold=is_bold,
+            italic=is_italic,
+            underline=is_underline,
+            strikethrough=is_strikethrough,
+            script=script,
        )

    def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_textbox_content(
        self,
        textbox_elements: list,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
@@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # Process all the paragraphs
        for p, position in all_paragraphs:
            # Create paragraph object to get text content
-            paragraph = Paragraph(p, docx_obj)
+            paragraph = Paragraph(p, self.docx_obj)
            text_content = paragraph.text

            # Create a unique identifier based on content and position
@@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Mark this paragraph as processed
            processed_paragraphs.add(paragraph_id)

-            elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
+            elem_ref.extend(self._handle_text_elements(p, doc))

        # Restore original parent
        self.parents[level] = original_parent
@@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_text_elements(
        self,
        element: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        paragraph = Paragraph(element, docx_obj)
+        paragraph = Paragraph(element, self.docx_obj)
        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
            element=element, text=paragraph.text
@@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            and p_style_id not in ["Title", "Heading"]
        ):
            # Check if this is actually a numbered list by examining the numFmt
-            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
+            is_numbered = self._is_numbered_list(numid, ilevel)

            li = self._add_list_item(
                doc=doc,
@@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            )
        return elem_ref

+    @staticmethod
+    def _group_cell_elements(
+        group_name: str,
+        doc: DoclingDocument,
+        provs_in_cell: list[RefItem],
+        docling_table: TableItem,
+    ) -> RefItem:
+        group_element = doc.add_group(
+            label=GroupLabel.UNSPECIFIED,
+            name=group_name,
+            parent=docling_table,
+        )
+        for prov in provs_in_cell:
+            group_element.children.append(prov)
+            pr_item = prov.resolve(doc)
+            item_parent = pr_item.parent.resolve(doc)
+            if pr_item.get_ref() in item_parent.children:
+                item_parent.children.remove(pr_item.get_ref())
+            pr_item.parent = group_element.get_ref()
+        ref_for_rich_cell = group_element.get_ref()
+        return ref_for_rich_cell
+
    def _handle_tables(
        self,
        element: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        table: Table = Table(element, docx_obj)
+        table: Table = Table(element, self.docx_obj)
        num_rows = len(table.rows)
        num_cols = len(table.columns)
        _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
@@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            cell_element = table.rows[0].cells[0]
            # In case we have a table of only 1 cell, we consider it furniture
            # And proceed processing the content of the cell as though it's in the document body
-            self._walk_linear(cell_element._element, docx_obj, doc)
+            self._walk_linear(cell_element._element, doc)
            return elem_ref

        data = TableData(num_rows=num_rows, num_cols=num_cols)
@@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    text = text.replace("<eq>", "$").replace("</eq>", "$")

                provs_in_cell: list[RefItem] = []
-                _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
-                ref_for_rich_cell = provs_in_cell[0]
-                rich_table_cell = False
+                rich_table_cell: bool = self._is_rich_table_cell(cell)

-                def group_cell_elements(
-                    group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
-                ) -> RefItem:
-                    group_element = doc.add_group(
-                        label=GroupLabel.UNSPECIFIED,
-                        name=group_name,
-                        parent=docling_table,
-                    )
-                    for prov in provs_in_cell:
-                        group_element.children.append(prov)
-                        pr_item = prov.resolve(doc)
-                        item_parent = pr_item.parent.resolve(doc)
-                        if pr_item.get_ref() in item_parent.children:
-                            item_parent.children.remove(pr_item.get_ref())
-                        pr_item.parent = group_element.get_ref()
-                    ref_for_rich_cell = group_element.get_ref()
-                    return ref_for_rich_cell
+                if rich_table_cell:
+                    _, provs_in_cell = self._walk_linear(cell._element, doc)
+                _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")

-                if len(provs_in_cell) > 1:
+                if len(provs_in_cell) > 0:
                    # Cell has multiple elements, we need to group them
                    rich_table_cell = True
                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
-                    ref_for_rich_cell = group_cell_elements(
-                        group_name, doc, provs_in_cell
+                    ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
+                        group_name, doc, provs_in_cell, docling_table
                    )

-                elif len(provs_in_cell) == 1:
-                    item_ref = provs_in_cell[0]
-                    pr_item = item_ref.resolve(doc)
-                    if isinstance(pr_item, TextItem):
-                        # Cell has only one element and it's just a text
-                        rich_table_cell = False
-                        doc.delete_items(node_items=[pr_item])
-                    else:
-                        rich_table_cell = True
-                        group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
-                        ref_for_rich_cell = group_cell_elements(
-                            group_name, doc, provs_in_cell
-                        )
-                else:
-                    rich_table_cell = False
-
                if rich_table_cell:
                    rich_cell = RichTableCell(
                        text=text,
@@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    col_idx += cell.grid_span
        return elem_ref

+    def _is_rich_table_cell(self, cell: _Cell) -> bool:
+        """Determine whether a docx cell should be parsed as a Docling RichTableCell.
+
+        A docx cell can hold rich content and be parsed with a Docling RichTableCell.
+        However, this requires walking through the lxml elements and creating
+        node items. If the cell holds only plain text, the parsing is simpler
+        and using a plain TableCell is preferred.
+
+        Plain text means:
+        - The cell has at most one paragraph
+        - The runs of that paragraph carry only default formatting
+          (no need of Docling formatting).
+        - No other block-level elements are present inside the cell element.
+
+        Args:
+            cell: A docx cell
+
+        Returns:
+            Whether the docx cell should be parsed as RichTableCell
+        """
+        tc = cell._tc
+
+        # must contain at most one paragraph
+        paragraphs = list(
+            tc.iterchildren(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p"
+            )
+        )
+        if len(paragraphs) > 1:
+            return True
+
+        # no other content
+        allowed_tags = {"p", "tcPr"}  # paragraph or table-cell properties
+        for child in tc:
+            tag = child.tag.split("}")[-1]
+            if tag not in allowed_tags:
+                return True
+        for elem in tc:
+            if self.blip_xpath_expr(elem):
+                return True
+            if elem.findall(
+                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+            ):
+                return True
+
+        # paragraph runs must carry only default formatting
+        for para in paragraphs:
+            runs = list(
+                para.iterchildren(
+                    "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
+                )
+            )
+            for rn in runs:
+                item: Run = Run(rn, self.docx_obj)
+                if item is not None:
+                    fm = MsWordDocumentBackend._get_format_from_run(item)
+                    if fm != Formatting():
+                        return True
+
+        # All checks passed: plain text only
+        return False
+
    def _handle_pictures(
-        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
+        self, drawing_blip: Any, doc: DoclingDocument
    ) -> list[RefItem]:
        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
            )
-            if rId in docx_obj.part.rels:
+            if rId in self.docx_obj.part.rels:
                # Access the image part using the relationship ID
-                image_part = docx_obj.part.rels[rId].target_part
+                image_part = self.docx_obj.part.rels[rId].target_part
                image_data = image_part.blob  # Get the binary image data
            return image_data