fix(docx): slow table parsing (#2553)

* chore(docx): remove unnecessary import Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(docx): simplify parsing of simple tables Simplify the parsing of tables with just text (no rich cells). Move nested function group_cell_elements out of _handle_tables for readability. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): reuse method for finding inline pictures Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): format strikethrough text Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(docx): use fixtures to avoid converting same file multiple times Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(docx): remove unnecessary argument docx_obj in functions Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(docx): add test for rich table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): small improvements in backend and its unit tests Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): parse superscript and subscript formatted text Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-11-06 05:25:53 +01:00
parent 0ba8d5d9e3
commit ef623ffcee
6 changed files with 3366 additions and 218 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Final, Optional, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -17,9 +17,9 @@ from docling_core.types.doc import (
    RichTableCell,
    TableCell,
    TableData,
-    TextItem,
+    TableItem,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, Script
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -36,7 +36,6 @@ from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.docx.drawingml.utils import (
    get_docx_to_pdf_converter,
-    get_libreoffice_cmd,
    get_pil_from_dml_docx,
 )
 from docling.backend.docx.latex.omml import oMath2Latex
@@ -47,6 +46,18 @@ _log = logging.getLogger(__name__)


 class MsWordDocumentBackend(DeclarativeDocumentBackend):
+    _BLIP_NAMESPACES: Final = {
+        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+        "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
+        "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
+        "v": "urn:schemas-microsoft-com:vml",
+        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
+        "w10": "urn:schemas-microsoft-com:office:word",
+        "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
+    }
+
    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@@ -58,6 +69,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.xml_namespaces = {
            "w": "http://schemas.microsoft.com/office/word/2003/wordml"
        }
+        self.blip_xpath_expr = etree.XPath(
+            ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+        )
        # self.initialise(path_or_stream)
        # Word file:
        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
@@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
-            doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
-            # doc, _ = doc_info
+            doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
+
            return doc
        else:
            raise RuntimeError(
@@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _walk_linear(
        self,
        body: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
        # parent:
    ) -> tuple[DoclingDocument, list[RefItem]]:
@@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for element in body:
            tag_name = etree.QName(element).localname
            # Check for Inline Images (blip elements)
-            namespaces = {
-                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
-                "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
-                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
-                "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
-                "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
-                "v": "urn:schemas-microsoft-com:vml",
-                "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
-                "w10": "urn:schemas-microsoft-com:office:word",
-                "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
-            }
-            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
-            drawing_blip = xpath_expr(element)
-            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
+            drawing_blip = self.blip_xpath_expr(element)
+            drawingml_els = element.findall(
+                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+            )

            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if element_id not in self.processed_textbox_elements:
                # Modern Word textboxes
                txbx_xpath = etree.XPath(
-                    ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+                    ".//w:txbxContent|.//v:textbox//w:p",
+                    namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                )
                textbox_elements = txbx_xpath(element)

@@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    # Additional checks for textboxes in DrawingML and VML formats
                    alt_txbx_xpath = etree.XPath(
                        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
-                        namespaces=namespaces,
+                        namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                    )
                    textbox_elements = alt_txbx_xpath(element)

@@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    if not textbox_elements:
                        shape_text_xpath = etree.XPath(
                            ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
-                            namespaces=namespaces,
+                            namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                        )
                        shape_text_elements = shape_text_xpath(element)
                        if shape_text_elements:
@@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    _log.debug(
                        f"Found textbox content with {len(textbox_elements)} elements"
                    )
-                    tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
+                    tbc = self._handle_textbox_content(textbox_elements, doc)
                    added_elements.extend(tbc)

            # Check for Tables
-            if element.tag.endswith("tbl"):
+            if tag_name == "tbl":
                try:
-                    t = self._handle_tables(element, docx_obj, doc)
+                    t = self._handle_tables(element, doc)
                    added_elements.extend(t)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
            # Check for Image
            elif drawing_blip:
-                pics = self._handle_pictures(docx_obj, drawing_blip, doc)
+                pics = self._handle_pictures(drawing_blip, doc)
                added_elements.extend(pics)
                # Check for Text after the Image
                if (
-                    tag_name in ["p"]
-                    and element.find(".//w:t", namespaces=namespaces) is not None
+                    tag_name == "p"
+                    and element.find(
+                        ".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                    )
+                    is not None
                ):
-                    te1 = self._handle_text_elements(element, docx_obj, doc)
+                    te1 = self._handle_text_elements(element, doc)
                    added_elements.extend(te1)
            # Check for DrawingML elements
            elif drawingml_els:
@@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                else:
                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
-            elif tag_name in ["sdt"]:
-                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
+            elif tag_name == "sdt":
+                sdt_content = element.find(
+                    ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                )
                if sdt_content is not None:
                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
-                    paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
+                    paragraphs = sdt_content.findall(
+                        ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+                    )
                    for p in paragraphs:
-                        te = self._handle_text_elements(p, docx_obj, doc)
+                        te = self._handle_text_elements(p, doc)
                        added_elements.extend(te)
            # Check for Text
-            elif tag_name in ["p"]:
+            elif tag_name == "p":
                # "tcPr", "sectPr"
-                te = self._handle_text_elements(element, docx_obj, doc)
+                te = self._handle_text_elements(element, doc)
                added_elements.extend(te)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for key in keys_to_reset:
            self.list_counters[key] = 0

-    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+    def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
        """Check if a list is numbered based on its numFmt value."""
        try:
            # Access the numbering part of the document
-            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+            if not hasattr(self.docx_obj, "part") or not hasattr(
+                self.docx_obj.part, "package"
+            ):
                return False

            numbering_part = None
            # Find the numbering part
-            for part in docx_obj.part.package.parts:
+            for part in self.docx_obj.part.package.parts:
                if "numbering" in part.partname:
                    numbering_part = part
                    break
@@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
        # The .bold and .italic properties are booleans, but .underline can be an enum
        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
-        has_bold = run.bold or False
-        has_italic = run.italic or False
+        is_bold = run.bold or False
+        is_italic = run.italic or False
+        is_strikethrough = run.font.strike or False
        # Convert any non-None underline value to True
-        has_underline = bool(run.underline is not None and run.underline)
+        is_underline = bool(run.underline is not None and run.underline)
+        is_sub = run.font.subscript or False
+        is_sup = run.font.superscript or False
+        script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE

        return Formatting(
-            bold=has_bold,
-            italic=has_italic,
-            underline=has_underline,
+            bold=is_bold,
+            italic=is_italic,
+            underline=is_underline,
+            strikethrough=is_strikethrough,
+            script=script,
        )

    def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_textbox_content(
        self,
        textbox_elements: list,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
@@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # Process all the paragraphs
        for p, position in all_paragraphs:
            # Create paragraph object to get text content
-            paragraph = Paragraph(p, docx_obj)
+            paragraph = Paragraph(p, self.docx_obj)
            text_content = paragraph.text

            # Create a unique identifier based on content and position
@@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Mark this paragraph as processed
            processed_paragraphs.add(paragraph_id)

-            elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
+            elem_ref.extend(self._handle_text_elements(p, doc))

        # Restore original parent
        self.parents[level] = original_parent
@@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_text_elements(
        self,
        element: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        paragraph = Paragraph(element, docx_obj)
+        paragraph = Paragraph(element, self.docx_obj)
        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
            element=element, text=paragraph.text
@@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            and p_style_id not in ["Title", "Heading"]
        ):
            # Check if this is actually a numbered list by examining the numFmt
-            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
+            is_numbered = self._is_numbered_list(numid, ilevel)

            li = self._add_list_item(
                doc=doc,
@@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            )
        return elem_ref

+    @staticmethod
+    def _group_cell_elements(
+        group_name: str,
+        doc: DoclingDocument,
+        provs_in_cell: list[RefItem],
+        docling_table: TableItem,
+    ) -> RefItem:
+        """Move the items of a table cell into a new group below the table.
+
+        Args:
+            group_name: Name given to the new group.
+            doc: Document holding the table and the referenced items.
+            provs_in_cell: References to the items created for the cell.
+            docling_table: Table item that becomes the parent of the new group.
+
+        Returns:
+            The reference to the new group.
+        """
+        cell_group = doc.add_group(
+            label=GroupLabel.UNSPECIFIED, name=group_name, parent=docling_table
+        )
+        for item_ref in provs_in_cell:
+            cell_group.children.append(item_ref)
+            node = item_ref.resolve(doc)
+            # detach the item from its former parent before re-parenting it
+            former_parent = node.parent.resolve(doc)
+            node_ref = node.get_ref()
+            if node_ref in former_parent.children:
+                former_parent.children.remove(node_ref)
+            node.parent = cell_group.get_ref()
+        return cell_group.get_ref()
+
    def _handle_tables(
        self,
        element: BaseOxmlElement,
-        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        table: Table = Table(element, docx_obj)
+        table: Table = Table(element, self.docx_obj)
        num_rows = len(table.rows)
        num_cols = len(table.columns)
        _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
@@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            cell_element = table.rows[0].cells[0]
            # In case we have a table of only 1 cell, we consider it furniture
            # And proceed processing the content of the cell as though it's in the document body
-            self._walk_linear(cell_element._element, docx_obj, doc)
+            self._walk_linear(cell_element._element, doc)
            return elem_ref

        data = TableData(num_rows=num_rows, num_cols=num_cols)
@@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    text = text.replace("<eq>", "$").replace("</eq>", "$")

                provs_in_cell: list[RefItem] = []
-                _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
-                ref_for_rich_cell = provs_in_cell[0]
-                rich_table_cell = False
+                rich_table_cell: bool = self._is_rich_table_cell(cell)

-                def group_cell_elements(
-                    group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
-                ) -> RefItem:
-                    group_element = doc.add_group(
-                        label=GroupLabel.UNSPECIFIED,
-                        name=group_name,
-                        parent=docling_table,
-                    )
-                    for prov in provs_in_cell:
-                        group_element.children.append(prov)
-                        pr_item = prov.resolve(doc)
-                        item_parent = pr_item.parent.resolve(doc)
-                        if pr_item.get_ref() in item_parent.children:
-                            item_parent.children.remove(pr_item.get_ref())
-                        pr_item.parent = group_element.get_ref()
-                    ref_for_rich_cell = group_element.get_ref()
-                    return ref_for_rich_cell
+                if rich_table_cell:
+                    _, provs_in_cell = self._walk_linear(cell._element, doc)
+                _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")

-                if len(provs_in_cell) > 1:
+                if len(provs_in_cell) > 0:
                    # Cell has multiple elements, we need to group them
                    rich_table_cell = True
                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
-                    ref_for_rich_cell = group_cell_elements(
-                        group_name, doc, provs_in_cell
+                    ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
+                        group_name, doc, provs_in_cell, docling_table
                    )

-                elif len(provs_in_cell) == 1:
-                    item_ref = provs_in_cell[0]
-                    pr_item = item_ref.resolve(doc)
-                    if isinstance(pr_item, TextItem):
-                        # Cell has only one element and it's just a text
-                        rich_table_cell = False
-                        doc.delete_items(node_items=[pr_item])
-                    else:
-                        rich_table_cell = True
-                        group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
-                        ref_for_rich_cell = group_cell_elements(
-                            group_name, doc, provs_in_cell
-                        )
-                else:
-                    rich_table_cell = False
-
                if rich_table_cell:
                    rich_cell = RichTableCell(
                        text=text,
@@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    col_idx += cell.grid_span
        return elem_ref

+    def _is_rich_table_cell(self, cell: _Cell) -> bool:
+        """Determine whether a docx cell should be parsed as a Docling RichTableCell.
+
+        A docx cell can hold rich content and be parsed with a Docling RichTableCell.
+        However, this requires walking through the lxml elements and creating
+        node items. If the cell holds only plain text, a TableCell, the parsing
+        is simpler and using a TableCell is preferred.
+
+        A cell is considered rich as soon as one of these checks succeeds:
+        - It has more than one paragraph as direct child.
+        - It has a direct child other than a paragraph or the table-cell
+          properties (another block-level element).
+        - It contains a picture (blip) or a drawing at any depth.
+        - A run that is a direct child of its paragraph yields a format, as computed
+          by `_get_format_from_run`, that differs from the default `Formatting()`.
+
+        Args:
+            cell: A docx cell
+
+        Returns:
+            Whether the docx cell should be parsed as RichTableCell
+        """
+        namespaces = MsWordDocumentBackend._BLIP_NAMESPACES
+        w_ns = namespaces["w"]
+        tc = cell._tc
+
+        # more than one paragraph: rich content
+        paragraphs = list(tc.iterchildren(f"{{{w_ns}}}p"))
+        if len(paragraphs) > 1:
+            return True
+
+        # any child other than a paragraph or the table-cell properties: rich content
+        allowed_tags = {"p", "tcPr"}
+        if any(etree.QName(child).localname not in allowed_tags for child in tc):
+            return True
+
+        # The children are now known to be paragraphs or cell properties, so a single
+        # descendant search from the cell finds what a search per child would find.
+        if self.blip_xpath_expr(tc):
+            return True
+        if tc.find(".//w:drawing", namespaces=namespaces) is not None:
+            return True
+
+        # runs must carry the default formatting; a Run object is never None, so the
+        # format can be compared right away against one shared default instance
+        plain_format = Formatting()
+        for para in paragraphs:
+            for rn in para.iterchildren(f"{{{w_ns}}}r"):
+                run_format = MsWordDocumentBackend._get_format_from_run(
+                    Run(rn, self.docx_obj)
+                )
+                if run_format != plain_format:
+                    return True
+
+        # All checks passed: plain text only
+        return False
+
    def _handle_pictures(
-        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
+        self, drawing_blip: Any, doc: DoclingDocument
    ) -> list[RefItem]:
        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
            )
-            if rId in docx_obj.part.rels:
+            if rId in self.docx_obj.part.rels:
                # Access the image part using the relationship ID
-                image_part = docx_obj.part.rels[rId].target_part
+                image_part = self.docx_obj.part.rels[rId].target_part
                image_data = image_part.blob  # Get the binary image data
            return image_data

--- a/tests/data/docx/docx_rich_cells.docx
+++ b/tests/data/docx/docx_rich_cells.docx
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt
@@ -0,0 +1,107 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group header-0
+    item-2 at level 2: section: group header-1
+      item-3 at level 3: section_header: Table with rich cells
+        item-4 at level 4: table with [4x2]
+          item-5 at level 5: unspecified: group rich_cell_group_1_0_1
+            item-6 at level 6: text: This is a list:
+            item-7 at level 6: list: group list
+              item-8 at level 7: list_item: A First
+              item-9 at level 7: list_item: A Second
+              item-10 at level 7: list_item: A Third
+          item-11 at level 5: unspecified: group rich_cell_group_1_1_1
+            item-12 at level 6: text: This is a formatted list:
+            item-13 at level 6: list: group list
+              item-14 at level 7: list_item: 
+                item-15 at level 8: inline: group group
+                  item-16 at level 9: text: B
+                  item-17 at level 9: text: First
+              item-18 at level 7: list_item: 
+                item-19 at level 8: inline: group group
+                  item-20 at level 9: text: B
+                  item-21 at level 9: text: Second
+              item-22 at level 7: list_item: 
+                item-23 at level 8: inline: group group
+                  item-24 at level 9: text: B
+                  item-25 at level 9: text: Third
+          item-26 at level 5: unspecified: group rich_cell_group_1_0_2
+            item-27 at level 6: text: First Paragraph
+
+Second Paragraph
+            item-28 at level 6: text: Third paragraph before a numbered list
+            item-29 at level 6: list: group list
+              item-30 at level 7: list_item: Number one
+              item-31 at level 7: list_item: Number two
+              item-32 at level 7: list_item: Number three
+          item-33 at level 5: unspecified: group rich_cell_group_1_1_2
+            item-34 at level 6: text: This is simple text with
+            item-35 at level 6: text: bold
+            item-36 at level 6: text: ,
+            item-37 at level 6: text: strikethrough
+            item-38 at level 6: text: and
+            item-39 at level 6: text: italic
+            item-40 at level 6: text: formatting with x
+            item-41 at level 6: text: 2
+            item-42 at level 6: text: and H
+            item-43 at level 6: text: 2
+            item-44 at level 6: text: O
+          item-45 at level 5: unspecified: group rich_cell_group_1_0_3
+            item-46 at level 6: text: This is a paragraph
+            item-47 at level 6: text: This is another paragraph
+        item-48 at level 4: inline: group group
+        item-49 at level 4: text: 
+        item-50 at level 4: text: 
+        item-51 at level 4: text: 
+        item-52 at level 4: text: 
+        item-53 at level 4: text: 
+        item-54 at level 4: text: 
+      item-55 at level 3: section_header: Table with nested table
+        item-56 at level 4: text: Before table
+        item-57 at level 4: table with [3x2]
+          item-58 at level 5: unspecified: group rich_cell_group_2_1_1
+            item-59 at level 6: text: Simple cell with
+            item-60 at level 6: text: bold
+            item-61 at level 6: text: and
+            item-62 at level 6: text: italic
+            item-63 at level 6: text: text
+          item-64 at level 5: unspecified: group rich_cell_group_3_0_2
+            item-65 at level 6: table with [2x3]
+              item-66 at level 7: unspecified: group rich_cell_group_3_0_1
+                item-67 at level 8: text: Cell 1
+              item-68 at level 7: unspecified: group rich_cell_group_3_1_1
+                item-69 at level 8: text: Cell 2
+              item-70 at level 7: unspecified: group rich_cell_group_3_2_1
+                item-71 at level 8: text: Cell 3
+            item-72 at level 6: text: 
+          item-73 at level 5: unspecified: group rich_cell_group_4_1_2
+            item-74 at level 6: text: Rich cell
+A nested table
+            item-75 at level 6: table with [2x3]
+              item-76 at level 7: unspecified: group rich_cell_group_4_0_1
+                item-77 at level 8: text: Cell 1
+              item-78 at level 7: unspecified: group rich_cell_group_4_1_1
+                item-79 at level 8: text: Cell 2
+              item-80 at level 7: unspecified: group rich_cell_group_4_2_1
+                item-81 at level 8: text: Cell 3
+            item-82 at level 6: text: 
+        item-83 at level 4: inline: group group
+        item-84 at level 4: inline: group group
+          item-85 at level 5: text: After table with
+          item-86 at level 5: text: bold
+          item-87 at level 5: text: ,
+          item-88 at level 5: text: underline
+          item-89 at level 5: text: ,
+          item-90 at level 5: text: strikethrough
+          item-91 at level 5: text: , and
+          item-92 at level 5: text: italic
+          item-93 at level 5: text: formatting
+        item-94 at level 4: text: 
+      item-95 at level 3: section_header: Table with pictures
+        item-96 at level 4: text: 
+        item-97 at level 4: table with [3x2]
+          item-98 at level 5: unspecified: group rich_cell_group_5_1_1
+            item-99 at level 6: picture
+          item-100 at level 5: unspecified: group rich_cell_group_5_0_2
+            item-101 at level 6: text: Text and picture
+            item-102 at level 6: picture
+        item-103 at level 4: text: 
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md
@@ -0,0 +1,25 @@
+### Table with rich cells
+
+|  Column A                                                                                                              | Column B                                                                                                   |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
+| This is a list:  - A First - A Second - A Third                                                                        | This is a formatted list:  - B **First** - B *Second* - B Third                                            |
+| First Paragraph  Second Paragraph  Third paragraph before a numbered list  1. Number one 2. Number two 3. Number three | This is simple text with  **bold**  ,  ~~strikethrough~~  and  *italic*  formatting with x  2  and H  2  O |
+| This is a paragraph  This is another paragraph                                                                         |                                                                                                            |
+
+### Table with nested table
+
+Before table
+
+|  Column A                                                                                                | Column B                                                                                                                                       |
+|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| Simple cell upper left                                                                                   | Simple cell with  **bold**  and  *italic*  text                                                                                                |
+| | A        | B      | C          | |----------|--------|------------| | *Cell 1* | Cell 2 | **Cell 3** | | Rich cell A nested table  | A          | B            | C      | |------------|--------------|--------| | ~~Cell 1~~ | ***Cell 2*** | Cell 3 | |
+
+After table with **bold** , underline , ~~strikethrough~~ , and *italic* formatting
+
+### Table with pictures
+
+| Column A                         | Column B       |
+|----------------------------------|----------------|
+| Only text                        | <!-- image --> |
+| Text and picture  <!-- image --> |                |
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from pathlib import Path

@@ -18,23 +19,109 @@ from docling.document_converter import DocumentConverter
 from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export

+_log = logging.getLogger(__name__)
+
 GENERATE = GEN_TEST_DATA
 IS_CI = bool(os.getenv("CI"))


+@pytest.fixture(scope="module")
+def docx_paths() -> list[Path]:
+    """Return all ``.docx`` files found under ``./tests/data/docx/``, sorted."""
+    data_dir = Path("./tests/data/docx/")
+
+    # rglob also descends into subdirectories of the data folder
+    return sorted(data_dir.rglob("*.docx"))
+
+
+def get_converter():
+    """Build a DocumentConverter whose allowed formats are limited to DOCX."""
+    return DocumentConverter(allowed_formats=[InputFormat.DOCX])
+
+
+@pytest.fixture(scope="module")
+def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]:
+    """Convert each DOCX test file once and share the results across the module.
+
+    Args:
+        docx_paths: Paths of the DOCX files to convert.
+
+    Returns:
+        One ``(gt_path, doc)`` pair per input file, where ``gt_path`` is the
+        ground-truth path stem under ``groundtruth/docling_v2`` (two levels
+        above the input file) and ``doc`` is the converted document.
+    """
+    converted: list[tuple[Path, DoclingDocument]] = []
+
+    converter = get_converter()
+
+    for docx_path in docx_paths:
+        _log.debug(f"converting {docx_path}")
+
+        gt_path = (
+            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
+        )
+
+        conv_result: ConversionResult = converter.convert(docx_path)
+
+        doc: DoclingDocument = conv_result.document
+
+        # Name the input file that was converted, not the ground-truth path.
+        assert doc, f"Failed to convert document from file {docx_path}"
+        converted.append((gt_path, doc))
+
+    return converted
+
+
+def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]):
+    """Compare the exports of converted documents with the stored ground truth.
+
+    Args:
+        docx_paths: Pairs of a path stem and the document to check. The
+            suffixes ``.md``, ``.itxt`` and ``.json`` (plus ``.html`` for
+            ``word_tables.docx``) are appended to the stem to locate the
+            ground-truth files.
+    """
+    # Any failure while probing for LibreOffice is treated as "not available".
+    try:
+        has_libreoffice = get_libreoffice_cmd(raise_if_unavailable=True) is not None
+    except Exception:
+        has_libreoffice = False
+
+    for docx_path, doc in docx_paths:
+        is_drawingml = docx_path.name == "drawingml.docx"
+        if is_drawingml and not (IS_CI or has_libreoffice):
+            print(f"Skipping {docx_path} because no Libreoffice is installed.")
+            continue
+
+        gt_stem = str(docx_path)
+
+        pred_md: str = doc.export_to_markdown()
+        assert verify_export(pred_md, gt_stem + ".md", generate=GENERATE), (
+            f"export to markdown failed on {docx_path}"
+        )
+
+        pred_itxt: str = doc._export_to_indented_text(
+            max_text_len=70, explicit_tables=False
+        )
+        assert verify_export(pred_itxt, gt_stem + ".itxt", generate=GENERATE), (
+            f"export to indented-text failed on {docx_path}"
+        )
+
+        assert verify_document(doc, gt_stem + ".json", generate=GENERATE), (
+            f"DoclingDocument verification failed on {docx_path}"
+        )
+
+        # The HTML export is only checked for the tables sample.
+        if docx_path.name != "word_tables.docx":
+            continue
+
+        pred_html: str = doc.export_to_html()
+        assert verify_export(
+            pred_text=pred_html,
+            gtfile=gt_stem + ".html",
+            generate=GENERATE,
+        ), f"export to html failed on {docx_path}"
+
+
+# Input file left out of test_e2e_docx_conversions and checked on its own by
+# the xfail-marked test_textbox_conversion.
+flaky_file = "textbox.docx"
+
+
+def test_e2e_docx_conversions(documents):
+    """Run the end-to-end export checks on every document except the flaky one."""
+    stable_docs = []
+    for pair in documents:
+        if pair[0].name != flaky_file:
+            stable_docs.append(pair)
+
+    _test_e2e_docx_conversions_impl(stable_docs)
+
+
 @pytest.mark.xfail(strict=False)
-def test_textbox_extraction():
-    in_path = Path("tests/data/docx/textbox.docx")
-    in_doc = InputDocument(
-        path_or_stream=in_path,
-        format=InputFormat.DOCX,
-        backend=MsWordDocumentBackend,
-    )
-    backend = MsWordDocumentBackend(
-        in_doc=in_doc,
-        path_or_stream=in_path,
-    )
-    doc = backend.convert()
+def test_textbox_conversion(documents):
+    target = [item for item in documents if item[0].name == flaky_file]
+    _test_e2e_docx_conversions_impl(target)
+
+
+@pytest.mark.xfail(strict=False)
+def test_textbox_extraction(documents):
+    name = "textbox.docx"
+    doc = next(item[1] for item in documents if item[0].name == name)

    # Verify if a particular textbox content is extracted
    textbox_found = False
@@ -44,18 +131,9 @@ def test_textbox_extraction():
    assert textbox_found


-def test_heading_levels():
-    in_path = Path("tests/data/docx/word_sample.docx")
-    in_doc = InputDocument(
-        path_or_stream=in_path,
-        format=InputFormat.DOCX,
-        backend=MsWordDocumentBackend,
-    )
-    backend = MsWordDocumentBackend(
-        in_doc=in_doc,
-        path_or_stream=in_path,
-    )
-    doc = backend.convert()
+def test_heading_levels(documents):
+    name = "word_sample.docx"
+    doc = next(item[1] for item in documents if item[0].name == name)

    found_lvl_1 = found_lvl_2 = False
    for item, _ in doc.iterate_items():
@@ -69,104 +147,11 @@ def test_heading_levels():
    assert found_lvl_1 and found_lvl_2


-def get_docx_paths():
-    # Define the directory you want to search
-    directory = Path("./tests/data/docx/")
+def test_text_after_image_anchors(documents):
+    """Test to analyse whether text gets parsed after image anchors."""

-    # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob("*.docx"))
-    return pdf_files
-
-
-def get_converter():
-    converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
-
-    return converter
-
-
-def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
-    converter = get_converter()
-
-    has_libreoffice = False
-    try:
-        cmd = get_libreoffice_cmd(raise_if_unavailable=True)
-        if cmd is not None:
-            has_libreoffice = True
-    except Exception:
-        pass
-
-    for docx_path in docx_paths:
-        if (
-            not IS_CI
-            and not has_libreoffice
-            and str(docx_path) in ("tests/data/docx/drawingml.docx",)
-        ):
-            print(f"Skipping {docx_path} because no Libreoffice is installed.")
-            continue
-
-        gt_path = (
-            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
-        )
-
-        conv_result: ConversionResult = converter.convert(docx_path)
-
-        doc: DoclingDocument = conv_result.document
-
-        pred_md: str = doc.export_to_markdown()
-        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
-            f"export to markdown failed on {docx_path}"
-        )
-
-        pred_itxt: str = doc._export_to_indented_text(
-            max_text_len=70, explicit_tables=False
-        )
-        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
-            f"export to indented-text failed on {docx_path}"
-        )
-
-        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
-            f"DoclingDocument verification failed on {docx_path}"
-        )
-
-        if docx_path.name == "word_tables.docx":
-            pred_html: str = doc.export_to_html()
-            assert verify_export(
-                pred_text=pred_html,
-                gtfile=str(gt_path) + ".html",
-                generate=GENERATE,
-            ), f"export to html failed on {docx_path}"
-
-
-flaky_path = Path("tests/data/docx/textbox.docx")
-
-
-def test_e2e_docx_conversions():
-    _test_e2e_docx_conversions_impl(
-        docx_paths=[path for path in get_docx_paths() if path != flaky_path]
-    )
-
-
-@pytest.mark.xfail(strict=False)
-def test_textbox_conversion():
-    _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
-
-
-def test_text_after_image_anchors():
-    """
-    Test to analyse whether text gets parsed after image anchors.
-    """
-
-    in_path = Path("tests/data/docx/word_image_anchors.docx")
-    in_doc = InputDocument(
-        path_or_stream=in_path,
-        format=InputFormat.DOCX,
-        backend=MsWordDocumentBackend,
-    )
-    backend = MsWordDocumentBackend(
-        in_doc=in_doc,
-        path_or_stream=in_path,
-    )
-    doc = backend.convert()
+    name = "word_image_anchors.docx"
+    doc = next(item[1] for item in documents if item[0].name == name)

    found_text_after_anchor_1 = found_text_after_anchor_2 = (
        found_text_after_anchor_3
@@ -188,3 +173,38 @@ def test_text_after_image_anchors():
        and found_text_after_anchor_3
        and found_text_after_anchor_4
    )
+
+
+def test_is_rich_table_cell(docx_paths):
+    """Test the function is_rich_table_cell.
+
+    Walks the cells of every table in ``docx_rich_cells.docx`` in document
+    order and compares the backend's rich-cell decision with the expected
+    flags. The number of visited cells must match the number of expected
+    flags exactly.
+    """
+
+    name = "docx_rich_cells.docx"
+    path = next(item for item in docx_paths if item.name == name)
+
+    in_doc = InputDocument(
+        path_or_stream=path,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+        filename=name,
+    )
+    backend = MsWordDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=path,
+    )
+
+    gt_cells: list[bool] = []
+    # table: Table with rich cells
+    gt_cells.extend([False, False, True, True, True, True, True, False])
+    # table: Table with nested table
+    gt_cells.extend([False, False, False, True, True, True])
+    # table: Table with pictures
+    gt_cells.extend([False, False, False, True, True, False])
+    gt_it = iter(gt_cells)
+
+    for idx_t, table in enumerate(backend.docx_obj.tables):
+        for idx_r, row in enumerate(table.rows):
+            for idx_c, cell in enumerate(row.cells):
+                # A default avoids a bare StopIteration when the document
+                # has more cells than expected values.
+                expected = next(gt_it, None)
+                assert expected is not None, (
+                    f"No expected value left for table {idx_t}, row {idx_r}, "
+                    f"col {idx_c} with text: {cell.text}"
+                )
+                assert expected == backend._is_rich_table_cell(cell), (
+                    f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
+                    f"with text: {cell.text}"
+                )
+
+    # Leftover expected values mean some cells were never checked.
+    assert next(gt_it, None) is None, (
+        f"Fewer table cells in {name} than the {len(gt_cells)} expected values"
+    )