fix(docx): slow table parsing (#2553)

* chore(docx): remove unnecessary import Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(docx): simplify parsing of simple tables Simplify the parsing of tables with just text (no rich cells). Move nested function group_cell_elements out of _handle_tables for readability. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): reuse method for finding inline pictures Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): format strikethrough text Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(docx): use fixtures to avoid converting same file multiple times Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(docx): remove unnecessary argument docx_obj in functions Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(docx): add test for rich table cells Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): small improvements in backend and its unit tests Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): parse superscript and subscript formatted text Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-11-06 05:25:53 +01:00
parent 0ba8d5d9e3
commit ef623ffcee
6 changed files with 3366 additions and 218 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Final, Optional, Union
 from docling_core.types.doc import (
    DocItemLabel,
@@ -17,9 +17,9 @@ from docling_core.types.doc import (
    RichTableCell,
    TableCell,
    TableData,
-    TextItem,
+    TableItem,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, Script
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -36,7 +36,6 @@ from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.docx.drawingml.utils import (
    get_docx_to_pdf_converter,
    get_libreoffice_cmd,
    get_pil_from_dml_docx,
 )
 from docling.backend.docx.latex.omml import oMath2Latex
@@ -47,6 +46,18 @@ _log = logging.getLogger(__name__)
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
    _BLIP_NAMESPACES: Final = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
        "v": "urn:schemas-microsoft-com:vml",
        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
        "w10": "urn:schemas-microsoft-com:office:word",
        "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
    }
    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@@ -58,6 +69,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.xml_namespaces = {
            "w": "http://schemas.microsoft.com/office/word/2003/wordml"
        }
        self.blip_xpath_expr = etree.XPath(
            ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
        )
        # self.initialise(path_or_stream)
        # Word file:
        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
@@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
-            doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
+            doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
-            # doc, _ = doc_info
+
            return doc
        else:
            raise RuntimeError(
@@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _walk_linear(
        self,
        body: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
        # parent:
    ) -> tuple[DoclingDocument, list[RefItem]]:
@@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for element in body:
            tag_name = etree.QName(element).localname
            # Check for Inline Images (blip elements)
-            namespaces = {
+            drawing_blip = self.blip_xpath_expr(element)
-                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+            drawingml_els = element.findall(
-                "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
-                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+            )
                "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
                "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
                "v": "urn:schemas-microsoft-com:vml",
                "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
                "w10": "urn:schemas-microsoft-com:office:word",
                "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
            }
            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
            drawing_blip = xpath_expr(element)
            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
            # Check for textbox content - check multiple textbox formats
            # Only process if the element hasn't been processed before
@@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if element_id not in self.processed_textbox_elements:
                # Modern Word textboxes
                txbx_xpath = etree.XPath(
-                    ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+                    ".//w:txbxContent|.//v:textbox//w:p",
                    namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                )
                textbox_elements = txbx_xpath(element)
@@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    # Additional checks for textboxes in DrawingML and VML formats
                    alt_txbx_xpath = etree.XPath(
                        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
-                        namespaces=namespaces,
+                        namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                    )
                    textbox_elements = alt_txbx_xpath(element)
@@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    if not textbox_elements:
                        shape_text_xpath = etree.XPath(
                            ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
-                            namespaces=namespaces,
+                            namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
                        )
                        shape_text_elements = shape_text_xpath(element)
                        if shape_text_elements:
@@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    _log.debug(
                        f"Found textbox content with {len(textbox_elements)} elements"
                    )
-                    tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
+                    tbc = self._handle_textbox_content(textbox_elements, doc)
                    added_elements.extend(tbc)
            # Check for Tables
-            if element.tag.endswith("tbl"):
+            if tag_name == "tbl":
                try:
-                    t = self._handle_tables(element, docx_obj, doc)
+                    t = self._handle_tables(element, doc)
                    added_elements.extend(t)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
            # Check for Image
            elif drawing_blip:
-                pics = self._handle_pictures(docx_obj, drawing_blip, doc)
+                pics = self._handle_pictures(drawing_blip, doc)
                added_elements.extend(pics)
                # Check for Text after the Image
                if (
-                    tag_name in ["p"]
+                    tag_name == "p"
-                    and element.find(".//w:t", namespaces=namespaces) is not None
+                    and element.find(
                        ".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
                    )
                    is not None
                ):
-                    te1 = self._handle_text_elements(element, docx_obj, doc)
+                    te1 = self._handle_text_elements(element, doc)
                    added_elements.extend(te1)
            # Check for DrawingML elements
            elif drawingml_els:
@@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                else:
                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
            # Check for the sdt containers, like table of contents
-            elif tag_name in ["sdt"]:
+            elif tag_name == "sdt":
-                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
+                sdt_content = element.find(
                    ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
                )
                if sdt_content is not None:
                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
-                    paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
+                    paragraphs = sdt_content.findall(
                        ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
                    )
                    for p in paragraphs:
-                        te = self._handle_text_elements(p, docx_obj, doc)
+                        te = self._handle_text_elements(p, doc)
                        added_elements.extend(te)
            # Check for Text
-            elif tag_name in ["p"]:
+            elif tag_name == "p":
                # "tcPr", "sectPr"
-                te = self._handle_text_elements(element, docx_obj, doc)
+                te = self._handle_text_elements(element, doc)
                added_elements.extend(te)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for key in keys_to_reset:
            self.list_counters[key] = 0
-    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+    def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
        """Check if a list is numbered based on its numFmt value."""
        try:
            # Access the numbering part of the document
-            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+            if not hasattr(self.docx_obj, "part") or not hasattr(
                self.docx_obj.part, "package"
            ):
                return False
            numbering_part = None
            # Find the numbering part
-            for part in docx_obj.part.package.parts:
+            for part in self.docx_obj.part.package.parts:
                if "numbering" in part.partname:
                    numbering_part = part
                    break
@@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
        # The .bold and .italic properties are booleans, but .underline can be an enum
        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
-        has_bold = run.bold or False
+        is_bold = run.bold or False
-        has_italic = run.italic or False
+        is_italic = run.italic or False
        is_strikethrough = run.font.strike or False
        # Convert any non-None underline value to True
-        has_underline = bool(run.underline is not None and run.underline)
+        is_underline = bool(run.underline is not None and run.underline)
        is_sub = run.font.subscript or False
        is_sup = run.font.superscript or False
        script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE
        return Formatting(
-            bold=has_bold,
+            bold=is_bold,
-            italic=has_italic,
+            italic=is_italic,
-            underline=has_underline,
+            underline=is_underline,
            strikethrough=is_strikethrough,
            script=script,
        )
    def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_textbox_content(
        self,
        textbox_elements: list,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
@@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # Process all the paragraphs
        for p, position in all_paragraphs:
            # Create paragraph object to get text content
-            paragraph = Paragraph(p, docx_obj)
+            paragraph = Paragraph(p, self.docx_obj)
            text_content = paragraph.text
            # Create a unique identifier based on content and position
@@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Mark this paragraph as processed
            processed_paragraphs.add(paragraph_id)
-            elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
+            elem_ref.extend(self._handle_text_elements(p, doc))
        # Restore original parent
        self.parents[level] = original_parent
@@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_text_elements(
        self,
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        paragraph = Paragraph(element, docx_obj)
+        paragraph = Paragraph(element, self.docx_obj)
        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
            element=element, text=paragraph.text
@@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            and p_style_id not in ["Title", "Heading"]
        ):
            # Check if this is actually a numbered list by examining the numFmt
-            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
+            is_numbered = self._is_numbered_list(numid, ilevel)
            li = self._add_list_item(
                doc=doc,
@@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            )
        return elem_ref
    @staticmethod
    def _group_cell_elements(
        group_name: str,
        doc: DoclingDocument,
        provs_in_cell: list[RefItem],
        docling_table: TableItem,
    ) -> RefItem:
        """Gather the items of one table cell under a new group of the table.

        A group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` with
        ``docling_table`` as its parent. Every reference in ``provs_in_cell`` is
        appended to the children of that group, removed from the children of
        its previous parent (when listed there) and re-parented to the group.

        Args:
            group_name: Name given to the new group.
            doc: Document that owns the items and receives the new group.
            provs_in_cell: References of the items created for the cell.
            docling_table: Table item that becomes the parent of the group.

        Returns:
            The reference of the newly created group.
        """
        cell_group = doc.add_group(
            parent=docling_table,
            name=group_name,
            label=GroupLabel.UNSPECIFIED,
        )

        for item_ref in provs_in_cell:
            cell_group.children.append(item_ref)

            # Detach the item from wherever it was attached before.
            node = item_ref.resolve(doc)
            previous_parent = node.parent.resolve(doc)
            if node.get_ref() in previous_parent.children:
                previous_parent.children.remove(node.get_ref())

            node.parent = cell_group.get_ref()

        return cell_group.get_ref()
    def _handle_tables(
        self,
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
    ) -> list[RefItem]:
        elem_ref: list[RefItem] = []
-        table: Table = Table(element, docx_obj)
+        table: Table = Table(element, self.docx_obj)
        num_rows = len(table.rows)
        num_cols = len(table.columns)
        _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
@@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            cell_element = table.rows[0].cells[0]
            # In case we have a table of only 1 cell, we consider it furniture
            # And proceed processing the content of the cell as though it's in the document body
-            self._walk_linear(cell_element._element, docx_obj, doc)
+            self._walk_linear(cell_element._element, doc)
            return elem_ref
        data = TableData(num_rows=num_rows, num_cols=num_cols)
@@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    text = text.replace("<eq>", "$").replace("</eq>", "$")
                provs_in_cell: list[RefItem] = []
-                _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
+                rich_table_cell: bool = self._is_rich_table_cell(cell)
                ref_for_rich_cell = provs_in_cell[0]
                rich_table_cell = False
-                def group_cell_elements(
+                if rich_table_cell:
-                    group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
+                    _, provs_in_cell = self._walk_linear(cell._element, doc)
-                ) -> RefItem:
+                _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")
                    group_element = doc.add_group(
                        label=GroupLabel.UNSPECIFIED,
                        name=group_name,
                        parent=docling_table,
                    )
                    for prov in provs_in_cell:
                        group_element.children.append(prov)
                        pr_item = prov.resolve(doc)
                        item_parent = pr_item.parent.resolve(doc)
                        if pr_item.get_ref() in item_parent.children:
                            item_parent.children.remove(pr_item.get_ref())
                        pr_item.parent = group_element.get_ref()
                    ref_for_rich_cell = group_element.get_ref()
                    return ref_for_rich_cell
-                if len(provs_in_cell) > 1:
+                if len(provs_in_cell) > 0:
                    # Cell has multiple elements, we need to group them
                    rich_table_cell = True
                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
-                    ref_for_rich_cell = group_cell_elements(
+                    ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
-                        group_name, doc, provs_in_cell
+                        group_name, doc, provs_in_cell, docling_table
                    )
                elif len(provs_in_cell) == 1:
                    item_ref = provs_in_cell[0]
                    pr_item = item_ref.resolve(doc)
                    if isinstance(pr_item, TextItem):
                        # Cell has only one element and it's just a text
                        rich_table_cell = False
                        doc.delete_items(node_items=[pr_item])
                    else:
                        rich_table_cell = True
                        group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
                        ref_for_rich_cell = group_cell_elements(
                            group_name, doc, provs_in_cell
                        )
                else:
                    rich_table_cell = False
                if rich_table_cell:
                    rich_cell = RichTableCell(
                        text=text,
@@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    col_idx += cell.grid_span
        return elem_ref
    def _is_rich_table_cell(self, cell: _Cell) -> bool:
        """Decide whether a docx cell has to be parsed as a Docling RichTableCell.

        Parsing a rich cell means walking its lxml elements and creating node
        items, so a plain TableCell is preferred whenever the cell is plain text.
        The cell counts as plain text (``False`` is returned) only when:

        - it has at most one paragraph as a direct child,
        - its direct children are only paragraphs (``p``) or table-cell
          properties (``tcPr``),
        - no child contains an ``a:blip`` or a ``w:drawing`` element,
        - every run that is a direct child of the paragraph yields a format
          equal to the default ``Formatting()``.

        Args:
            cell: A docx cell

        Returns:
            Whether the docx cell should be parsed as RichTableCell
        """
        w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
        cell_xml = cell._tc

        # More than one paragraph always requires a rich cell.
        direct_paragraphs = list(cell_xml.iterchildren(w_ns + "p"))
        if len(direct_paragraphs) > 1:
            return True

        # Only a paragraph and the table-cell properties are allowed as children.
        plain_tags = {"p", "tcPr"}
        if any(child.tag.split("}")[-1] not in plain_tags for child in cell_xml):
            return True

        # Inline pictures and DrawingML content require a rich cell.
        for child in cell_xml:
            if self.blip_xpath_expr(child):
                return True
            drawings = child.findall(
                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
            )
            if drawings:
                return True

        # Any run whose format differs from the default requires a rich cell.
        plain_format = Formatting()
        for paragraph in direct_paragraphs:
            for run_xml in paragraph.iterchildren(w_ns + "r"):
                run_format = MsWordDocumentBackend._get_format_from_run(
                    Run(run_xml, self.docx_obj)
                )
                if run_format != plain_format:
                    return True

        # All checks passed: plain text only
        return False
    def _handle_pictures(
-        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
+        self, drawing_blip: Any, doc: DoclingDocument
    ) -> list[RefItem]:
        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
            )
-            if rId in docx_obj.part.rels:
+            if rId in self.docx_obj.part.rels:
                # Access the image part using the relationship ID
-                image_part = docx_obj.part.rels[rId].target_part
+                image_part = self.docx_obj.part.rels[rId].target_part
                image_data = image_part.blob  # Get the binary image data
            return image_data
--- a/tests/data/docx/docx_rich_cells.docx
+++ b/tests/data/docx/docx_rich_cells.docx
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt
@@ -0,0 +1,107 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group header-0
    item-2 at level 2: section: group header-1
      item-3 at level 3: section_header: Table with rich cells
        item-4 at level 4: table with [4x2]
          item-5 at level 5: unspecified: group rich_cell_group_1_0_1
            item-6 at level 6: text: This is a list:
            item-7 at level 6: list: group list
              item-8 at level 7: list_item: A First
              item-9 at level 7: list_item: A Second
              item-10 at level 7: list_item: A Third
          item-11 at level 5: unspecified: group rich_cell_group_1_1_1
            item-12 at level 6: text: This is a formatted list:
            item-13 at level 6: list: group list
              item-14 at level 7: list_item: 
                item-15 at level 8: inline: group group
                  item-16 at level 9: text: B
                  item-17 at level 9: text: First
              item-18 at level 7: list_item: 
                item-19 at level 8: inline: group group
                  item-20 at level 9: text: B
                  item-21 at level 9: text: Second
              item-22 at level 7: list_item: 
                item-23 at level 8: inline: group group
                  item-24 at level 9: text: B
                  item-25 at level 9: text: Third
          item-26 at level 5: unspecified: group rich_cell_group_1_0_2
            item-27 at level 6: text: First Paragraph
 Second Paragraph
            item-28 at level 6: text: Third paragraph before a numbered list
            item-29 at level 6: list: group list
              item-30 at level 7: list_item: Number one
              item-31 at level 7: list_item: Number two
              item-32 at level 7: list_item: Number three
          item-33 at level 5: unspecified: group rich_cell_group_1_1_2
            item-34 at level 6: text: This is simple text with
            item-35 at level 6: text: bold
            item-36 at level 6: text: ,
            item-37 at level 6: text: strikethrough
            item-38 at level 6: text: and
            item-39 at level 6: text: italic
            item-40 at level 6: text: formatting with x
            item-41 at level 6: text: 2
            item-42 at level 6: text: and H
            item-43 at level 6: text: 2
            item-44 at level 6: text: O
          item-45 at level 5: unspecified: group rich_cell_group_1_0_3
            item-46 at level 6: text: This is a paragraph
            item-47 at level 6: text: This is another paragraph
        item-48 at level 4: inline: group group
        item-49 at level 4: text: 
        item-50 at level 4: text: 
        item-51 at level 4: text: 
        item-52 at level 4: text: 
        item-53 at level 4: text: 
        item-54 at level 4: text: 
      item-55 at level 3: section_header: Table with nested table
        item-56 at level 4: text: Before table
        item-57 at level 4: table with [3x2]
          item-58 at level 5: unspecified: group rich_cell_group_2_1_1
            item-59 at level 6: text: Simple cell with
            item-60 at level 6: text: bold
            item-61 at level 6: text: and
            item-62 at level 6: text: italic
            item-63 at level 6: text: text
          item-64 at level 5: unspecified: group rich_cell_group_3_0_2
            item-65 at level 6: table with [2x3]
              item-66 at level 7: unspecified: group rich_cell_group_3_0_1
                item-67 at level 8: text: Cell 1
              item-68 at level 7: unspecified: group rich_cell_group_3_1_1
                item-69 at level 8: text: Cell 2
              item-70 at level 7: unspecified: group rich_cell_group_3_2_1
                item-71 at level 8: text: Cell 3
            item-72 at level 6: text: 
          item-73 at level 5: unspecified: group rich_cell_group_4_1_2
            item-74 at level 6: text: Rich cell
 A nested table
            item-75 at level 6: table with [2x3]
              item-76 at level 7: unspecified: group rich_cell_group_4_0_1
                item-77 at level 8: text: Cell 1
              item-78 at level 7: unspecified: group rich_cell_group_4_1_1
                item-79 at level 8: text: Cell 2
              item-80 at level 7: unspecified: group rich_cell_group_4_2_1
                item-81 at level 8: text: Cell 3
            item-82 at level 6: text: 
        item-83 at level 4: inline: group group
        item-84 at level 4: inline: group group
          item-85 at level 5: text: After table with
          item-86 at level 5: text: bold
          item-87 at level 5: text: ,
          item-88 at level 5: text: underline
          item-89 at level 5: text: ,
          item-90 at level 5: text: strikethrough
          item-91 at level 5: text: , and
          item-92 at level 5: text: italic
          item-93 at level 5: text: formatting
        item-94 at level 4: text: 
      item-95 at level 3: section_header: Table with pictures
        item-96 at level 4: text: 
        item-97 at level 4: table with [3x2]
          item-98 at level 5: unspecified: group rich_cell_group_5_1_1
            item-99 at level 6: picture
          item-100 at level 5: unspecified: group rich_cell_group_5_0_2
            item-101 at level 6: text: Text and picture
            item-102 at level 6: picture
        item-103 at level 4: text: 
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json
--- a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md
+++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md
@@ -0,0 +1,25 @@
 ### Table with rich cells
 |  Column A                                                                                                              | Column B                                                                                                   |
 |------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
 | This is a list:  - A First - A Second - A Third                                                                        | This is a formatted list:  - B **First** - B *Second* - B Third                                            |
 | First Paragraph  Second Paragraph  Third paragraph before a numbered list  1. Number one 2. Number two 3. Number three | This is simple text with  **bold**  ,  ~~strikethrough~~  and  *italic*  formatting with x  2  and H  2  O |
 | This is a paragraph  This is another paragraph                                                                         |                                                                                                            |
 ### Table with nested table
 Before table
 |  Column A                                                                                                | Column B                                                                                                                                       |
 |----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
 | Simple cell upper left                                                                                   | Simple cell with  **bold**  and  *italic*  text                                                                                                |
 | | A        | B      | C          | |----------|--------|------------| | *Cell 1* | Cell 2 | **Cell 3** | | Rich cell A nested table  | A          | B            | C      | |------------|--------------|--------| | ~~Cell 1~~ | ***Cell 2*** | Cell 3 | |
 After table with **bold** , underline , ~~strikethrough~~ , and *italic* formatting
 ### Table with pictures
 | Column A                         | Column B       |
 |----------------------------------|----------------|
 | Only text                        | <!-- image --> |
 | Text and picture  <!-- image --> |                |
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,3 +1,4 @@
 import logging
 import os
 from pathlib import Path
@@ -18,23 +19,109 @@ from docling.document_converter import DocumentConverter
 from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export
 # Module-level logger named after this test module.
 _log = logging.getLogger(__name__)
 # Forwarded as the `generate=` argument of verify_export / verify_document.
 GENERATE = GEN_TEST_DATA
 # True when the CI environment variable is set to any non-empty string.
 IS_CI = bool(os.getenv("CI"))
@pytest.fixture(scope="module")
 def docx_paths() -> list[Path]:
    # Define the directory you want to search
    directory = Path("./tests/data/docx/")
    # List all docx files in the directory and its subdirectories
    docx_files = sorted(directory.rglob("*.docx"))
    return docx_files
 def get_converter():
    """Build a DocumentConverter whose allowed formats are limited to DOCX."""
    return DocumentConverter(allowed_formats=[InputFormat.DOCX])
@pytest.fixture(scope="module")
 def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]:
    documents: list[dict[Path, DoclingDocument]] = []
    converter = get_converter()
    for docx_path in docx_paths:
        _log.debug(f"converting {docx_path}")
        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
        )
        conv_result: ConversionResult = converter.convert(docx_path)
        doc: DoclingDocument = conv_result.document
        assert doc, f"Failed to convert document from file {gt_path}"
        documents.append((gt_path, doc))
    return documents
 def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]):
    """Check the exports of already-converted documents against reference files.

    Each item of ``docx_paths`` is a ``(path, document)`` pair. The path is the
    prefix of the reference files: ``.md``, ``.itxt``, ``.json`` and, for
    ``word_tables.docx`` only, ``.html``.
    """
    # Any failure while probing for LibreOffice counts as "not available".
    try:
        has_libreoffice = get_libreoffice_cmd(raise_if_unavailable=True) is not None
    except Exception:
        has_libreoffice = False

    for gt_path, doc in docx_paths:
        is_drawingml = gt_path.name == "drawingml.docx"
        if is_drawingml and not (IS_CI or has_libreoffice):
            print(f"Skipping {gt_path} because no Libreoffice is installed.")
            continue

        gt_prefix = str(gt_path)

        # Markdown export
        markdown: str = doc.export_to_markdown()
        md_ok = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
        assert md_ok, f"export to markdown failed on {gt_path}"

        # Indented-text export
        indented: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_ok = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
        assert itxt_ok, f"export to indented-text failed on {gt_path}"

        # Serialized document
        json_ok = verify_document(doc, gt_prefix + ".json", generate=GENERATE)
        assert json_ok, f"DoclingDocument verification failed on {gt_path}"

        # The HTML export is only checked for one specific file.
        if gt_path.name != "word_tables.docx":
            continue
        html: str = doc.export_to_html()
        html_ok = verify_export(
            pred_text=html,
            gtfile=gt_prefix + ".html",
            generate=GENERATE,
        )
        assert html_ok, f"export to html failed on {gt_path}"
 # Name of the DOCX file that test_e2e_docx_conversions leaves out.
 flaky_file = "textbox.docx"
 def test_e2e_docx_conversions(documents):
    """Run the end-to-end export checks on every document except ``flaky_file``."""
    _test_e2e_docx_conversions_impl(
        [(path, doc) for path, doc in documents if path.name != flaky_file]
    )
@pytest.mark.xfail(strict=False)
-def test_textbox_extraction():
+def test_textbox_conversion(documents):
-    in_path = Path("tests/data/docx/textbox.docx")
+    target = [item for item in documents if item[0].name == flaky_file]
-    in_doc = InputDocument(
+    _test_e2e_docx_conversions_impl(target)
-        path_or_stream=in_path,
+
-        format=InputFormat.DOCX,
+
-        backend=MsWordDocumentBackend,
+@pytest.mark.xfail(strict=False)
-    )
+def test_textbox_extraction(documents):
-    backend = MsWordDocumentBackend(
+    name = "textbox.docx"
-        in_doc=in_doc,
+    doc = next(item[1] for item in documents if item[0].name == name)
        path_or_stream=in_path,
    )
    doc = backend.convert()
    # Verify if a particular textbox content is extracted
    textbox_found = False
@@ -44,18 +131,9 @@ def test_textbox_extraction():
    assert textbox_found
-def test_heading_levels():
+def test_heading_levels(documents):
-    in_path = Path("tests/data/docx/word_sample.docx")
+    name = "word_sample.docx"
-    in_doc = InputDocument(
+    doc = next(item[1] for item in documents if item[0].name == name)
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()
    found_lvl_1 = found_lvl_2 = False
    for item, _ in doc.iterate_items():
@@ -69,104 +147,11 @@ def test_heading_levels():
    assert found_lvl_1 and found_lvl_2
-def get_docx_paths():
+def test_text_after_image_anchors(documents):
-    # Define the directory you want to search
+    """Test to analyse whether text gets parsed after image anchors."""
    directory = Path("./tests/data/docx/")
-    # List all PDF files in the directory and its subdirectories
+    name = "word_image_anchors.docx"
-    pdf_files = sorted(directory.rglob("*.docx"))
+    doc = next(item[1] for item in documents if item[0].name == name)
    return pdf_files
 def get_converter():
    """Build a DocumentConverter whose allowed formats are limited to DOCX."""
    return DocumentConverter(allowed_formats=[InputFormat.DOCX])
 def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
    """Convert each DOCX file and compare its exports with the reference files.

    Reference files are looked up under
    ``<input dir>/../groundtruth/docling_v2/`` and are named after the input
    file plus ``.md``, ``.itxt``, ``.json`` and, for ``word_tables.docx`` only,
    ``.html``.
    """
    converter = get_converter()

    # Any failure while probing for LibreOffice counts as "not available".
    has_libreoffice = False
    try:
        cmd = get_libreoffice_cmd(raise_if_unavailable=True)
        if cmd is not None:
            has_libreoffice = True
    except Exception:
        pass

    for docx_path in docx_paths:
        # Compare by file name instead of the string form of the whole path:
        # the string form depends on the platform's path separator and on how
        # the search directory was spelled, so the skip could silently never
        # trigger.
        if not IS_CI and not has_libreoffice and docx_path.name == "drawingml.docx":
            print(f"Skipping {docx_path} because no Libreoffice is installed.")
            continue

        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
        )

        conv_result: ConversionResult = converter.convert(docx_path)

        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
            f"export to markdown failed on {docx_path}"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
            f"export to indented-text failed on {docx_path}"
        )

        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
            f"DoclingDocument verification failed on {docx_path}"
        )

        # The HTML export is only checked for one specific file.
        if docx_path.name == "word_tables.docx":
            pred_html: str = doc.export_to_html()
            assert verify_export(
                pred_text=pred_html,
                gtfile=str(gt_path) + ".html",
                generate=GENERATE,
            ), f"export to html failed on {docx_path}"
 # DOCX file left out of test_e2e_docx_conversions and run separately by
 # test_textbox_conversion.
 flaky_path = Path("tests/data/docx/textbox.docx")
 def test_e2e_docx_conversions():
    """Run the end-to-end checks on every DOCX file except ``flaky_path``."""
    stable_paths = [path for path in get_docx_paths() if path != flaky_path]
    _test_e2e_docx_conversions_impl(docx_paths=stable_paths)
@pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
    _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
 def test_text_after_image_anchors():
    """
    Test to analyse whether text gets parsed after image anchors.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()
    found_text_after_anchor_1 = found_text_after_anchor_2 = (
        found_text_after_anchor_3
@@ -188,3 +173,38 @@ def test_text_after_image_anchors():
        and found_text_after_anchor_3
        and found_text_after_anchor_4
    )
 def test_is_rich_table_cell(docx_paths):
    """Test the function is_rich_table_cell.

    Iterates over every cell of every table in ``docx_rich_cells.docx`` and
    compares ``_is_rich_table_cell`` with the expected flags, which are listed
    in the same iteration order. The test also fails when the number of cells
    found differs from the number of expected flags.
    """
    name = "docx_rich_cells.docx"
    path = next(item for item in docx_paths if item.name == name)

    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
        filename=name,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=path,
    )

    gt_cells: list[bool] = []
    # table: Table with rich cells
    gt_cells.extend([False, False, True, True, True, True, True, False])
    # table: Table with nested table
    gt_cells.extend([False, False, False, True, True, True])
    # table: Table with pictures
    gt_cells.extend([False, False, False, True, True, False])

    gt_it = iter(gt_cells)
    for idx_t, table in enumerate(backend.docx_obj.tables):
        for idx_r, row in enumerate(table.rows):
            for idx_c, cell in enumerate(row.cells):
                # A default avoids an opaque StopIteration when the document
                # contains more cells than there are expected flags.
                expected = next(gt_it, None)
                assert expected is not None, (
                    f"Unexpected cell in table {idx_t}, row {idx_r}, col {idx_c}: "
                    f"only {len(gt_cells)} cells were expected"
                )
                assert expected == backend._is_rich_table_cell(cell), (
                    f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
                    f"with text: {cell.text}"
                )
    # Leftover expected flags mean the document had fewer cells than expected.
    assert next(gt_it, None) is None, (
        f"Fewer table cells found than the {len(gt_cells)} expected"
    )