fix: Improve numbered list detection for msword docs (#2100)

* Improve numbered list detection for msword docs

  This fixes the list detection in MSWord docs by properly tracking and counting the list entries. It fixes https://github.com/docling-project/docling/issues/2090

* DCO Remediation Commit for Nikhil Verma <nikhilgotmail@gmail.com>

  I, Nikhil Verma <nikhilgotmail@gmail.com>, hereby add my Signed-off-by to this commit: 509da6658e

  Signed-off-by: Nikhil Verma <nikhilgotmail@gmail.com>

---------

Signed-off-by: Nikhil Verma <nikhilgotmail@gmail.com>
2025-12-08 12:48:28 +00:00 · 2025-08-22 14:08:34 +05:30
parent 94fcc46aa9
commit 3f03709885
3 changed files with 135 additions and 25 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

        self.level = 0
        self.listIter = 0
+        # Track list counters per numId and ilvl
+        self.list_counters: dict[tuple[int, int], int] = {}

        self.history: dict[str, Any] = {
            "names": [None],
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

        return None, None  # If the paragraph is not part of a list

+    def _get_list_counter(self, numid: int, ilvl: int) -> int:
+        """Get and increment the counter for a specific numId and ilvl combination."""
+        key = (numid, ilvl)
+        if key not in self.list_counters:
+            self.list_counters[key] = 0
+        self.list_counters[key] += 1
+        return self.list_counters[key]
+
+    def _reset_list_counters_for_new_sequence(self, numid: int):
+        """Reset counters when starting a new numbering sequence."""
+        # Reset all counters for this numid
+        keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
+        for key in keys_to_reset:
+            self.list_counters[key] = 0
+
+    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+        """Check if a list is numbered based on its numFmt value."""
+        try:
+            # Access the numbering part of the document
+            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+                return False
+
+            numbering_part = None
+            # Find the numbering part
+            for part in docx_obj.part.package.parts:
+                if "numbering" in part.partname:
+                    numbering_part = part
+                    break
+
+            if numbering_part is None:
+                return False
+
+            # Parse the numbering XML
+            numbering_root = numbering_part.element
+            namespaces = {
+                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+            }
+
+            # Find the numbering definition with the given numId
+            num_xpath = f".//w:num[@w:numId='{numId}']"
+            num_element = numbering_root.find(num_xpath, namespaces=namespaces)
+
+            if num_element is None:
+                return False
+
+            # Get the abstractNumId from the num element
+            abstract_num_id_elem = num_element.find(
+                ".//w:abstractNumId", namespaces=namespaces
+            )
+            if abstract_num_id_elem is None:
+                return False
+
+            abstract_num_id = abstract_num_id_elem.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+            if abstract_num_id is None:
+                return False
+
+            # Find the abstract numbering definition
+            abstract_num_xpath = (
+                f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
+            )
+            abstract_num_element = numbering_root.find(
+                abstract_num_xpath, namespaces=namespaces
+            )
+
+            if abstract_num_element is None:
+                return False
+
+            # Find the level definition for the given ilvl
+            lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
+            lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
+
+            if lvl_element is None:
+                return False
+
+            # Get the numFmt element
+            num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
+            if num_fmt_element is None:
+                return False
+
+            num_fmt = num_fmt_element.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+
+            # Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
+            # Bullet formats include: bullet
+            numbered_formats = {
+                "decimal",
+                "lowerRoman",
+                "upperRoman",
+                "lowerLetter",
+                "upperLetter",
+                "decimalZero",
+            }
+
+            return num_fmt in numbered_formats
+
+        except Exception as e:
+            _log.debug(f"Error determining if list is numbered: {e}")
+            return False
+
    def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
        parts = self._split_text_and_number(style_label)

@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        # Common styles for bullet and numbered lists.
        # "List Bullet", "List Number", "List Paragraph"
        # Identify whether list is a numbered list or not
-        # is_numbered = "List Bullet" not in paragraph.style.name
-        is_numbered = False
        p_style_id, p_level = self._get_label_and_level(paragraph)
        numid, ilevel = self._get_numId_and_ilvl(paragraph)

@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            and ilevel is not None
            and p_style_id not in ["Title", "Heading"]
        ):
+            # Check if this is actually a numbered list by examining the numFmt
+            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
+
            self._add_list_item(
                doc=doc,
                numid=numid,
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        if self._prev_numid() is None:  # Open new list
            self.level_at_new_list = level

+            # Reset counters for the new numbering sequence
+            self._reset_list_counters_for_new_sequence(numid)
+
            self.parents[level] = doc.add_list_group(
                name="list", parent=self.parents[level - 1]
            )

            # Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
            if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
            self._add_formatted_list_item(
                doc, elements, enum_marker, is_numbered, level
            )
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self.level_at_new_list + prev_indent + 1,
                self.level_at_new_list + ilevel + 1,
            ):
-                self.listIter = 0
                self.parents[i] = doc.add_list_group(
                    name="list", parent=self.parents[i - 1]
                )

            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
            if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
            self._add_formatted_list_item(
                doc,
                elements,
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    self.parents[k] = None

            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
            if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
            self._add_formatted_list_item(
                doc,
                elements,
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                is_numbered,
                self.level_at_new_list + ilevel,
            )
-            self.listIter = 0

        elif self._prev_numid() == numid or prev_indent == ilevel:
            # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
            if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
            self._add_formatted_list_item(
                doc, elements, enum_marker, is_numbered, level - 1
            )
--- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.md
+++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.md
@@ -12,9 +12,9 @@ Paragraph 2.1.2

 #### Test 2:

- List item a
- List item b
- List item c
+1. List item a
+2. List item b
+3. List item c

 #### Test 3:

--- a/tests/data/groundtruth/docling_v2/word_sample.docx.md
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.md
@@ -18,9 +18,9 @@ To get started with swimming, first lay down in a water and try not to drown:

 Also, don’t forget:

- Wear sunglasses
- Don’t forget to drink water
- Use sun cream
+1. Wear sunglasses
+2. Don’t forget to drink water
+3. Use sun cream

 Hmm, what else…

@@ -40,6 +40,6 @@ Here are some interesting things a respectful duck could eat:

 And let’s add another list in the end:

- Leaves
- Berries
- Grain
+1. Leaves
+2. Berries
+3. Grain