Improve numbered list detection for msword docs

This fixes the list detection in MSWord docs by properly tracking and counting
the list entries. It fixes
https://github.com/docling-project/docling/issues/2090
This commit is contained in:
Nikhil Verma
2025-08-19 15:15:59 +05:30
parent d2494da8b8
commit 509da6658e
3 changed files with 135 additions and 25 deletions

View File

@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level = 0
self.listIter = 0
# Track list counters per numId and ilvl
self.list_counters: dict[tuple[int, int], int] = {}
self.history: dict[str, Any] = {
"names": [None],
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return None, None # If the paragraph is not part of a list
def _get_list_counter(self, numid: int, ilvl: int) -> int:
"""Get and increment the counter for a specific numId and ilvl combination."""
key = (numid, ilvl)
if key not in self.list_counters:
self.list_counters[key] = 0
self.list_counters[key] += 1
return self.list_counters[key]
def _reset_list_counters_for_new_sequence(self, numid: int):
"""Reset counters when starting a new numbering sequence."""
# Reset all counters for this numid
keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
for key in keys_to_reset:
self.list_counters[key] = 0
def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
"""Check if a list is numbered based on its numFmt value."""
try:
# Access the numbering part of the document
if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
return False
numbering_part = None
# Find the numbering part
for part in docx_obj.part.package.parts:
if "numbering" in part.partname:
numbering_part = part
break
if numbering_part is None:
return False
# Parse the numbering XML
numbering_root = numbering_part.element
namespaces = {
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
}
# Find the numbering definition with the given numId
num_xpath = f".//w:num[@w:numId='{numId}']"
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
if num_element is None:
return False
# Get the abstractNumId from the num element
abstract_num_id_elem = num_element.find(
".//w:abstractNumId", namespaces=namespaces
)
if abstract_num_id_elem is None:
return False
abstract_num_id = abstract_num_id_elem.get(
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
)
if abstract_num_id is None:
return False
# Find the abstract numbering definition
abstract_num_xpath = (
f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
)
abstract_num_element = numbering_root.find(
abstract_num_xpath, namespaces=namespaces
)
if abstract_num_element is None:
return False
# Find the level definition for the given ilvl
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
if lvl_element is None:
return False
# Get the numFmt element
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
if num_fmt_element is None:
return False
num_fmt = num_fmt_element.get(
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
)
# Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
# Bullet formats include: bullet
numbered_formats = {
"decimal",
"lowerRoman",
"upperRoman",
"lowerLetter",
"upperLetter",
"decimalZero",
}
return num_fmt in numbered_formats
except Exception as e:
_log.debug(f"Error determining if list is numbered: {e}")
return False
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self._split_text_and_number(style_label)
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self._get_label_and_level(paragraph)
numid, ilevel = self._get_numId_and_ilvl(paragraph)
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and ilevel is not None
and p_style_id not in ["Title", "Heading"]
):
# Check if this is actually a numbered list by examining the numFmt
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
self._add_list_item(
doc=doc,
numid=numid,
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if self._prev_numid() is None: # Open new list
self.level_at_new_list = level
# Reset counters for the new numbering sequence
self._reset_list_counters_for_new_sequence(numid)
self.parents[level] = doc.add_list_group(
name="list", parent=self.parents[level - 1]
)
# Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
)
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
self.listIter = 0
self.parents[i] = doc.add_list_group(
name="list", parent=self.parents[i - 1]
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc,
elements,
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[k] = None
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc,
elements,
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
is_numbered,
self.level_at_new_list + ilevel,
)
self.listIter = 0
elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)

View File

@@ -12,9 +12,9 @@ Paragraph 2.1.2
#### Test 2:
- List item a
- List item b
- List item c
1. List item a
2. List item b
3. List item c
#### Test 3:

View File

@@ -18,9 +18,9 @@ To get started with swimming, first lay down in a water and try not to drown:
Also, dont forget:
- Wear sunglasses
- Dont forget to drink water
- Use sun cream
1. Wear sunglasses
2. Dont forget to drink water
3. Use sun cream
Hmm, what else…
@@ -40,6 +40,6 @@ Here are some interesting things a respectful duck could eat:
And lets add another list in the end:
- Leaves
- Berries
- Grain
1. Leaves
2. Berries
3. Grain