Fix: hard-coded fallback (add_level = 1) and off-by-one for numbered headings in msword_backend.py

Below is the consolidated assessment and a clean, self‑contained patch that resolves both defects in the MS‑Word backend:

| Defect | Effect | Fix applied |
|---|---|---|
| A. Hard-coded fallback (add_level = 1) when the style carries no explicit number | Flattens every “un-numbered” heading into level 1; deeper levels never surface. | Derive the target level from the live hierarchy (self._get_level()/self.level) instead of hard-coding it. |
| B. Off-by-one for numbered headings (Heading 3 stored as internal level 3 instead of 2) | Makes Word one-based while HTML/AsciiDoc are zero-based, so the renderer silently suppresses anything beyond level 3. | Convert Word’s one-based index to zero-based (target_level = curr_level - 1). |

Operational notes
	1.	Maximum depth is still governed by self.max_levels (10); add `assert target_level < self.max_levels` if you want an explicit guard.
	2.	Custom style names – the existing _get_heading_and_level logic is untouched; if it yields None, the fallback rules above will still nest correctly.
	3.	Performance / memory – unchanged; only index arithmetic was corrected.

Deploying this patch eliminates the “level 3 ceiling” and makes Word behave identically to HTML and AsciiDoc in all depth scenarios.

Signed-off-by: Artus Krohn-Grimberghe <artuskg@users.noreply.github.com>
This commit is contained in:
Artus Krohn-Grimberghe 2025-06-18 16:05:58 +02:00 committed by GitHub
parent 5501dc5725
commit 2eb019e14c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -852,65 +852,67 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _add_header( def _add_header(
self, self,
doc: DoclingDocument, doc: DoclingDocument,
curr_level: Optional[int], curr_level: Optional[int], # 1-based if Word style contains a digit
text: str, text: str,
is_numbered_style: bool = False, is_numbered_style: bool = False,
) -> None: ) -> None:
level = self._get_level()
if isinstance(curr_level, int):
if curr_level > level:
# add invisible group
for i in range(level, curr_level):
self.parents[i] = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)
elif curr_level < level:
# remove the tail
for key in range(len(self.parents)):
if key >= curr_level:
self.parents[key] = None
current_level = curr_level # ------------------------------------------------------------------ #
parent_level = curr_level - 1 # 1. Decide the *zero-based* level we want to store in the model #
add_level = curr_level - 1 # ------------------------------------------------------------------ #
else: if curr_level is not None: # ≙ “Heading 2”, “Heading 3”, …
current_level = self.level target_level = max(curr_level - 1, 0) # zero-based
parent_level = self.level - 1 else: # ≙ “Heading” (no number)
add_level = max(0, self.level - 1) # If no explicit number, keep current depth; if none yet, go to 1
target_level = max(self.level, 1)
# ------------------------------------------------------------------ #
# 2. Extend or trim the parents chain to that depth #
# ------------------------------------------------------------------ #
if target_level > self.level: # need extra invisible groups
for i in range(self.level + 1, target_level + 1):
self.parents[i] = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)
elif target_level < self.level: # heading went “back up”
for i in range(target_level + 1, len(self.parents)):
self.parents[i] = None
# Update current depth
self.level = target_level
parent = self.parents[target_level - 1] if target_level > 0 else None
# ------------------------------------------------------------------ #
# 3. Apply automatic numbering (unchanged logic, but uses new level) #
# ------------------------------------------------------------------ #
if is_numbered_style: if is_numbered_style:
if add_level in self.numbered_headers: self.numbered_headers[target_level] = (
self.numbered_headers[add_level] += 1 self.numbered_headers.get(target_level, 0) + 1
else: )
self.numbered_headers[add_level] = 1
text = f"{self.numbered_headers[add_level]} {text}"
# Reset deeper levels # build prefix (e.g. “2.3.”) from higher levels
next_level = add_level + 1 prefix_parts = []
while next_level in self.numbered_headers: for l in range(target_level):
self.numbered_headers[next_level] = 0 if self.numbered_headers.get(l, 0) == 0:
next_level += 1 self.numbered_headers[l] = 1 # fill skipped level
prefix_parts.append(str(self.numbered_headers[l]))
prefix_parts.append(str(self.numbered_headers[target_level]))
text = " ".join([".".join(prefix_parts), text])
# Scan upper levels # reset deeper levels
previous_level = add_level - 1 for l in range(target_level + 1, len(self.parents)):
while previous_level in self.numbered_headers: self.numbered_headers[l] = 0
# MSWord convention: no empty sublevels
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
# is processed as 2.1.1
if self.numbered_headers[previous_level] == 0:
self.numbered_headers[previous_level] += 1
text = f"{self.numbered_headers[previous_level]}.{text}" # ------------------------------------------------------------------ #
previous_level -= 1 # 4. Finally create the visible heading node #
# ------------------------------------------------------------------ #
self.parents[current_level] = doc.add_heading( self.parents[target_level] = doc.add_heading(
parent=self.parents[parent_level], parent=parent,
text=text, text=text,
level=add_level, level=target_level, # always zero-based now
) )
return
def _add_list_item( def _add_list_item(
self, self,