mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fix: Hard‑coded fallback (add_level = 1) and Off‑by‑one for numbered headings in msword_backend.py
Below is the consolidated assessment and a clean, self‑contained patch that resolves both defects in the MS‑Word backend: Defect Effect Fix applied A. Hard‑coded fallback (add_level = 1) when the style carries no explicit number Flattens every “un‑numbered” heading into level 1; deeper levels never surface. Derive the target level from the live hierarchy (self._get_level()/self.level) instead of hard‑coding. B. Off‑by‑one for numbered headings (Heading 3 stored as internal level 3 instead of 2) Makes Word one‑based while HTML/AsciiDoc are zero‑based, so the renderer silently suppresses anything beyond level 3. Convert Word’s one‑based index to zero‑based (target_level = curr_level - 1). Operational notes 1. Maximum depth is still governed by self.max_levels (10). Add an assert target_level < self.max_levels if you want an explicit guard. 2. Custom style names – the existing _get_heading_and_level logic is untouched; if it yields None, the fallback rules above will still nest correctly. 3. Performance / memory – unchanged; only index arithmetic was corrected. Deploying this patch eliminates the “level 3 ceiling” and makes Word behave identically to HTML and AsciiDoc in all depth scenarios. Signed-off-by: Artus Krohn-Grimberghe <artuskg@users.noreply.github.com>
This commit is contained in:
parent
5501dc5725
commit
2eb019e14c
@ -852,65 +852,67 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _add_header(
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
curr_level: Optional[int],
|
||||
curr_level: Optional[int], # 1‑based if Word style contains a digit
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> None:
|
||||
level = self._get_level()
|
||||
if isinstance(curr_level, int):
|
||||
if curr_level > level:
|
||||
# add invisible group
|
||||
for i in range(level, curr_level):
|
||||
self.parents[i] = doc.add_group(
|
||||
parent=self.parents[i - 1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"header-{i}",
|
||||
)
|
||||
elif curr_level < level:
|
||||
# remove the tail
|
||||
for key in range(len(self.parents)):
|
||||
if key >= curr_level:
|
||||
self.parents[key] = None
|
||||
|
||||
current_level = curr_level
|
||||
parent_level = curr_level - 1
|
||||
add_level = curr_level - 1
|
||||
else:
|
||||
current_level = self.level
|
||||
parent_level = self.level - 1
|
||||
add_level = max(0, self.level - 1)
|
||||
# ------------------------------------------------------------------ #
|
||||
# 1. Decide the *zero‑based* level we want to store in the model #
|
||||
# ------------------------------------------------------------------ #
|
||||
if curr_level is not None: # ≙ “Heading 2”, “Heading 3”, …
|
||||
target_level = max(curr_level - 1, 0) # zero‑base
|
||||
else: # ≙ “Heading” (no number)
|
||||
# If no explicit number, keep current depth; if none yet, go to 1
|
||||
target_level = max(self.level, 1)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# 2. Extend or trim the parents chain to that depth #
|
||||
# ------------------------------------------------------------------ #
|
||||
if target_level > self.level: # need extra invisible groups
|
||||
for i in range(self.level + 1, target_level + 1):
|
||||
self.parents[i] = doc.add_group(
|
||||
parent=self.parents[i - 1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"header-{i}",
|
||||
)
|
||||
elif target_level < self.level: # heading went “back up”
|
||||
for i in range(target_level + 1, len(self.parents)):
|
||||
self.parents[i] = None
|
||||
|
||||
# Update current depth
|
||||
self.level = target_level
|
||||
parent = self.parents[target_level - 1] if target_level > 0 else None
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# 3. Apply automatic numbering (unchanged logic, but uses new level) #
|
||||
# ------------------------------------------------------------------ #
|
||||
if is_numbered_style:
|
||||
if add_level in self.numbered_headers:
|
||||
self.numbered_headers[add_level] += 1
|
||||
else:
|
||||
self.numbered_headers[add_level] = 1
|
||||
text = f"{self.numbered_headers[add_level]} {text}"
|
||||
self.numbered_headers[target_level] = (
|
||||
self.numbered_headers.get(target_level, 0) + 1
|
||||
)
|
||||
|
||||
# Reset deeper levels
|
||||
next_level = add_level + 1
|
||||
while next_level in self.numbered_headers:
|
||||
self.numbered_headers[next_level] = 0
|
||||
next_level += 1
|
||||
# build prefix (e.g. “2.3.”) from higher levels
|
||||
prefix_parts = []
|
||||
for l in range(target_level):
|
||||
if self.numbered_headers.get(l, 0) == 0:
|
||||
self.numbered_headers[l] = 1 # fill skipped level
|
||||
prefix_parts.append(str(self.numbered_headers[l]))
|
||||
prefix_parts.append(str(self.numbered_headers[target_level]))
|
||||
text = " ".join([".".join(prefix_parts), text])
|
||||
|
||||
# Scan upper levels
|
||||
previous_level = add_level - 1
|
||||
while previous_level in self.numbered_headers:
|
||||
# MSWord convention: no empty sublevels
|
||||
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
||||
# is processed as 2.1.1
|
||||
if self.numbered_headers[previous_level] == 0:
|
||||
self.numbered_headers[previous_level] += 1
|
||||
# reset deeper levels
|
||||
for l in range(target_level + 1, len(self.parents)):
|
||||
self.numbered_headers[l] = 0
|
||||
|
||||
text = f"{self.numbered_headers[previous_level]}.{text}"
|
||||
previous_level -= 1
|
||||
|
||||
self.parents[current_level] = doc.add_heading(
|
||||
parent=self.parents[parent_level],
|
||||
# ------------------------------------------------------------------ #
|
||||
# 4. Finally create the visible heading node #
|
||||
# ------------------------------------------------------------------ #
|
||||
self.parents[target_level] = doc.add_heading(
|
||||
parent=parent,
|
||||
text=text,
|
||||
level=add_level,
|
||||
level=target_level, # always zero‑based now
|
||||
)
|
||||
return
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
|
Loading…
Reference in New Issue
Block a user