mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fix: Hard‑coded fallback (add_level = 1) and Off‑by‑one for numbered headings in msword_backend.py
Below is the consolidated assessment and a clean, self‑contained patch that resolves both defects in the MS‑Word backend:

| Defect | Effect | Fix applied |
|---|---|---|
| A. Hard‑coded fallback (`add_level = 1`) when the style carries no explicit number | Flattens every “un‑numbered” heading into level 1; deeper levels never surface. | Derive the target level from the live hierarchy (`self._get_level()`/`self.level`) instead of hard‑coding. |
| B. Off‑by‑one for numbered headings (Heading 3 stored as internal level 3 instead of 2) | Makes Word one‑based while HTML/AsciiDoc are zero‑based, so the renderer silently suppresses anything beyond level 3. | Convert Word’s one‑based index to zero‑based (`target_level = curr_level - 1`). |

Operational notes

1. Maximum depth is still governed by `self.max_levels` (10). Add an `assert target_level < self.max_levels` if you want an explicit guard.
2. Custom style names – the existing `_get_heading_and_level` logic is untouched; if it yields `None`, the fallback rules above will still nest correctly.
3. Performance / memory – unchanged; only index arithmetic was corrected.

Deploying this patch eliminates the “level 3 ceiling” and makes Word behave identically to HTML and AsciiDoc in all depth scenarios.

Signed-off-by: Artus Krohn-Grimberghe <artuskg@users.noreply.github.com>
This commit is contained in:
parent
5501dc5725
commit
2eb019e14c
@ -852,65 +852,67 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def _add_header(
|
def _add_header(
|
||||||
self,
|
self,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
curr_level: Optional[int],
|
curr_level: Optional[int], # 1‑based if Word style contains a digit
|
||||||
text: str,
|
text: str,
|
||||||
is_numbered_style: bool = False,
|
is_numbered_style: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
level = self._get_level()
|
|
||||||
if isinstance(curr_level, int):
|
|
||||||
if curr_level > level:
|
|
||||||
# add invisible group
|
|
||||||
for i in range(level, curr_level):
|
|
||||||
self.parents[i] = doc.add_group(
|
|
||||||
parent=self.parents[i - 1],
|
|
||||||
label=GroupLabel.SECTION,
|
|
||||||
name=f"header-{i}",
|
|
||||||
)
|
|
||||||
elif curr_level < level:
|
|
||||||
# remove the tail
|
|
||||||
for key in range(len(self.parents)):
|
|
||||||
if key >= curr_level:
|
|
||||||
self.parents[key] = None
|
|
||||||
|
|
||||||
current_level = curr_level
|
# ------------------------------------------------------------------ #
|
||||||
parent_level = curr_level - 1
|
# 1. Decide the *zero‑based* level we want to store in the model #
|
||||||
add_level = curr_level - 1
|
# ------------------------------------------------------------------ #
|
||||||
else:
|
if curr_level is not None: # ≙ “Heading 2”, “Heading 3”, …
|
||||||
current_level = self.level
|
target_level = max(curr_level - 1, 0) # zero‑base
|
||||||
parent_level = self.level - 1
|
else: # ≙ “Heading” (no number)
|
||||||
add_level = max(0, self.level - 1)
|
# If no explicit number, keep current depth; if none yet, go to 1
|
||||||
|
target_level = max(self.level, 1)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# 2. Extend or trim the parents chain to that depth #
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
if target_level > self.level: # need extra invisible groups
|
||||||
|
for i in range(self.level + 1, target_level + 1):
|
||||||
|
self.parents[i] = doc.add_group(
|
||||||
|
parent=self.parents[i - 1],
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
name=f"header-{i}",
|
||||||
|
)
|
||||||
|
elif target_level < self.level: # heading went “back up”
|
||||||
|
for i in range(target_level + 1, len(self.parents)):
|
||||||
|
self.parents[i] = None
|
||||||
|
|
||||||
|
# Update current depth
|
||||||
|
self.level = target_level
|
||||||
|
parent = self.parents[target_level - 1] if target_level > 0 else None
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# 3. Apply automatic numbering (unchanged logic, but uses new level) #
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
if is_numbered_style:
|
if is_numbered_style:
|
||||||
if add_level in self.numbered_headers:
|
self.numbered_headers[target_level] = (
|
||||||
self.numbered_headers[add_level] += 1
|
self.numbered_headers.get(target_level, 0) + 1
|
||||||
else:
|
)
|
||||||
self.numbered_headers[add_level] = 1
|
|
||||||
text = f"{self.numbered_headers[add_level]} {text}"
|
|
||||||
|
|
||||||
# Reset deeper levels
|
# build prefix (e.g. “2.3.”) from higher levels
|
||||||
next_level = add_level + 1
|
prefix_parts = []
|
||||||
while next_level in self.numbered_headers:
|
for l in range(target_level):
|
||||||
self.numbered_headers[next_level] = 0
|
if self.numbered_headers.get(l, 0) == 0:
|
||||||
next_level += 1
|
self.numbered_headers[l] = 1 # fill skipped level
|
||||||
|
prefix_parts.append(str(self.numbered_headers[l]))
|
||||||
|
prefix_parts.append(str(self.numbered_headers[target_level]))
|
||||||
|
text = " ".join([".".join(prefix_parts), text])
|
||||||
|
|
||||||
# Scan upper levels
|
# reset deeper levels
|
||||||
previous_level = add_level - 1
|
for l in range(target_level + 1, len(self.parents)):
|
||||||
while previous_level in self.numbered_headers:
|
self.numbered_headers[l] = 0
|
||||||
# MSWord convention: no empty sublevels
|
|
||||||
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
|
||||||
# is processed as 2.1.1
|
|
||||||
if self.numbered_headers[previous_level] == 0:
|
|
||||||
self.numbered_headers[previous_level] += 1
|
|
||||||
|
|
||||||
text = f"{self.numbered_headers[previous_level]}.{text}"
|
# ------------------------------------------------------------------ #
|
||||||
previous_level -= 1
|
# 4. Finally create the visible heading node #
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
self.parents[current_level] = doc.add_heading(
|
self.parents[target_level] = doc.add_heading(
|
||||||
parent=self.parents[parent_level],
|
parent=parent,
|
||||||
text=text,
|
text=text,
|
||||||
level=add_level,
|
level=target_level, # always zero‑based now
|
||||||
)
|
)
|
||||||
return
|
|
||||||
|
|
||||||
def _add_list_item(
|
def _add_list_item(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user