mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix(docx): identifying numbered headers (#1231)
* Modifications to identify numbered headers

  Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Add style check

  Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
committed by
GitHub
parent
0974ba4e1c
commit
f739d0e4c5
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.max_levels: int = 10
|
||||
self.level_at_new_list: Optional[int] = None
|
||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||
self.numbered_headers: dict[int, int] = {}
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
@@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=None, label=DocItemLabel.TITLE, text=text
|
||||
)
|
||||
elif "Heading" in p_style_id:
|
||||
self.add_header(doc, p_level, text)
|
||||
style_element = getattr(paragraph.style, "element", None)
|
||||
if style_element:
|
||||
is_numbered_style = (
|
||||
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
||||
)
|
||||
else:
|
||||
is_numbered_style = False
|
||||
self.add_header(doc, p_level, text, is_numbered_style)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
||||
@@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
def add_header(
|
||||
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> None:
|
||||
level = self.get_level()
|
||||
if isinstance(curr_level, int):
|
||||
@@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if key >= curr_level:
|
||||
self.parents[key] = None
|
||||
|
||||
self.parents[curr_level] = doc.add_heading(
|
||||
parent=self.parents[curr_level - 1],
|
||||
text=text,
|
||||
level=curr_level,
|
||||
)
|
||||
current_level = curr_level
|
||||
parent_level = curr_level - 1
|
||||
add_level = curr_level
|
||||
else:
|
||||
self.parents[self.level] = doc.add_heading(
|
||||
parent=self.parents[self.level - 1],
|
||||
text=text,
|
||||
level=1,
|
||||
)
|
||||
current_level = self.level
|
||||
parent_level = self.level - 1
|
||||
add_level = 1
|
||||
|
||||
if is_numbered_style:
|
||||
if add_level in self.numbered_headers:
|
||||
self.numbered_headers[add_level] += 1
|
||||
else:
|
||||
self.numbered_headers[add_level] = 1
|
||||
text = f"{self.numbered_headers[add_level]} {text}"
|
||||
|
||||
# Reset deeper levels
|
||||
next_level = add_level + 1
|
||||
while next_level in self.numbered_headers:
|
||||
self.numbered_headers[next_level] = 0
|
||||
next_level += 1
|
||||
|
||||
# Scan upper levels
|
||||
previous_level = add_level - 1
|
||||
while previous_level in self.numbered_headers:
|
||||
# MSWord convention: no empty sublevels
|
||||
# E.g., a sub-sub-section with no preceding sub-section (nominally 2.0.1)
|
||||
# is numbered 2.1.1
|
||||
if self.numbered_headers[previous_level] == 0:
|
||||
self.numbered_headers[previous_level] += 1
|
||||
|
||||
text = f"{self.numbered_headers[previous_level]}.{text}"
|
||||
previous_level -= 1
|
||||
|
||||
self.parents[current_level] = doc.add_heading(
|
||||
parent=self.parents[parent_level],
|
||||
text=text,
|
||||
level=add_level,
|
||||
)
|
||||
return
|
||||
|
||||
def add_listitem(
|
||||
|
||||
Reference in New Issue
Block a user