mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
Improve numbered list detection for msword docs
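This fixes numbered-list detection in MS Word documents: a list is now classified as numbered from the numFmt of its numbering definition, and item numbers come from a counter tracked per (numId, ilvl) pair instead of a single shared iterator. It fixes https://github.com/docling-project/docling/issues/2090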
This commit is contained in:
@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
self.level = 0
|
self.level = 0
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
# Track list counters per numId and ilvl
|
||||||
|
self.list_counters: dict[tuple[int, int], int] = {}
|
||||||
|
|
||||||
self.history: dict[str, Any] = {
|
self.history: dict[str, Any] = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return None, None # If the paragraph is not part of a list
|
return None, None # If the paragraph is not part of a list
|
||||||
|
|
||||||
|
def _get_list_counter(self, numid: int, ilvl: int) -> int:
    """Advance the item counter of one list level and return its new value.

    Counters live in ``self.list_counters`` and are keyed by the
    ``(numid, ilvl)`` pair, so every numbering instance and every
    indentation level within it is counted independently.

    Args:
        numid: Numbering instance id of the paragraph.
        ilvl: Indentation level of the paragraph within that numbering.

    Returns:
        The 1-based position of the current item for this pair.
    """
    slot = (numid, ilvl)
    # A pair that has not been seen yet starts from zero, so its first item is 1.
    position = self.list_counters.get(slot, 0) + 1
    self.list_counters[slot] = position
    return position
|
||||||
|
|
||||||
|
def _reset_list_counters_for_new_sequence(self, numid: int):
    """Zero every tracked counter that belongs to the given numbering id.

    Only entries already present in ``self.list_counters`` are touched;
    counters of other numbering ids keep their values and no new entries
    are created.

    Args:
        numid: Numbering instance id whose counters (at all levels) restart.
    """
    # Build the zeroed entries first, then apply them in one step; existing
    # keys are overwritten, so the dict never changes size.
    zeroed = {slot: 0 for slot in self.list_counters if slot[0] == numid}
    self.list_counters.update(zeroed)
|
||||||
|
|
||||||
|
def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
    """Check if a list is numbered based on its numFmt value.

    The lookup walks the document's numbering part:
    ``w:num[numId]`` -> (optional ``w:lvlOverride[ilvl]/w:lvl``) ->
    ``w:abstractNum[abstractNumId]/w:lvl[ilvl]`` -> ``w:numFmt``.
    A level definition supplied through ``w:lvlOverride`` on the ``w:num``
    element takes precedence over the one in the abstract definition.

    Args:
        docx_obj: Document whose package parts are searched for the
            numbering part.
        numId: Numbering instance id referenced by the paragraph.
        ilvl: Indentation level of the paragraph within that numbering.

    Returns:
        True when the resolved numFmt is one of the recognised numbered
        formats; False for any other format, for any missing element along
        the lookup chain, or when the lookup raises.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": w_ns}
    val_attr = f"{{{w_ns}}}val"
    # Formats that render as an ordered sequence. Anything else (notably
    # "bullet") is treated as not numbered.
    numbered_formats = {
        "decimal",
        "lowerRoman",
        "upperRoman",
        "lowerLetter",
        "upperLetter",
        "decimalZero",
    }

    try:
        # Access the numbering part of the document
        if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
            return False

        numbering_part = next(
            (
                part
                for part in docx_obj.part.package.parts
                if "numbering" in part.partname
            ),
            None,
        )
        if numbering_part is None:
            return False

        numbering_root = numbering_part.element

        # Find the numbering instance with the given numId
        num_element = numbering_root.find(
            f".//w:num[@w:numId='{numId}']", namespaces=namespaces
        )
        if num_element is None:
            return False

        # A numbering instance may redefine a level via w:lvlOverride/w:lvl;
        # when it does, that definition decides the format.
        override_lvl = num_element.find(
            f"w:lvlOverride[@w:ilvl='{ilvl}']/w:lvl", namespaces=namespaces
        )
        if override_lvl is not None:
            override_fmt = override_lvl.find(".//w:numFmt", namespaces=namespaces)
            if override_fmt is not None:
                return override_fmt.get(val_attr) in numbered_formats
            # Override without a numFmt: fall back to the abstract definition.

        # Get the abstractNumId from the num element
        abstract_num_id_elem = num_element.find(
            ".//w:abstractNumId", namespaces=namespaces
        )
        if abstract_num_id_elem is None:
            return False

        abstract_num_id = abstract_num_id_elem.get(val_attr)
        if abstract_num_id is None:
            return False

        # Find the abstract numbering definition
        abstract_num_element = numbering_root.find(
            f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']",
            namespaces=namespaces,
        )
        if abstract_num_element is None:
            return False

        # Find the level definition for the given ilvl
        lvl_element = abstract_num_element.find(
            f".//w:lvl[@w:ilvl='{ilvl}']", namespaces=namespaces
        )
        if lvl_element is None:
            return False

        # Get the numFmt element
        num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
        if num_fmt_element is None:
            return False

        return num_fmt_element.get(val_attr) in numbered_formats

    except Exception as e:
        # Best effort: a malformed numbering part must not abort conversion,
        # the list is simply treated as not numbered.
        _log.debug("Error determining if list is numbered: %s", e)
        return False
|
||||||
|
|
||||||
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
||||||
parts = self._split_text_and_number(style_label)
|
parts = self._split_text_and_number(style_label)
|
||||||
|
|
||||||
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
# Identify whether list is a numbered list or not
|
# Identify whether list is a numbered list or not
|
||||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
|
||||||
is_numbered = False
|
|
||||||
p_style_id, p_level = self._get_label_and_level(paragraph)
|
p_style_id, p_level = self._get_label_and_level(paragraph)
|
||||||
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
||||||
|
|
||||||
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
and ilevel is not None
|
and ilevel is not None
|
||||||
and p_style_id not in ["Title", "Heading"]
|
and p_style_id not in ["Title", "Heading"]
|
||||||
):
|
):
|
||||||
|
# Check if this is actually a numbered list by examining the numFmt
|
||||||
|
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
|
||||||
|
|
||||||
self._add_list_item(
|
self._add_list_item(
|
||||||
doc=doc,
|
doc=doc,
|
||||||
numid=numid,
|
numid=numid,
|
||||||
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if self._prev_numid() is None: # Open new list
|
if self._prev_numid() is None: # Open new list
|
||||||
self.level_at_new_list = level
|
self.level_at_new_list = level
|
||||||
|
|
||||||
|
# Reset counters for the new numbering sequence
|
||||||
|
self._reset_list_counters_for_new_sequence(numid)
|
||||||
|
|
||||||
self.parents[level] = doc.add_list_group(
|
self.parents[level] = doc.add_list_group(
|
||||||
name="list", parent=self.parents[level - 1]
|
name="list", parent=self.parents[level - 1]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set marker and enumerated arguments if this is an enumeration element.
|
# Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
counter = self._get_list_counter(numid, ilevel)
|
||||||
is_numbered = True
|
enum_marker = str(counter) + "."
|
||||||
|
else:
|
||||||
|
enum_marker = ""
|
||||||
self._add_formatted_list_item(
|
self._add_formatted_list_item(
|
||||||
doc, elements, enum_marker, is_numbered, level
|
doc, elements, enum_marker, is_numbered, level
|
||||||
)
|
)
|
||||||
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list + prev_indent + 1,
|
self.level_at_new_list + prev_indent + 1,
|
||||||
self.level_at_new_list + ilevel + 1,
|
self.level_at_new_list + ilevel + 1,
|
||||||
):
|
):
|
||||||
self.listIter = 0
|
|
||||||
self.parents[i] = doc.add_list_group(
|
self.parents[i] = doc.add_list_group(
|
||||||
name="list", parent=self.parents[i - 1]
|
name="list", parent=self.parents[i - 1]
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
counter = self._get_list_counter(numid, ilevel)
|
||||||
is_numbered = True
|
enum_marker = str(counter) + "."
|
||||||
|
else:
|
||||||
|
enum_marker = ""
|
||||||
self._add_formatted_list_item(
|
self._add_formatted_list_item(
|
||||||
doc,
|
doc,
|
||||||
elements,
|
elements,
|
||||||
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
counter = self._get_list_counter(numid, ilevel)
|
||||||
is_numbered = True
|
enum_marker = str(counter) + "."
|
||||||
|
else:
|
||||||
|
enum_marker = ""
|
||||||
self._add_formatted_list_item(
|
self._add_formatted_list_item(
|
||||||
doc,
|
doc,
|
||||||
elements,
|
elements,
|
||||||
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
is_numbered,
|
is_numbered,
|
||||||
self.level_at_new_list + ilevel,
|
self.level_at_new_list + ilevel,
|
||||||
)
|
)
|
||||||
self.listIter = 0
|
|
||||||
|
|
||||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
counter = self._get_list_counter(numid, ilevel)
|
||||||
is_numbered = True
|
enum_marker = str(counter) + "."
|
||||||
|
else:
|
||||||
|
enum_marker = ""
|
||||||
self._add_formatted_list_item(
|
self._add_formatted_list_item(
|
||||||
doc, elements, enum_marker, is_numbered, level - 1
|
doc, elements, enum_marker, is_numbered, level - 1
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -12,9 +12,9 @@ Paragraph 2.1.2
|
|||||||
|
|
||||||
#### Test 2:
|
#### Test 2:
|
||||||
|
|
||||||
- List item a
|
1. List item a
|
||||||
- List item b
|
2. List item b
|
||||||
- List item c
|
3. List item c
|
||||||
|
|
||||||
#### Test 3:
|
#### Test 3:
|
||||||
|
|
||||||
|
|||||||
@@ -18,9 +18,9 @@ To get started with swimming, first lay down in a water and try not to drown:
|
|||||||
|
|
||||||
Also, don’t forget:
|
Also, don’t forget:
|
||||||
|
|
||||||
- Wear sunglasses
|
1. Wear sunglasses
|
||||||
- Don’t forget to drink water
|
2. Don’t forget to drink water
|
||||||
- Use sun cream
|
3. Use sun cream
|
||||||
|
|
||||||
Hmm, what else…
|
Hmm, what else…
|
||||||
|
|
||||||
@@ -40,6 +40,6 @@ Here are some interesting things a respectful duck could eat:
|
|||||||
|
|
||||||
And let’s add another list in the end:
|
And let’s add another list in the end:
|
||||||
|
|
||||||
- Leaves
|
1. Leaves
|
||||||
- Berries
|
2. Berries
|
||||||
- Grain
|
3. Grain
|
||||||
Reference in New Issue
Block a user