mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Flexibilize heading detection
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
d5431577f0
commit
e535209c75
@ -234,33 +234,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return None, None # If the paragraph is not part of a list
|
return None, None # If the paragraph is not part of a list
|
||||||
|
|
||||||
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
||||||
if paragraph.style is None:
|
parts = self.split_text_and_number(style_label)
|
||||||
return "Normal", None
|
|
||||||
label = paragraph.style.style_id
|
|
||||||
if label is None:
|
|
||||||
return "Normal", None
|
|
||||||
if ":" in label:
|
|
||||||
parts = label.split(":")
|
|
||||||
|
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
return parts[0], self.str_to_int(parts[1], None)
|
|
||||||
|
|
||||||
parts = self.split_text_and_number(label)
|
|
||||||
|
|
||||||
if "Heading" in label and len(parts) == 2:
|
|
||||||
parts.sort()
|
parts.sort()
|
||||||
label_str: str = ""
|
label_str: str = ""
|
||||||
label_level: Optional[int] = 0
|
label_level: Optional[int] = 0
|
||||||
if parts[0] == "Heading":
|
if parts[0].strip().lower() == "heading":
|
||||||
label_str = parts[0]
|
label_str = "Heading"
|
||||||
label_level = self.str_to_int(parts[1], None)
|
label_level = self.str_to_int(parts[1], None)
|
||||||
if parts[1] == "Heading":
|
if parts[1].strip().lower() == "heading":
|
||||||
label_str = parts[1]
|
label_str = "Heading"
|
||||||
label_level = self.str_to_int(parts[0], None)
|
label_level = self.str_to_int(parts[0], None)
|
||||||
return label_str, label_level
|
return label_str, label_level
|
||||||
else:
|
|
||||||
return label, None
|
return style_label, None
|
||||||
|
|
||||||
|
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
||||||
|
if paragraph.style is None:
|
||||||
|
return "Normal", None
|
||||||
|
|
||||||
|
label = paragraph.style.style_id
|
||||||
|
name = paragraph.style.name
|
||||||
|
|
||||||
|
if label is None:
|
||||||
|
return "Normal", None
|
||||||
|
|
||||||
|
if ":" in label:
|
||||||
|
parts = label.split(":")
|
||||||
|
if len(parts) == 2:
|
||||||
|
return parts[0], self.str_to_int(parts[1], None)
|
||||||
|
|
||||||
|
if "heading" in label.lower():
|
||||||
|
return self.get_heading_and_level(label)
|
||||||
|
if "heading" in name.lower():
|
||||||
|
return self.get_heading_and_level(name)
|
||||||
|
|
||||||
|
return label, None
|
||||||
|
|
||||||
def handle_equations_in_text(self, element, text):
|
def handle_equations_in_text(self, element, text):
|
||||||
only_texts = []
|
only_texts = []
|
||||||
|
Loading…
Reference in New Issue
Block a user