From e7fc1a40ed39c22c75f0b1753401658411b902a0 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Fri, 4 Apr 2025 14:46:43 +0200 Subject: [PATCH] Identify headers through inherited style Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index efd15d4d..5530bba0 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -264,6 +264,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label = paragraph.style.style_id name = paragraph.style.name + base_style_label = None + base_style_name = None + if base_style := getattr(paragraph.style, "base_style", None): + base_style_label = base_style.style_id + base_style_name = base_style.name if label is None: return "Normal", None @@ -277,6 +282,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return self._get_heading_and_level(label) if "heading" in name.lower(): return self._get_heading_and_level(name) + if base_style_label and "heading" in base_style_label.lower(): + return self._get_heading_and_level(base_style_label) + if base_style_name and "heading" in base_style_name.lower(): + return self._get_heading_and_level(base_style_name) return label, None