Improve text parsing

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
2025-07-27 04:24:45 +00:00 · 2025-03-31 11:41:06 +02:00 · 2025-03-31 11:41:06 +02:00 · 76982a5b15
commit 76982a5b15
parent eb4d17bba5
1 changed file with 30 additions and 5 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -269,6 +269,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        for subt in element.iter():
            tag_name = etree.QName(subt).localname
            if tag_name == "t" and "math" not in subt.tag:
+                if isinstance(subt.text, str):
                    only_texts.append(subt.text)
                    texts_and_equations.append(subt.text)
            elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
@@ -276,12 +277,36 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                only_equations.append(latex_equation)
                texts_and_equations.append(latex_equation)

-        if "".join(only_texts).strip() != text.strip():
+        if len(only_equations) < 1:
+            return text, []
+
+        if (
+            re.sub(r"\s+", "", "".join(only_texts)).strip()
+            != re.sub(r"\s+", "", text).strip()
+        ):
            # If we are not able to reconstruct the initial raw text
            # do not try to parse equations and return the original
            return text, []

-        return "".join(texts_and_equations), only_equations
+        # Insert equations into original text
+        # This is done to preserve white space structure
+        output_text = ""
+        init_i = 0
+        for i_substr, substr in enumerate(texts_and_equations):
+            if substr not in text:
+                if i_substr > 0:
+                    i_text_before = text[init_i:].find(
+                        texts_and_equations[i_substr - 1]
+                    )
+                    output_text += text[init_i:][
+                        : i_text_before + len(texts_and_equations[i_substr - 1])
+                    ]
+                    init_i += i_text_before + len(texts_and_equations[i_substr - 1])
+                output_text += substr
+                if only_equations.index(substr) == len(only_equations) - 1:
+                    output_text += text[init_i:]
+
+        return output_text, only_equations

    def handle_text_elements(
        self,
@@ -348,7 +373,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            )
        elif "Heading" in p_style_id:
            style_element = getattr(paragraph.style, "element", None)
-            if style_element:
+            if style_element is not None:
                is_numbered_style = (
                    "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
                )