diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py
index 280358be..03234788 100644
--- a/docling/backend/docx/latex/latex_dict.py
+++ b/docling/backend/docx/latex/latex_dict.py
@@ -215,6 +215,9 @@ FUNC = {
"coth": "\\coth({fe})",
"sec": "\\sec({fe})",
"csc": "\\csc({fe})",
+ "mod": "\\mod {fe}",
+ "max": "\\max({fe})",
+ "min": "\\min({fe})",
}
FUNC_PLACE = "{fe}"
diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py
index add0de71..b2d5f900 100644
--- a/docling/backend/docx/latex/omml.py
+++ b/docling/backend/docx/latex/omml.py
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
On 23/01/2025
"""
+import logging
+
import lxml.etree as ET
from pylatexenc.latexencode import UnicodeToLatexEncoder
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
+_log = logging.getLogger(__name__)
+
def load(stream):
tree = ET.parse(stream)
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
if FUNC.get(t):
latex_chars.append(FUNC[t])
else:
- raise NotSupport("Not support func %s" % t)
- else:
+ _log.warning("Function not supported, will default to text: %s", t)
+ if isinstance(t, str):
+ latex_chars.append(t)
+ elif isinstance(t, str):
latex_chars.append(t)
t = BLANK.join(latex_chars)
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
out_latex_str = self.u.unicode_to_latex(s)
- # print(s, out_latex_str)
-
if (
s.startswith("{") is False
and out_latex_str.startswith("{")
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
):
out_latex_str = f" {out_latex_str[1:-1]} "
- # print(s, out_latex_str)
-
if "ensuremath" in out_latex_str:
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
out_latex_str = out_latex_str.replace("}", " ")
- # print(s, out_latex_str)
-
if out_latex_str.strip().startswith("\\text"):
out_latex_str = f" \\text{{{out_latex_str}}} "
- # print(s, out_latex_str)
-
return out_latex_str
def do_r(self, elm):
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
"""
_str = []
_base_str = []
- for s in elm.findtext("./{0}t".format(OMML_NS)):
- out_latex_str = self.process_unicode(s)
- _str.append(out_latex_str)
- _base_str.append(s)
+ found_text = elm.findtext("./{0}t".format(OMML_NS))
+ if found_text:
+ for s in found_text:
+ out_latex_str = self.process_unicode(s)
+ _str.append(out_latex_str)
+ _base_str.append(s)
proc_str = escape_latex(BLANK.join(_str))
base_proc_str = BLANK.join(_base_str)
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index d6b73f70..5530bba0 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -58,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {}
+ self.equation_bookends: str = "{EQ}"
for i in range(-1, self.max_levels):
self.parents[i] = None
@@ -263,6 +264,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label = paragraph.style.style_id
name = paragraph.style.name
+ base_style_label = None
+ base_style_name = None
+ if base_style := getattr(paragraph.style, "base_style", None):
+ base_style_label = base_style.style_id
+ base_style_name = base_style.name
if label is None:
return "Normal", None
@@ -276,6 +282,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return self._get_heading_and_level(label)
if "heading" in name.lower():
return self._get_heading_and_level(name)
+ if base_style_label and "heading" in base_style_label.lower():
+ return self._get_heading_and_level(base_style_label)
+ if base_style_name and "heading" in base_style_name.lower():
+ return self._get_heading_and_level(base_style_name)
return label, None
@@ -356,9 +366,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
only_texts.append(subt.text)
texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
- latex_equation = str(oMath2Latex(subt))
- only_equations.append(latex_equation)
- texts_and_equations.append(latex_equation)
+ latex_equation = str(oMath2Latex(subt)).strip()
+ if len(latex_equation) > 0:
+ only_equations.append(
+ self.equation_bookends.format(EQ=latex_equation)
+ )
+ texts_and_equations.append(
+ self.equation_bookends.format(EQ=latex_equation)
+ )
if len(only_equations) < 1:
return text, []
@@ -373,21 +388,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Insert equations into original text
# This is done to preserve white space structure
- output_text = ""
+ output_text = text[:]
init_i = 0
for i_substr, substr in enumerate(texts_and_equations):
- if substr not in text:
+ if len(substr) == 0:
+ continue
+
+ if substr in output_text[init_i:]:
+ init_i += output_text[init_i:].find(substr) + len(substr)
+ else:
if i_substr > 0:
- i_text_before = text[init_i:].find(
- texts_and_equations[i_substr - 1]
- )
- output_text += text[init_i:][
- : i_text_before + len(texts_and_equations[i_substr - 1])
- ]
- init_i += i_text_before + len(texts_and_equations[i_substr - 1])
- output_text += substr
- if only_equations.index(substr) == len(only_equations) - 1:
- output_text += text[init_i:]
+ output_text = output_text[:init_i] + substr + output_text[init_i:]
+ init_i += len(substr)
+ else:
+ output_text = substr + output_text
return output_text, only_equations
@@ -479,13 +493,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0:
- if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
+ if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
# Standalone equation
level = self._get_level()
doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
- text=text,
+ text=text.replace("", "").replace("", ""),
)
else:
# Inline equation
@@ -498,8 +512,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if len(text_tmp) == 0:
break
- pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0]
- text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1]
+ split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
+
+ pre_eq_text = split_text_tmp[0]
+ text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
+
if len(pre_eq_text) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
@@ -509,8 +526,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc.add_text(
label=DocItemLabel.FORMULA,
parent=inline_equation,
- text=eq,
+ text=eq.replace("", "").replace("", ""),
)
+
if len(text_tmp) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt
index 1412074d..c1c68fb6 100644
--- a/tests/data/groundtruth/docling_v2/equations.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt
@@ -1,7 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: inline: group group
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
- item-3 at level 2: formula: A= \pi r^{2}
+ item-3 at level 2: formula: A= \pi r^{2}
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-5 at level 1: paragraph:
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
@@ -15,7 +15,7 @@ item-0 at level 0: unspecified: group _root_
item-14 at level 1: paragraph:
item-15 at level 1: inline: group group
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
- item-17 at level 2: formula: A= \pi r^{2}
+ item-17 at level 2: formula: A= \pi r^{2}
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-19 at level 1: paragraph:
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
@@ -31,7 +31,7 @@ item-0 at level 0: unspecified: group _root_
item-30 at level 1: paragraph:
item-31 at level 1: inline: group group
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
- item-33 at level 2: formula: A= \pi r^{2}
+ item-33 at level 2: formula: A= \pi r^{2}
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-35 at level 1: paragraph:
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json
index 43247d6f..0d57b78f 100644
--- a/tests/data/groundtruth/docling_v2/equations.docx.json
+++ b/tests/data/groundtruth/docling_v2/equations.docx.json
@@ -196,8 +196,8 @@
"content_layer": "body",
"label": "formula",
"prov": [],
- "orig": "A= \\pi r^{2} ",
- "text": "A= \\pi r^{2} "
+ "orig": "A= \\pi r^{2}",
+ "text": "A= \\pi r^{2}"
},
{
"self_ref": "#/texts/2",
@@ -352,8 +352,8 @@
"content_layer": "body",
"label": "formula",
"prov": [],
- "orig": "A= \\pi r^{2} ",
- "text": "A= \\pi r^{2} "
+ "orig": "A= \\pi r^{2}",
+ "text": "A= \\pi r^{2}"
},
{
"self_ref": "#/texts/15",
@@ -532,8 +532,8 @@
"content_layer": "body",
"label": "formula",
"prov": [],
- "orig": "A= \\pi r^{2} ",
- "text": "A= \\pi r^{2} "
+ "orig": "A= \\pi r^{2}",
+ "text": "A= \\pi r^{2}"
},
{
"self_ref": "#/texts/30",
diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md
index a8253ebf..578d5eb0 100644
--- a/tests/data/groundtruth/docling_v2/equations.docx.md
+++ b/tests/data/groundtruth/docling_v2/equations.docx.md
@@ -1,4 +1,4 @@
-This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$
@@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
-This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$
@@ -22,7 +22,7 @@ $$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
-This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$