Adding new LaTeX symbols, simplifying how equations are added to text

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-04-03 17:57:30 +02:00
parent 0499cd1c1e
commit 207cd78a26
7 changed files with 62 additions and 38 deletions

View File

@ -215,6 +215,9 @@ FUNC = {
"coth": "\\coth({fe})", "coth": "\\coth({fe})",
"sec": "\\sec({fe})", "sec": "\\sec({fe})",
"csc": "\\csc({fe})", "csc": "\\csc({fe})",
"mod": "\\mod {fe}",
"max": "\\max({fe})",
"min": "\\min({fe})",
} }
FUNC_PLACE = "{fe}" FUNC_PLACE = "{fe}"

View File

@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method):
if FUNC.get(t): if FUNC.get(t):
latex_chars.append(FUNC[t]) latex_chars.append(FUNC[t])
else: else:
raise NotSupport("Not support func %s" % t) print(f"Function not supported, will default to text: {t}")
else: if isinstance(t, str):
latex_chars.append(t)
elif isinstance(t, str):
latex_chars.append(t) latex_chars.append(t)
t = BLANK.join(latex_chars) t = BLANK.join(latex_chars)
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@ -415,10 +417,12 @@ class oMath2Latex(Tag2Method):
""" """
_str = [] _str = []
_base_str = [] _base_str = []
for s in elm.findtext("./{0}t".format(OMML_NS)): found_text = elm.findtext("./{0}t".format(OMML_NS))
out_latex_str = self.process_unicode(s) if found_text:
_str.append(out_latex_str) for s in found_text:
_base_str.append(s) out_latex_str = self.process_unicode(s)
_str.append(out_latex_str)
_base_str.append(s)
proc_str = escape_latex(BLANK.join(_str)) proc_str = escape_latex(BLANK.join(_str))
base_proc_str = BLANK.join(_base_str) base_proc_str = BLANK.join(_base_str)

View File

@ -58,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list: Optional[int] = None self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {} self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {} self.numbered_headers: dict[int, int] = {}
self.equation_bookends: str = "<eq>{EQ}</eq>"
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@ -356,9 +357,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
only_texts.append(subt.text) only_texts.append(subt.text)
texts_and_equations.append(subt.text) texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag: elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)) latex_equation = str(oMath2Latex(subt)).strip()
only_equations.append(latex_equation) if len(latex_equation) > 0:
texts_and_equations.append(latex_equation) only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
if len(only_equations) < 1: if len(only_equations) < 1:
return text, [] return text, []
@ -373,21 +379,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Insert equations into original text # Insert equations into original text
# This is done to preserve white space structure # This is done to preserve white space structure
output_text = "" output_text = text[:]
init_i = 0 init_i = 0
for i_substr, substr in enumerate(texts_and_equations): for i_substr, substr in enumerate(texts_and_equations):
if substr not in text: if len(substr) == 0:
continue
if substr in output_text[init_i:]:
init_i += output_text[init_i:].find(substr) + len(substr)
else:
if i_substr > 0: if i_substr > 0:
i_text_before = text[init_i:].find( output_text = output_text[:init_i] + substr + output_text[init_i:]
texts_and_equations[i_substr - 1] init_i += len(substr)
) else:
output_text += text[init_i:][ output_text = substr + output_text
: i_text_before + len(texts_and_equations[i_substr - 1])
]
init_i += i_text_before + len(texts_and_equations[i_substr - 1])
output_text += substr
if only_equations.index(substr) == len(only_equations) - 1:
output_text += text[init_i:]
return output_text, only_equations return output_text, only_equations
@ -479,13 +484,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._add_header(doc, p_level, text, is_numbered_style) self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0: elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0: if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
# Standalone equation # Standalone equation
level = self._get_level() level = self._get_level()
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=self.parents[level - 1], parent=self.parents[level - 1],
text=text, text=text.replace("<eq>", "").replace("</eq>", ""),
) )
else: else:
# Inline equation # Inline equation
@ -498,8 +503,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if len(text_tmp) == 0: if len(text_tmp) == 0:
break break
pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0] split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1]
pre_eq_text = split_text_tmp[0]
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
if len(pre_eq_text) > 0: if len(pre_eq_text) > 0:
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
@ -509,8 +517,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=inline_equation, parent=inline_equation,
text=eq, text=eq.replace("<eq>", "").replace("</eq>", ""),
) )
if len(text_tmp) > 0: if len(text_tmp) > 0:
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,

View File

@ -196,8 +196,8 @@
"content_layer": "body", "content_layer": "body",
"label": "formula", "label": "formula",
"prov": [], "prov": [],
"orig": "A= \\pi r^{2} ", "orig": "A= \\pi r^{2}",
"text": "A= \\pi r^{2} " "text": "A= \\pi r^{2}"
}, },
{ {
"self_ref": "#/texts/2", "self_ref": "#/texts/2",
@ -352,8 +352,8 @@
"content_layer": "body", "content_layer": "body",
"label": "formula", "label": "formula",
"prov": [], "prov": [],
"orig": "A= \\pi r^{2} ", "orig": "A= \\pi r^{2}",
"text": "A= \\pi r^{2} " "text": "A= \\pi r^{2}"
}, },
{ {
"self_ref": "#/texts/15", "self_ref": "#/texts/15",
@ -532,8 +532,8 @@
"content_layer": "body", "content_layer": "body",
"label": "formula", "label": "formula",
"prov": [], "prov": [],
"orig": "A= \\pi r^{2} ", "orig": "A= \\pi r^{2}",
"text": "A= \\pi r^{2} " "text": "A= \\pi r^{2}"
}, },
{ {
"self_ref": "#/texts/30", "self_ref": "#/texts/30",

View File

@ -1,4 +1,4 @@
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$ $$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$
@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$ $$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$
@ -22,7 +22,7 @@ $$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$ $$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$

View File

@ -76,15 +76,23 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown() pred_md: str = doc.export_to_markdown()
<<<<<<< HEAD
assert verify_export( assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md" ), "export to md"
=======
assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text)
pred_itxt: str = doc._export_to_indented_text( pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False max_text_len=70, explicit_tables=False
) )
assert verify_export( assert verify_export(
<<<<<<< HEAD
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
=======
pred_itxt, str(gt_path) + ".itxt", GENERATE
>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text)
), "export to indented-text" ), "export to indented-text"
assert verify_document( assert verify_document(