Adding new LaTeX symbols, simplifying how equations are added to text

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-04-03 17:57:30 +02:00
parent 0499cd1c1e
commit 207cd78a26
7 changed files with 62 additions and 38 deletions

View File

@ -215,6 +215,9 @@ FUNC = {
"coth": "\\coth({fe})", "coth": "\\coth({fe})",
"sec": "\\sec({fe})", "sec": "\\sec({fe})",
"csc": "\\csc({fe})", "csc": "\\csc({fe})",
"mod": "\\mod {fe}",
"max": "\\max({fe})",
"min": "\\min({fe})",
} }
FUNC_PLACE = "{fe}" FUNC_PLACE = "{fe}"

View File

@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method):
if FUNC.get(t): if FUNC.get(t):
latex_chars.append(FUNC[t]) latex_chars.append(FUNC[t])
else: else:
raise NotSupport("Not support func %s" % t) print(f"Function not supported, will default to text: {t}")
else: if isinstance(t, str):
latex_chars.append(t)
elif isinstance(t, str):
latex_chars.append(t) latex_chars.append(t)
t = BLANK.join(latex_chars) t = BLANK.join(latex_chars)
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@ -415,7 +417,9 @@ class oMath2Latex(Tag2Method):
""" """
_str = [] _str = []
_base_str = [] _base_str = []
for s in elm.findtext("./{0}t".format(OMML_NS)): found_text = elm.findtext("./{0}t".format(OMML_NS))
if found_text:
for s in found_text:
out_latex_str = self.process_unicode(s) out_latex_str = self.process_unicode(s)
_str.append(out_latex_str) _str.append(out_latex_str)
_base_str.append(s) _base_str.append(s)

View File

@ -58,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list: Optional[int] = None self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {} self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {} self.numbered_headers: dict[int, int] = {}
self.equation_bookends: str = "<eq>{EQ}</eq>"
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@ -356,9 +357,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
only_texts.append(subt.text) only_texts.append(subt.text)
texts_and_equations.append(subt.text) texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag: elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)) latex_equation = str(oMath2Latex(subt)).strip()
only_equations.append(latex_equation) if len(latex_equation) > 0:
texts_and_equations.append(latex_equation) only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
if len(only_equations) < 1: if len(only_equations) < 1:
return text, [] return text, []
@ -373,21 +379,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Insert equations into original text # Insert equations into original text
# This is done to preserve white space structure # This is done to preserve white space structure
output_text = "" output_text = text[:]
init_i = 0 init_i = 0
for i_substr, substr in enumerate(texts_and_equations): for i_substr, substr in enumerate(texts_and_equations):
if substr not in text: if len(substr) == 0:
continue
if substr in output_text[init_i:]:
init_i += output_text[init_i:].find(substr) + len(substr)
else:
if i_substr > 0: if i_substr > 0:
i_text_before = text[init_i:].find( output_text = output_text[:init_i] + substr + output_text[init_i:]
texts_and_equations[i_substr - 1] init_i += len(substr)
) else:
output_text += text[init_i:][ output_text = substr + output_text
: i_text_before + len(texts_and_equations[i_substr - 1])
]
init_i += i_text_before + len(texts_and_equations[i_substr - 1])
output_text += substr
if only_equations.index(substr) == len(only_equations) - 1:
output_text += text[init_i:]
return output_text, only_equations return output_text, only_equations
@ -479,13 +484,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._add_header(doc, p_level, text, is_numbered_style) self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0: elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0: if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
# Standalone equation # Standalone equation
level = self._get_level() level = self._get_level()
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=self.parents[level - 1], parent=self.parents[level - 1],
text=text, text=text.replace("<eq>", "").replace("</eq>", ""),
) )
else: else:
# Inline equation # Inline equation
@ -498,8 +503,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if len(text_tmp) == 0: if len(text_tmp) == 0:
break break
pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0] split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1]
pre_eq_text = split_text_tmp[0]
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
if len(pre_eq_text) > 0: if len(pre_eq_text) > 0:
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
@ -509,8 +517,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=inline_equation, parent=inline_equation,
text=eq, text=eq.replace("<eq>", "").replace("</eq>", ""),
) )
if len(text_tmp) > 0: if len(text_tmp) > 0:
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,

View File

@ -76,15 +76,23 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown() pred_md: str = doc.export_to_markdown()
<<<<<<< HEAD
assert verify_export( assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md" ), "export to md"
=======
assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text)
pred_itxt: str = doc._export_to_indented_text( pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False max_text_len=70, explicit_tables=False
) )
assert verify_export( assert verify_export(
<<<<<<< HEAD
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
=======
pred_itxt, str(gt_path) + ".itxt", GENERATE
>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text)
), "export to indented-text" ), "export to indented-text"
assert verify_document( assert verify_document(