mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 05:24:28 +00:00
Adding new LaTeX symbols, simplifying how equations are added to text
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
71148eb381
commit
64a7888092
@ -215,6 +215,9 @@ FUNC = {
|
|||||||
"coth": "\\coth({fe})",
|
"coth": "\\coth({fe})",
|
||||||
"sec": "\\sec({fe})",
|
"sec": "\\sec({fe})",
|
||||||
"csc": "\\csc({fe})",
|
"csc": "\\csc({fe})",
|
||||||
|
"mod": "\\mod {fe}",
|
||||||
|
"max": "\\max({fe})",
|
||||||
|
"min": "\\min({fe})",
|
||||||
}
|
}
|
||||||
|
|
||||||
FUNC_PLACE = "{fe}"
|
FUNC_PLACE = "{fe}"
|
||||||
|
@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method):
|
|||||||
if FUNC.get(t):
|
if FUNC.get(t):
|
||||||
latex_chars.append(FUNC[t])
|
latex_chars.append(FUNC[t])
|
||||||
else:
|
else:
|
||||||
raise NotSupport("Not support func %s" % t)
|
print(f"Function not supported, will default to text: {t}")
|
||||||
else:
|
if isinstance(t, str):
|
||||||
|
latex_chars.append(t)
|
||||||
|
elif isinstance(t, str):
|
||||||
latex_chars.append(t)
|
latex_chars.append(t)
|
||||||
t = BLANK.join(latex_chars)
|
t = BLANK.join(latex_chars)
|
||||||
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
||||||
@ -415,7 +417,9 @@ class oMath2Latex(Tag2Method):
|
|||||||
"""
|
"""
|
||||||
_str = []
|
_str = []
|
||||||
_base_str = []
|
_base_str = []
|
||||||
for s in elm.findtext("./{0}t".format(OMML_NS)):
|
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
||||||
|
if found_text:
|
||||||
|
for s in found_text:
|
||||||
out_latex_str = self.process_unicode(s)
|
out_latex_str = self.process_unicode(s)
|
||||||
_str.append(out_latex_str)
|
_str.append(out_latex_str)
|
||||||
_base_str.append(s)
|
_base_str.append(s)
|
||||||
|
@ -54,6 +54,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list: Optional[int] = None
|
self.level_at_new_list: Optional[int] = None
|
||||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||||
self.numbered_headers: dict[int, int] = {}
|
self.numbered_headers: dict[int, int] = {}
|
||||||
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
@ -284,9 +285,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
only_texts.append(subt.text)
|
only_texts.append(subt.text)
|
||||||
texts_and_equations.append(subt.text)
|
texts_and_equations.append(subt.text)
|
||||||
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
||||||
latex_equation = str(oMath2Latex(subt))
|
latex_equation = str(oMath2Latex(subt)).strip()
|
||||||
only_equations.append(latex_equation)
|
if len(latex_equation) > 0:
|
||||||
texts_and_equations.append(latex_equation)
|
only_equations.append(
|
||||||
|
self.equation_bookends.format(EQ=latex_equation)
|
||||||
|
)
|
||||||
|
texts_and_equations.append(
|
||||||
|
self.equation_bookends.format(EQ=latex_equation)
|
||||||
|
)
|
||||||
|
|
||||||
if len(only_equations) < 1:
|
if len(only_equations) < 1:
|
||||||
return text, []
|
return text, []
|
||||||
@ -301,21 +307,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Insert equations into original text
|
# Insert equations into original text
|
||||||
# This is done to preserve white space structure
|
# This is done to preserve white space structure
|
||||||
output_text = ""
|
output_text = text[:]
|
||||||
init_i = 0
|
init_i = 0
|
||||||
for i_substr, substr in enumerate(texts_and_equations):
|
for i_substr, substr in enumerate(texts_and_equations):
|
||||||
if substr not in text:
|
if len(substr) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if substr in output_text[init_i:]:
|
||||||
|
init_i += output_text[init_i:].find(substr) + len(substr)
|
||||||
|
else:
|
||||||
if i_substr > 0:
|
if i_substr > 0:
|
||||||
i_text_before = text[init_i:].find(
|
output_text = output_text[:init_i] + substr + output_text[init_i:]
|
||||||
texts_and_equations[i_substr - 1]
|
init_i += len(substr)
|
||||||
)
|
else:
|
||||||
output_text += text[init_i:][
|
output_text = substr + output_text
|
||||||
: i_text_before + len(texts_and_equations[i_substr - 1])
|
|
||||||
]
|
|
||||||
init_i += i_text_before + len(texts_and_equations[i_substr - 1])
|
|
||||||
output_text += substr
|
|
||||||
if only_equations.index(substr) == len(only_equations) - 1:
|
|
||||||
output_text += text[init_i:]
|
|
||||||
|
|
||||||
return output_text, only_equations
|
return output_text, only_equations
|
||||||
|
|
||||||
@ -393,13 +398,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.add_header(doc, p_level, text, is_numbered_style)
|
self.add_header(doc, p_level, text, is_numbered_style)
|
||||||
|
|
||||||
elif len(equations) > 0:
|
elif len(equations) > 0:
|
||||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
|
||||||
# Standalone equation
|
# Standalone equation
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
text=text,
|
text=text.replace("<eq>", "").replace("</eq>", ""),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Inline equation
|
# Inline equation
|
||||||
@ -412,8 +417,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if len(text_tmp) == 0:
|
if len(text_tmp) == 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0]
|
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
|
||||||
text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1]
|
|
||||||
|
pre_eq_text = split_text_tmp[0]
|
||||||
|
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
||||||
|
|
||||||
if len(pre_eq_text) > 0:
|
if len(pre_eq_text) > 0:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -423,8 +431,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=inline_equation,
|
parent=inline_equation,
|
||||||
text=eq,
|
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(text_tmp) > 0:
|
if len(text_tmp) > 0:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
@ -76,13 +76,13 @@ def test_e2e_docx_conversions():
|
|||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
|
||||||
|
|
||||||
pred_itxt: str = doc._export_to_indented_text(
|
pred_itxt: str = doc._export_to_indented_text(
|
||||||
max_text_len=70, explicit_tables=False
|
max_text_len=70, explicit_tables=False
|
||||||
)
|
)
|
||||||
assert verify_export(
|
assert verify_export(
|
||||||
pred_itxt, str(gt_path) + ".itxt"
|
pred_itxt, str(gt_path) + ".itxt", GENERATE
|
||||||
), "export to indented-text"
|
), "export to indented-text"
|
||||||
|
|
||||||
assert verify_document(
|
assert verify_document(
|
||||||
|
Loading…
Reference in New Issue
Block a user