mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Adding new LaTeX symbols, simplifying how equations are added to text
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
71148eb381
commit
64a7888092
@ -215,6 +215,9 @@ FUNC = {
|
|||||||
"coth": "\\coth({fe})",
|
"coth": "\\coth({fe})",
|
||||||
"sec": "\\sec({fe})",
|
"sec": "\\sec({fe})",
|
||||||
"csc": "\\csc({fe})",
|
"csc": "\\csc({fe})",
|
||||||
|
"mod": "\\mod {fe}",
|
||||||
|
"max": "\\max({fe})",
|
||||||
|
"min": "\\min({fe})",
|
||||||
}
|
}
|
||||||
|
|
||||||
FUNC_PLACE = "{fe}"
|
FUNC_PLACE = "{fe}"
|
||||||
|
@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method):
|
|||||||
if FUNC.get(t):
|
if FUNC.get(t):
|
||||||
latex_chars.append(FUNC[t])
|
latex_chars.append(FUNC[t])
|
||||||
else:
|
else:
|
||||||
raise NotSupport("Not support func %s" % t)
|
print(f"Function not supported, will default to text: {t}")
|
||||||
else:
|
if isinstance(t, str):
|
||||||
|
latex_chars.append(t)
|
||||||
|
elif isinstance(t, str):
|
||||||
latex_chars.append(t)
|
latex_chars.append(t)
|
||||||
t = BLANK.join(latex_chars)
|
t = BLANK.join(latex_chars)
|
||||||
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
||||||
@ -415,10 +417,12 @@ class oMath2Latex(Tag2Method):
|
|||||||
"""
|
"""
|
||||||
_str = []
|
_str = []
|
||||||
_base_str = []
|
_base_str = []
|
||||||
for s in elm.findtext("./{0}t".format(OMML_NS)):
|
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
||||||
out_latex_str = self.process_unicode(s)
|
if found_text:
|
||||||
_str.append(out_latex_str)
|
for s in found_text:
|
||||||
_base_str.append(s)
|
out_latex_str = self.process_unicode(s)
|
||||||
|
_str.append(out_latex_str)
|
||||||
|
_base_str.append(s)
|
||||||
|
|
||||||
proc_str = escape_latex(BLANK.join(_str))
|
proc_str = escape_latex(BLANK.join(_str))
|
||||||
base_proc_str = BLANK.join(_base_str)
|
base_proc_str = BLANK.join(_base_str)
|
||||||
|
@ -54,6 +54,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list: Optional[int] = None
|
self.level_at_new_list: Optional[int] = None
|
||||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||||
self.numbered_headers: dict[int, int] = {}
|
self.numbered_headers: dict[int, int] = {}
|
||||||
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
@ -284,9 +285,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
only_texts.append(subt.text)
|
only_texts.append(subt.text)
|
||||||
texts_and_equations.append(subt.text)
|
texts_and_equations.append(subt.text)
|
||||||
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
||||||
latex_equation = str(oMath2Latex(subt))
|
latex_equation = str(oMath2Latex(subt)).strip()
|
||||||
only_equations.append(latex_equation)
|
if len(latex_equation) > 0:
|
||||||
texts_and_equations.append(latex_equation)
|
only_equations.append(
|
||||||
|
self.equation_bookends.format(EQ=latex_equation)
|
||||||
|
)
|
||||||
|
texts_and_equations.append(
|
||||||
|
self.equation_bookends.format(EQ=latex_equation)
|
||||||
|
)
|
||||||
|
|
||||||
if len(only_equations) < 1:
|
if len(only_equations) < 1:
|
||||||
return text, []
|
return text, []
|
||||||
@ -301,21 +307,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Insert equations into original text
|
# Insert equations into original text
|
||||||
# This is done to preserve white space structure
|
# This is done to preserve white space structure
|
||||||
output_text = ""
|
output_text = text[:]
|
||||||
init_i = 0
|
init_i = 0
|
||||||
for i_substr, substr in enumerate(texts_and_equations):
|
for i_substr, substr in enumerate(texts_and_equations):
|
||||||
if substr not in text:
|
if len(substr) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if substr in output_text[init_i:]:
|
||||||
|
init_i += output_text[init_i:].find(substr) + len(substr)
|
||||||
|
else:
|
||||||
if i_substr > 0:
|
if i_substr > 0:
|
||||||
i_text_before = text[init_i:].find(
|
output_text = output_text[:init_i] + substr + output_text[init_i:]
|
||||||
texts_and_equations[i_substr - 1]
|
init_i += len(substr)
|
||||||
)
|
else:
|
||||||
output_text += text[init_i:][
|
output_text = substr + output_text
|
||||||
: i_text_before + len(texts_and_equations[i_substr - 1])
|
|
||||||
]
|
|
||||||
init_i += i_text_before + len(texts_and_equations[i_substr - 1])
|
|
||||||
output_text += substr
|
|
||||||
if only_equations.index(substr) == len(only_equations) - 1:
|
|
||||||
output_text += text[init_i:]
|
|
||||||
|
|
||||||
return output_text, only_equations
|
return output_text, only_equations
|
||||||
|
|
||||||
@ -393,13 +398,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.add_header(doc, p_level, text, is_numbered_style)
|
self.add_header(doc, p_level, text, is_numbered_style)
|
||||||
|
|
||||||
elif len(equations) > 0:
|
elif len(equations) > 0:
|
||||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
|
||||||
# Standalone equation
|
# Standalone equation
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
text=text,
|
text=text.replace("<eq>", "").replace("</eq>", ""),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Inline equation
|
# Inline equation
|
||||||
@ -412,8 +417,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if len(text_tmp) == 0:
|
if len(text_tmp) == 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0]
|
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
|
||||||
text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1]
|
|
||||||
|
pre_eq_text = split_text_tmp[0]
|
||||||
|
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
||||||
|
|
||||||
if len(pre_eq_text) > 0:
|
if len(pre_eq_text) > 0:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -423,8 +431,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=inline_equation,
|
parent=inline_equation,
|
||||||
text=eq,
|
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(text_tmp) > 0:
|
if len(text_tmp) > 0:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
item-0 at level 0: unspecified: group _root_
|
item-0 at level 0: unspecified: group _root_
|
||||||
item-1 at level 1: inline: group group
|
item-1 at level 1: inline: group group
|
||||||
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
|
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||||
item-3 at level 2: formula: A= \pi r^{2}
|
item-3 at level 2: formula: A= \pi r^{2}
|
||||||
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||||
item-5 at level 1: paragraph:
|
item-5 at level 1: paragraph:
|
||||||
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
|
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
|
||||||
@ -15,7 +15,7 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-14 at level 1: paragraph:
|
item-14 at level 1: paragraph:
|
||||||
item-15 at level 1: inline: group group
|
item-15 at level 1: inline: group group
|
||||||
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
|
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||||
item-17 at level 2: formula: A= \pi r^{2}
|
item-17 at level 2: formula: A= \pi r^{2}
|
||||||
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||||
item-19 at level 1: paragraph:
|
item-19 at level 1: paragraph:
|
||||||
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
|
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
|
||||||
@ -31,7 +31,7 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-30 at level 1: paragraph:
|
item-30 at level 1: paragraph:
|
||||||
item-31 at level 1: inline: group group
|
item-31 at level 1: inline: group group
|
||||||
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
|
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||||
item-33 at level 2: formula: A= \pi r^{2}
|
item-33 at level 2: formula: A= \pi r^{2}
|
||||||
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||||
item-35 at level 1: paragraph:
|
item-35 at level 1: paragraph:
|
||||||
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
|
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
|
||||||
|
@ -196,8 +196,8 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "formula",
|
"label": "formula",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A= \\pi r^{2} ",
|
"orig": "A= \\pi r^{2}",
|
||||||
"text": "A= \\pi r^{2} "
|
"text": "A= \\pi r^{2}"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/2",
|
"self_ref": "#/texts/2",
|
||||||
@ -352,8 +352,8 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "formula",
|
"label": "formula",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A= \\pi r^{2} ",
|
"orig": "A= \\pi r^{2}",
|
||||||
"text": "A= \\pi r^{2} "
|
"text": "A= \\pi r^{2}"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/15",
|
"self_ref": "#/texts/15",
|
||||||
@ -532,8 +532,8 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "formula",
|
"label": "formula",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A= \\pi r^{2} ",
|
"orig": "A= \\pi r^{2}",
|
||||||
"text": "A= \\pi r^{2} "
|
"text": "A= \\pi r^{2}"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/30",
|
"self_ref": "#/texts/30",
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
|
This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
|
||||||
|
|
||||||
$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$
|
$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$
|
||||||
|
|
||||||
@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+
|
|||||||
|
|
||||||
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
||||||
|
|
||||||
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
|
This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
|
||||||
|
|
||||||
$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$
|
$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$
|
||||||
|
|
||||||
@ -22,7 +22,7 @@ $$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{
|
|||||||
|
|
||||||
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
||||||
|
|
||||||
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
|
This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this:
|
||||||
|
|
||||||
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$
|
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$
|
||||||
|
|
||||||
|
@ -76,13 +76,13 @@ def test_e2e_docx_conversions():
|
|||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
|
||||||
|
|
||||||
pred_itxt: str = doc._export_to_indented_text(
|
pred_itxt: str = doc._export_to_indented_text(
|
||||||
max_text_len=70, explicit_tables=False
|
max_text_len=70, explicit_tables=False
|
||||||
)
|
)
|
||||||
assert verify_export(
|
assert verify_export(
|
||||||
pred_itxt, str(gt_path) + ".itxt"
|
pred_itxt, str(gt_path) + ".itxt", GENERATE
|
||||||
), "export to indented-text"
|
), "export to indented-text"
|
||||||
|
|
||||||
assert verify_document(
|
assert verify_document(
|
||||||
|
Loading…
Reference in New Issue
Block a user