From ae2e0832cd469e41fef0d79e72debe0a05f3f927 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Thu, 3 Apr 2025 17:57:30 +0200 Subject: [PATCH] Adding new latex symbols, simplifying how equations are added to text Signed-off-by: Rafael Teixeira de Lima --- docling/backend/docx/latex/latex_dict.py | 3 ++ docling/backend/docx/latex/omml.py | 16 +++--- docling/backend/msword_backend.py | 49 +++++++++++-------- .../docling_v2/equations.docx.itxt | 6 +-- .../docling_v2/equations.docx.json | 12 ++--- .../groundtruth/docling_v2/equations.docx.md | 6 +-- tests/test_backend_msword.py | 8 +++ 7 files changed, 62 insertions(+), 38 deletions(-) diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py index 280358be..03234788 100644 --- a/docling/backend/docx/latex/latex_dict.py +++ b/docling/backend/docx/latex/latex_dict.py @@ -215,6 +215,9 @@ FUNC = { "coth": "\\coth({fe})", "sec": "\\sec({fe})", "csc": "\\csc({fe})", + "mod": "\\mod {fe}", + "max": "\\max({fe})", + "min": "\\min({fe})", } FUNC_PLACE = "{fe}" diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index add0de71..52dade6f 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method): if FUNC.get(t): latex_chars.append(FUNC[t]) else: - raise NotSupport("Not support func %s" % t) - else: + print(f"Function not supported, will default to text: {t}") + if isinstance(t, str): + latex_chars.append(t) + elif isinstance(t, str): latex_chars.append(t) t = BLANK.join(latex_chars) return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this @@ -415,10 +417,12 @@ class oMath2Latex(Tag2Method): """ _str = [] _base_str = [] - for s in elm.findtext("./{0}t".format(OMML_NS)): - out_latex_str = self.process_unicode(s) - _str.append(out_latex_str) - _base_str.append(s) + found_text = elm.findtext("./{0}t".format(OMML_NS)) + if found_text: + for s in 
found_text: + out_latex_str = self.process_unicode(s) + _str.append(out_latex_str) + _base_str.append(s) proc_str = escape_latex(BLANK.join(_str)) base_proc_str = BLANK.join(_base_str) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index d6b73f70..efd15d4d 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -58,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list: Optional[int] = None self.parents: dict[int, Optional[NodeItem]] = {} self.numbered_headers: dict[int, int] = {} + self.equation_bookends: str = "<eq>{EQ}</eq>" for i in range(-1, self.max_levels): self.parents[i] = None @@ -356,9 +357,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): only_texts.append(subt.text) texts_and_equations.append(subt.text) elif "oMath" in subt.tag and "oMathPara" not in subt.tag: - latex_equation = str(oMath2Latex(subt)) - only_equations.append(latex_equation) - texts_and_equations.append(latex_equation) + latex_equation = str(oMath2Latex(subt)).strip() + if len(latex_equation) > 0: + only_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) + texts_and_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) if len(only_equations) < 1: return text, [] @@ -373,21 +379,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Insert equations into original text # This is done to preserve white space structure - output_text = "" + output_text = text[:] init_i = 0 for i_substr, substr in enumerate(texts_and_equations): - if substr not in text: + if len(substr) == 0: + continue + + if substr in output_text[init_i:]: + init_i += output_text[init_i:].find(substr) + len(substr) + else: if i_substr > 0: - i_text_before = text[init_i:].find( - texts_and_equations[i_substr - 1] - ) - output_text += text[init_i:][ - : i_text_before + len(texts_and_equations[i_substr - 1]) - ] - init_i += i_text_before + 
len(texts_and_equations[i_substr - 1]) - output_text += substr - if only_equations.index(substr) == len(only_equations) - 1: - output_text += text[init_i:] + output_text = output_text[:init_i] + substr + output_text[init_i:] + init_i += len(substr) + else: + output_text = substr + output_text return output_text, only_equations @@ -479,13 +484,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._add_header(doc, p_level, text, is_numbered_style) elif len(equations) > 0: - if (raw_text is None or len(raw_text) == 0) and len(text) > 0: + if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0: # Standalone equation level = self._get_level() doc.add_text( label=DocItemLabel.FORMULA, parent=self.parents[level - 1], - text=text, + text=text.replace("<eq>", "").replace("</eq>", ""), ) else: # Inline equation @@ -498,8 +503,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if len(text_tmp) == 0: break - pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0] - text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1] + split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1) + + pre_eq_text = split_text_tmp[0] + text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1] + if len(pre_eq_text) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, @@ -509,8 +517,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc.add_text( label=DocItemLabel.FORMULA, parent=inline_equation, - text=eq, + text=eq.replace("<eq>", "").replace("</eq>", ""), ) + if len(text_tmp) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt index 1412074d..c1c68fb6 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.itxt +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -1,7 +1,7 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: inline: group group item-2 at level 2: paragraph: This is a word document and this is an 
inline equation: - item-3 at level 2: formula: A= \pi r^{2} + item-3 at level 2: formula: A= \pi r^{2} item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-5 at level 1: paragraph: item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 @@ -15,7 +15,7 @@ item-0 at level 0: unspecified: group _root_ item-14 at level 1: paragraph: item-15 at level 1: inline: group group item-16 at level 2: paragraph: This is a word document and this is an inline equation: - item-17 at level 2: formula: A= \pi r^{2} + item-17 at level 2: formula: A= \pi r^{2} item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-19 at level 1: paragraph: item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} @@ -31,7 +31,7 @@ item-0 at level 0: unspecified: group _root_ item-30 at level 1: paragraph: item-31 at level 1: inline: group group item-32 at level 2: paragraph: This is a word document and this is an inline equation: - item-33 at level 2: formula: A= \pi r^{2} + item-33 at level 2: formula: A= \pi r^{2} item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-35 at level 1: paragraph: item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... 
xtellipsis } , - \infty < x < \infty diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 43247d6f..0d57b78f 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -196,8 +196,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/2", @@ -352,8 +352,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/15", @@ -532,8 +532,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/30", diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md index a8253ebf..578d5eb0 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.md +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -1,4 +1,4 @@ -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this: $$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$ @@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. 
This is text. This is text. This is text. This is text. This is text. This is text. -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this: $$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$ @@ -22,7 +22,7 @@ $$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . 
If instead, I want an equation by line, I can do this: $$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$ diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 5c43ccf4..952029a5 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -76,15 +76,23 @@ def test_e2e_docx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() +<<<<<<< HEAD assert verify_export( pred_md, str(gt_path) + ".md", generate=GENERATE ), "export to md" +======= + assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md" +>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( +<<<<<<< HEAD pred_itxt, str(gt_path) + ".itxt", generate=GENERATE +======= + pred_itxt, str(gt_path) + ".itxt", GENERATE +>>>>>>> 64a7888 (Adding new latex symbols, simplifying how equations are added to text) ), "export to indented-text" assert verify_document(