From 64a788809269a5efd15ba790f185a432a8e826db Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Thu, 3 Apr 2025 17:57:30 +0200 Subject: [PATCH 1/3] Adding new latex symbols, simplifying how equations are added to text Signed-off-by: Rafael Teixeira de Lima --- docling/backend/docx/latex/latex_dict.py | 3 ++ docling/backend/docx/latex/omml.py | 16 +++--- docling/backend/msword_backend.py | 49 +++++++++++-------- .../docling_v2/equations.docx.itxt | 6 +-- .../docling_v2/equations.docx.json | 12 ++--- .../groundtruth/docling_v2/equations.docx.md | 6 +-- tests/test_backend_msword.py | 4 +- 7 files changed, 56 insertions(+), 40 deletions(-) diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py index 280358be..03234788 100644 --- a/docling/backend/docx/latex/latex_dict.py +++ b/docling/backend/docx/latex/latex_dict.py @@ -215,6 +215,9 @@ FUNC = { "coth": "\\coth({fe})", "sec": "\\sec({fe})", "csc": "\\csc({fe})", + "mod": "\\mod {fe}", + "max": "\\max({fe})", + "min": "\\min({fe})", } FUNC_PLACE = "{fe}" diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index add0de71..52dade6f 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -281,8 +281,10 @@ class oMath2Latex(Tag2Method): if FUNC.get(t): latex_chars.append(FUNC[t]) else: - raise NotSupport("Not support func %s" % t) - else: + print(f"Function not supported, will default to text: {t}") + if isinstance(t, str): + latex_chars.append(t) + elif isinstance(t, str): latex_chars.append(t) t = BLANK.join(latex_chars) return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this @@ -415,10 +417,12 @@ class oMath2Latex(Tag2Method): """ _str = [] _base_str = [] - for s in elm.findtext("./{0}t".format(OMML_NS)): - out_latex_str = self.process_unicode(s) - _str.append(out_latex_str) - _base_str.append(s) + found_text = elm.findtext("./{0}t".format(OMML_NS)) + if found_text: + for s in 
found_text: + out_latex_str = self.process_unicode(s) + _str.append(out_latex_str) + _base_str.append(s) proc_str = escape_latex(BLANK.join(_str)) base_proc_str = BLANK.join(_base_str) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5094c8f9..f0e17c9a 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -54,6 +54,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list: Optional[int] = None self.parents: dict[int, Optional[NodeItem]] = {} self.numbered_headers: dict[int, int] = {} + self.equation_bookends: str = "{EQ}" for i in range(-1, self.max_levels): self.parents[i] = None @@ -284,9 +285,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): only_texts.append(subt.text) texts_and_equations.append(subt.text) elif "oMath" in subt.tag and "oMathPara" not in subt.tag: - latex_equation = str(oMath2Latex(subt)) - only_equations.append(latex_equation) - texts_and_equations.append(latex_equation) + latex_equation = str(oMath2Latex(subt)).strip() + if len(latex_equation) > 0: + only_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) + texts_and_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) if len(only_equations) < 1: return text, [] @@ -301,21 +307,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Insert equations into original text # This is done to preserve white space structure - output_text = "" + output_text = text[:] init_i = 0 for i_substr, substr in enumerate(texts_and_equations): - if substr not in text: + if len(substr) == 0: + continue + + if substr in output_text[init_i:]: + init_i += output_text[init_i:].find(substr) + len(substr) + else: if i_substr > 0: - i_text_before = text[init_i:].find( - texts_and_equations[i_substr - 1] - ) - output_text += text[init_i:][ - : i_text_before + len(texts_and_equations[i_substr - 1]) - ] - init_i += i_text_before + 
len(texts_and_equations[i_substr - 1]) - output_text += substr - if only_equations.index(substr) == len(only_equations) - 1: - output_text += text[init_i:] + output_text = output_text[:init_i] + substr + output_text[init_i:] + init_i += len(substr) + else: + output_text = substr + output_text return output_text, only_equations @@ -393,13 +398,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.add_header(doc, p_level, text, is_numbered_style) elif len(equations) > 0: - if (raw_text is None or len(raw_text) == 0) and len(text) > 0: + if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0: # Standalone equation level = self.get_level() doc.add_text( label=DocItemLabel.FORMULA, parent=self.parents[level - 1], - text=text, + text=text.replace("", "").replace("", ""), ) else: # Inline equation @@ -412,8 +417,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if len(text_tmp) == 0: break - pre_eq_text = text_tmp.split(eq.strip(), maxsplit=1)[0] - text_tmp = text_tmp.split(eq.strip(), maxsplit=1)[1] + split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1) + + pre_eq_text = split_text_tmp[0] + text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1] + if len(pre_eq_text) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, @@ -423,8 +431,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc.add_text( label=DocItemLabel.FORMULA, parent=inline_equation, - text=eq, + text=eq.replace("", "").replace("", ""), ) + if len(text_tmp) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt index 1412074d..c1c68fb6 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.itxt +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -1,7 +1,7 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: inline: group group item-2 at level 2: paragraph: This is a word document and this is an 
inline equation: - item-3 at level 2: formula: A= \pi r^{2} + item-3 at level 2: formula: A= \pi r^{2} item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-5 at level 1: paragraph: item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 @@ -15,7 +15,7 @@ item-0 at level 0: unspecified: group _root_ item-14 at level 1: paragraph: item-15 at level 1: inline: group group item-16 at level 2: paragraph: This is a word document and this is an inline equation: - item-17 at level 2: formula: A= \pi r^{2} + item-17 at level 2: formula: A= \pi r^{2} item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-19 at level 1: paragraph: item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} @@ -31,7 +31,7 @@ item-0 at level 0: unspecified: group _root_ item-30 at level 1: paragraph: item-31 at level 1: inline: group group item-32 at level 2: paragraph: This is a word document and this is an inline equation: - item-33 at level 2: formula: A= \pi r^{2} + item-33 at level 2: formula: A= \pi r^{2} item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-35 at level 1: paragraph: item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... 
xtellipsis } , - \infty < x < \infty diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 43247d6f..0d57b78f 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -196,8 +196,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/2", @@ -352,8 +352,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/15", @@ -532,8 +532,8 @@ "content_layer": "body", "label": "formula", "prov": [], - "orig": "A= \\pi r^{2} ", - "text": "A= \\pi r^{2} " + "orig": "A= \\pi r^{2}", + "text": "A= \\pi r^{2}" }, { "self_ref": "#/texts/30", diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md index a8253ebf..578d5eb0 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.md +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -1,4 +1,4 @@ -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this: $$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$ @@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. 
This is text. This is text. This is text. This is text. This is text. This is text. -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . If instead, I want an equation by line, I can do this: $$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$ @@ -22,7 +22,7 @@ $$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. -This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: +This is a word document and this is an inline equation: $A= \pi r^{2}$ . 
If instead, I want an equation by line, I can do this: $$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$ diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f9843c78..975007c4 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -76,13 +76,13 @@ def test_e2e_docx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md" pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( - pred_itxt, str(gt_path) + ".itxt" + pred_itxt, str(gt_path) + ".itxt", GENERATE ), "export to indented-text" assert verify_document( From 4bea04dc753fc53a99ecfe9313f25d9151ffbb52 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Fri, 4 Apr 2025 14:46:43 +0200 Subject: [PATCH 2/3] Identify headers through inherited style Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index efd15d4d..5530bba0 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -264,6 +264,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label = paragraph.style.style_id name = paragraph.style.name + base_style_label = None + base_style_name = None + if base_style := getattr(paragraph.style, "base_style", None): + base_style_label = base_style.style_id + base_style_name = base_style.name if label is None: return "Normal", None @@ -277,6 +282,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return self._get_heading_and_level(label) if "heading" in name.lower(): return self._get_heading_and_level(name) + if base_style_label and "heading" in
base_style_label.lower(): + return self._get_heading_and_level(base_style_label) + if base_style_name and "heading" in base_style_name.lower(): + return self._get_heading_and_level(base_style_name) return label, None From 556b949b1875aa971c5c07ff6d3e8372fd919991 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 8 Apr 2025 15:41:16 +0200 Subject: [PATCH 3/3] Log warning message instead of print Signed-off-by: Rafael Teixeira de Lima --- docling/backend/docx/latex/omml.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index 52dade6f..b2d5f900 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py On 23/01/2025 """ +import logging + import lxml.etree as ET from pylatexenc.latexencode import UnicodeToLatexEncoder @@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import ( OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" +_log = logging.getLogger(__name__) + def load(stream): tree = ET.parse(stream) @@ -281,7 +285,7 @@ class oMath2Latex(Tag2Method): if FUNC.get(t): latex_chars.append(FUNC[t]) else: - print(f"Function not supported, will default to text: {t}") + _log.warning("Function not supported, will default to text: %s", t) if isinstance(t, str): latex_chars.append(t) elif isinstance(t, str): @@ -384,8 +388,6 @@ class oMath2Latex(Tag2Method): out_latex_str = self.u.unicode_to_latex(s) - # print(s, out_latex_str) - if ( s.startswith("{") is False and out_latex_str.startswith("{") @@ -394,19 +396,13 @@ class oMath2Latex(Tag2Method): ): out_latex_str = f" {out_latex_str[1:-1]} " - # print(s, out_latex_str) - if "ensuremath" in out_latex_str: out_latex_str = out_latex_str.replace("\\ensuremath{", " ") out_latex_str = out_latex_str.replace("}", " ") - # print(s, out_latex_str) - if 
out_latex_str.strip().startswith("\\text"): out_latex_str = f" \\text{{{out_latex_str}}} " - # print(s, out_latex_str) - return out_latex_str def do_r(self, elm):