diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py index 13486478..04d92ea2 100644 --- a/docling/backend/docx/latex/latex_dict.py +++ b/docling/backend/docx/latex/latex_dict.py @@ -65,6 +65,11 @@ CHR_BO = { "\u2210": "\\coprod", "\u2211": "\\sum", "\u222b": "\\int", + "\u222c": "\\iint", + "\u222d": "\\iiint", + "\u222e": "\\oint", + "\u222f": "\\oiint", + "\u2230": "\\oiiint", "\u22c0": "\\bigwedge", "\u22c1": "\\bigvee", "\u22c2": "\\bigcap", diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index 0db4fdce..9144d664 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -381,7 +381,8 @@ class oMath2Latex(Tag2Method): bo = "" for stag, t, e in self.process_children_list(elm): if stag == "naryPr": - bo = get_val(t.chr, store=CHR_BO) + # if <m:naryPr> contains no <m:chr>, the n-ary represents an integral + bo = get_val(t.chr, default="\\int", store=CHR_BO) else: res.append(t) return bo + BLANK.join(res) diff --git a/tests/data/docx/equations.docx b/tests/data/docx/equations.docx index 8ab71b96..5ae9bf8d 100644 Binary files a/tests/data/docx/equations.docx and b/tests/data/docx/equations.docx differ diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt index 6d3b2e30..36e708eb 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.itxt +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -37,4 +37,17 @@ item-0 at level 0: unspecified: group _root_ item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty item-37 at level 1: text: item-38 at level 1: text: And that is an equation by itself. Cheers! - item-39 at level 1: text: \ No newline at end of file + item-39 at level 1: text: + item-40 at level 1: text: Large operators and integrals ar ... 
sented with n-ary objects in OMML XML: + item-41 at level 1: text: + item-42 at level 1: formula: \sum_{0}^{2}x + item-43 at level 1: formula: \bigcup_{n=1}^{m}\left(X_{n} \cap Y_{n}\right) + item-44 at level 1: formula: \prod_{k=1}^{n}A_{k} + item-45 at level 1: formula: \bigwedge_{}^{}x + item-46 at level 1: formula: \int_{}^{}(2x+1)dx + item-47 at level 1: formula: \iint_{0}^{1}xdx + item-48 at level 1: formula: \iiint_{}^{}ydy + item-49 at level 1: formula: \oint_{}^{}\frac{dy}{dx} + item-50 at level 1: formula: \oiint_{0}^{2 \pi }idt + item-51 at level 1: formula: \oiiint_{C}^{}\frac{1}{z}dz + item-52 at level 1: text: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 8b045f83..333d3215 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "equations", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "binary_hash": 11121138535595486899, + "binary_hash": 8638432756089077257, "filename": "equations.docx" }, "furniture": { @@ -106,6 +106,45 @@ }, { "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + }, + { + "$ref": "#/texts/47" + }, + { + "$ref": "#/texts/48" } ], "content_layer": "body", @@ -655,6 +694,169 @@ "prov": [], "orig": "", "text": "" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Large 
operators and integrals are represented with n-ary objects in OMML XML:", + "text": "Large operators and integrals are represented with n-ary objects in OMML XML:", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\sum_{0}^{2}x", + "text": "\\sum_{0}^{2}x" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\bigcup_{n=1}^{m}\\left(X_{n} \\cap Y_{n}\\right)", + "text": "\\bigcup_{n=1}^{m}\\left(X_{n} \\cap Y_{n}\\right)" + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\prod_{k=1}^{n}A_{k}", + "text": "\\prod_{k=1}^{n}A_{k}" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\bigwedge_{}^{}x", + "text": "\\bigwedge_{}^{}x" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\int_{}^{}(2x+1)dx", + "text": "\\int_{}^{}(2x+1)dx" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\iint_{0}^{1}xdx", + "text": "\\iint_{0}^{1}xdx" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": 
[], + "orig": "\\iiint_{}^{}ydy", + "text": "\\iiint_{}^{}ydy" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\oint_{}^{}\\frac{dy}{dx}", + "text": "\\oint_{}^{}\\frac{dy}{dx}" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\oiint_{0}^{2 \\pi }idt", + "text": "\\oiint_{0}^{2 \\pi }idt" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\oiiint_{C}^{}\\frac{1}{z}dz", + "text": "\\oiiint_{C}^{}\\frac{1}{z}dz" + }, + { + "self_ref": "#/texts/48", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md index 578d5eb0..a3a2f4ca 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.md +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -26,4 +26,26 @@ This is a word document and this is an inline equation: $A= \pi r^{2}$ . If ins $$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$ -And that is an equation by itself. Cheers! \ No newline at end of file +And that is an equation by itself. Cheers! + +Large operators and integrals are represented with n-ary objects in OMML XML: + +$$\sum_{0}^{2}x$$ + +$$\bigcup_{n=1}^{m}\left(X_{n} \cap Y_{n}\right)$$ + +$$\prod_{k=1}^{n}A_{k}$$ + +$$\bigwedge_{}^{}x$$ + +$$\int_{}^{}(2x+1)dx$$ + +$$\iint_{0}^{1}xdx$$ + +$$\iiint_{}^{}ydy$$ + +$$\oint_{}^{}\frac{dy}{dx}$$ + +$$\oiint_{0}^{2 \pi }idt$$ + +$$\oiiint_{C}^{}\frac{1}{z}dz$$ \ No newline at end of file