Upgrading docling core and adding groups

2025-07-26 03:55:00 +00:00 · 2025-03-04 17:18:40 +01:00 · 2025-03-04 17:18:40 +01:00 · 655e95dd72
commit 655e95dd72
parent 5630c6b8fd
19 changed files with 500 additions and 238 deletions
--- a/docling/backend/docx_latex/init.py
+++ b/docling/backend/docx_latex/init.py
--- a/docling/backend/docx/latex/init.py
+++ b/docling/backend/docx/latex/init.py
--- a/docling/backend/docx/latex/latex_dict.py
+++ b/docling/backend/docx/latex/latex_dict.py
@ -268,4 +268,4 @@ LIM_TO = ("\\rightarrow", "\\to")

 LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

-M = "\\begin{{matrix}}{text}\end{{matrix}}"
+M = "\\begin{{matrix}}{text}\\end{{matrix}}"
--- a/docling/backend/docx/latex/omml.py
+++ b/docling/backend/docx/latex/omml.py
@ -8,7 +8,7 @@ On 23/01/2025
 import lxml.etree as ET
 from pylatexenc.latexencode import UnicodeToLatexEncoder

-from docling.backend.docx_latex.latex_dict import (
+from docling.backend.docx.latex.latex_dict import (
    ALN,
    ARR,
    BACKSLASH,
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -26,7 +26,7 @@ from PIL import Image, UnidentifiedImageError
 from typing_extensions import override

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.backend.docx_latex.omml import oMath2Latex
+from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -164,7 +164,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    ) -> DoclingDocument:
        for element in body:
            tag_name = etree.QName(element).localname
-
            # Check for Inline Images (blip elements)
            namespaces = {
                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@ -262,6 +261,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        else:
            return label, None

+    def handle_equations_in_text(self, element, text):
+        """Return (text with equations as LaTeX, list of those LaTeX equations)."""
+        only_texts = []
+        only_equations = []
+        texts_and_equations = []
+        for subt in element.iter():
+            tag_name = etree.QName(subt).localname
+            if tag_name == "t" and "math" not in subt.tag:
+                only_texts.append(subt.text or "")
+                texts_and_equations.append(subt.text or "")
+            elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
+                latex_equation = str(oMath2Latex(subt))
+                only_equations.append(latex_equation)
+                texts_and_equations.append(latex_equation)
+        if "".join(only_texts) != text:
+            # Runs do not rebuild the raw text: still return a 2-tuple, callers unpack it.
+            return text, []
+        return "".join(texts_and_equations), only_equations

    def handle_text_elements(
        self,
@ -272,7 +289,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        paragraph = Paragraph(element, docx_obj)

        raw_text = paragraph.text
-        text = self.handle_equations_in_text(element=element, text=raw_text)
+        text, equations = self.handle_equations_in_text(element=element, text=raw_text)

        if text is None:
            return
@ -326,36 +343,57 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            self.parents[0] = doc.add_text(
                parent=None, label=DocItemLabel.TITLE, text=text
            )
-
        elif "Heading" in p_style_id:
            self.add_header(doc, p_level, text)

-        elif p_style_id in [
-            "Subtitle",
-            "Author",
-            "ListParagraph",
-            "ListBullet",
-            "Quote",
-        ]:
-            level = self.get_level()
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
-            )
-
-        elif (raw_text is None or len(raw_text) == 0) and len(text) > 0:
-            # Standalone equation
-            # Entities in which all text comes from equations
-            level = self.get_level()
-            if text.strip().startswith("$") and text.strip().endswith("$"):
-                text = text.strip()[1:-1]
-            doc.add_text(
-                label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text
-            )
+        elif len(equations) > 0:
+            if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
+                # Standalone equation
+                level = self.get_level()
+                doc.add_text(
+                    label=DocItemLabel.FORMULA,
+                    parent=self.parents[level - 1],
+                    text=text,
+                )
+            else:
+                # Inline equation
+                level = self.get_level()
+                inline_equation = doc.add_group(
+                    label=GroupLabel.INLINE, parent=self.parents[level - 1]
+                )
+                text_tmp = text
+                for eq in equations:
+                    if len(text_tmp) == 0:
+                        break
+                    pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
+                    text_tmp = text_tmp.split(eq, maxsplit=1)[1]
+                    if len(pre_eq_text) > 0:
+                        doc.add_text(
+                            label=DocItemLabel.PARAGRAPH,
+                            parent=inline_equation,
+                            text=pre_eq_text,
+                        )
+                    doc.add_text(
+                        label=DocItemLabel.FORMULA,
+                        parent=inline_equation,
+                        text=eq,
+                    )
+                if len(text_tmp) > 0:
+                    doc.add_text(
+                        label=DocItemLabel.PARAGRAPH,
+                        parent=inline_equation,
+                        text=text_tmp,
+                    )

        elif p_style_id in [
            "Paragraph",
            "Normal",
+            "Subtitle",
+            "Author",
            "DefaultText",
+            "ListParagraph",
+            "ListBullet",
+            "Quote",
        ]:
            level = self.get_level()
            doc.add_text(
@ -367,8 +405,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # hence we treat all other labels as pure text
            level = self.get_level()
            doc.add_text(
-                label=DocItemLabel.TEXT, parent=self.parents[level - 1], text=text
+                label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
            )
+
        self.update_history(p_style_id, p_level, numid, ilevel)
        return

--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.

 [[package]]
 name = "accelerate"
@ -33,13 +33,13 @@ testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized",

 [[package]]
 name = "aiohappyeyeballs"
-version = "2.4.6"
+version = "2.4.8"
 description = "Happy Eyeballs for asyncio"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"},
-    {file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"},
+    {file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"},
+    {file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"},
 ]

 [[package]]
@ -311,6 +311,24 @@ files = [
 docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
 testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]

+[[package]]
+name = "backrefs"
+version = "5.8"
+description = "A wrapper around re and regex that adds additional back references."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"},
+    {file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"},
+    {file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"},
+    {file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"},
+    {file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"},
+    {file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"},
+]
+
+[package.extras]
+extras = ["regex"]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.13.3"
@ -852,13 +870,13 @@ files = [

 [[package]]
 name = "docling-core"
-version = "2.20.0"
+version = "2.21.1"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.20.0-py3-none-any.whl", hash = "sha256:72f50fce277b7bb51f4134f443240c041582184305c3bcaabdea13fc5550f160"},
-    {file = "docling_core-2.20.0.tar.gz", hash = "sha256:9733581c15f5a9b5e3a6cb74fa995cc4078ff16668007f86c5f75d1ea9180d7f"},
+    {file = "docling_core-2.21.1-py3-none-any.whl", hash = "sha256:b8112915728cdc14f328f636f6c0ed36e6bbcc02ff940cc0bf85e303738671c3"},
+    {file = "docling_core-2.21.1.tar.gz", hash = "sha256:3ccc50197d24a3156cfc6c22c8404c58757749646d876a1c1c69fd800f664a4f"},
 ]

 [package.dependencies]
@ -880,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]

 [[package]]
 name = "docling-ibm-models"
-version = "3.4.0"
+version = "3.4.1"
 description = "This package contains the AI models used by the Docling PDF conversion package"
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"},
-    {file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"},
+    {file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"},
+    {file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"},
 ]

 [package.dependencies]
@ -1331,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",

 [[package]]
 name = "griffe"
-version = "1.5.7"
+version = "1.6.0"
 description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"},
-    {file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"},
+    {file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"},
+    {file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"},
 ]

 [package.dependencies]
@ -1818,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]

 [[package]]
 name = "jeepney"
-version = "0.8.0"
+version = "0.9.0"
 description = "Low-level, pure Python DBus protocol wrapper."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
-    {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
+    {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
+    {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
 ]

 [package.extras]
 test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
-trio = ["async_generator", "trio"]
+trio = ["trio"]

 [[package]]
 name = "jinja2"
@ -2715,17 +2733,18 @@ pygments = ">2.12.0"

 [[package]]
 name = "mkdocs-material"
-version = "9.6.5"
+version = "9.6.7"
 description = "Documentation that simply works"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"},
-    {file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"},
+    {file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"},
+    {file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"},
 ]

 [package.dependencies]
 babel = ">=2.10,<3.0"
+backrefs = ">=5.7.post1,<6.0"
 colorama = ">=0.4,<1.0"
 jinja2 = ">=3.0,<4.0"
 markdown = ">=3.2,<4.0"
@ -2734,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0"
 paginate = ">=0.5,<1.0"
 pygments = ">=2.16,<3.0"
 pymdown-extensions = ">=10.2,<11.0"
-regex = ">=2022.4"
 requests = ">=2.26,<3.0"

 [package.extras]
@ -4755,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"

 [[package]]
 name = "pydantic-settings"
-version = "2.8.0"
+version = "2.8.1"
 description = "Settings management using Pydantic"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"},
-    {file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"},
+    {file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"},
+    {file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"},
 ]

 [package.dependencies]
@ -5907,26 +5925,26 @@ files = [

 [[package]]
 name = "safetensors"
-version = "0.5.2"
+version = "0.5.3"
 description = ""
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
-    {file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"},
-    {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"},
-    {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"},
-    {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"},
-    {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"},
-    {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"},
-    {file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"},
-    {file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"},
-    {file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"},
+    {file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"},
+    {file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"},
+    {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"},
+    {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"},
+    {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"},
+    {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"},
+    {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"},
+    {file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"},
+    {file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"},
+    {file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"},
 ]

 [package.dependencies]
@ -6223,13 +6241,13 @@ train = ["accelerate (>=0.20.3)", "datasets"]

 [[package]]
 name = "setuptools"
-version = "75.8.1"
+version = "75.8.2"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "setuptools-75.8.1-py3-none-any.whl", hash = "sha256:3bc32c0b84c643299ca94e77f834730f126efd621de0cc1de64119e0e17dab1f"},
-    {file = "setuptools-75.8.1.tar.gz", hash = "sha256:65fb779a8f28895242923582eadca2337285f0891c2c9e160754df917c3d2530"},
+    {file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
+    {file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
 ]

 [package.extras]
@ -7227,13 +7245,13 @@ files = [

 [[package]]
 name = "types-requests"
-version = "2.32.0.20241016"
+version = "2.32.0.20250301"
 description = "Typing stubs for requests"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"},
-    {file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"},
+    {file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"},
+    {file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"},
 ]

 [package.dependencies]
@ -7241,13 +7259,13 @@ urllib3 = ">=2"

 [[package]]
 name = "types-tqdm"
-version = "4.67.0.20241221"
+version = "4.67.0.20250301"
 description = "Typing stubs for tqdm"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
-    {file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
+    {file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"},
+    {file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"},
 ]

 [package.dependencies]
@ -7843,4 +7861,4 @@ vlm = ["accelerate", "transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "1d4718b694098b0676f1ad1606d769887e51fc29f604e5f4c83dd5e1c90557e7"
+content-hash = "a340b1230bc83cdcff125a84eee457b1d8786abc112f2c0553391a4ab9f092ea"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,24 +1,44 @@
 [tool.poetry]
 name = "docling"
-version = "2.25.1"  # DO NOT EDIT, updated automatically
+version = "2.25.1" # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
-authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
+authors = [
+  "Christoph Auer <cau@zurich.ibm.com>",
+  "Michele Dolfi <dol@zurich.ibm.com>",
+  "Maxim Lysak <mly@zurich.ibm.com>",
+  "Nikos Livathinos <nli@zurich.ibm.com>",
+  "Ahmed Nassar <ahn@zurich.ibm.com>",
+  "Panos Vagenas <pva@zurich.ibm.com>",
+  "Peter Staar <taa@zurich.ibm.com>",
+]
 license = "MIT"
 readme = "README.md"
 repository = "https://github.com/DS4SD/docling"
 homepage = "https://github.com/DS4SD/docling"
-keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"]
- classifiers = [
-     "License :: OSI Approved :: MIT License",
-     "Operating System :: MacOS :: MacOS X",
-     "Operating System :: POSIX :: Linux",
-     "Development Status :: 5 - Production/Stable",
-     "Intended Audience :: Developers",
-     "Intended Audience :: Science/Research",
-     "Topic :: Scientific/Engineering :: Artificial Intelligence",
-     "Programming Language :: Python :: 3"
- ]
-packages = [{include = "docling"}]
+keywords = [
+  "docling",
+  "convert",
+  "document",
+  "pdf",
+  "docx",
+  "html",
+  "markdown",
+  "layout model",
+  "segmentation",
+  "table structure",
+  "table former",
+]
+classifiers = [
+  "License :: OSI Approved :: MIT License",
+  "Operating System :: MacOS :: MacOS X",
+  "Operating System :: POSIX :: Linux",
+  "Development Status :: 5 - Production/Stable",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Programming Language :: Python :: 3",
+]
+packages = [{ include = "docling" }]

 [tool.poetry.dependencies]
 ######################
@ -26,7 +46,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.19.0"}
+docling-core = { extras = ["chunking"], version = "^2.21.1" }
 docling-ibm-models = "^3.4.0"
 docling-parse = "^3.3.0"
 filetype = "^1.2.0"
@ -40,7 +60,7 @@ certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = [
  { version = "^1.6.0", markers = "python_version >= '3.10'" },
-  { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" }
+  { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
 ]
 typer = "^0.12.5"
 python-docx = "^1.1.2"
@ -56,21 +76,22 @@ onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
-  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
+  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
 ]

 transformers = [
-  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
-  {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
+  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
 ]
 accelerate = [
-  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
+  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
 ]
 pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"
+pylatexenc = "^2.10"

 [tool.poetry.group.dev.dependencies]
-black = {extras = ["jupyter"], version = "^24.4.2"}
+black = { extras = ["jupyter"], version = "^24.4.2" }
 pytest = "^7.2.2"
 pre-commit = "^3.7.1"
 mypy = "^1.10.1"
@ -93,7 +114,7 @@ types-tqdm = "^4.67.0.20241221"
 mkdocs-material = "^9.5.40"
 mkdocs-jupyter = "^0.25.0"
 mkdocs-click = "^0.8.1"
-mkdocstrings = {extras = ["python"], version = "^0.27.0"}
+mkdocstrings = { extras = ["python"], version = "^0.27.0" }
 griffe-pydantic = "^1.1.0"

 [tool.poetry.group.examples.dependencies]
@ -108,8 +129,8 @@ optional = true

 [tool.poetry.group.constraints.dependencies]
 numpy = [
-    { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
-    { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
+  { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
+  { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
 ]

 [tool.poetry.group.mac_intel]
@ -117,12 +138,12 @@ optional = true

 [tool.poetry.group.mac_intel.dependencies]
 torch = [
-  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"},
-  {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"}
+  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
+  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
 ]
 torchvision = [
-  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
-  {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
+  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
+  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
 ]

 [tool.poetry.extras]
@ -147,7 +168,7 @@ include = '\.pyi?$'
 [tool.isort]
 profile = "black"
 line_length = 88
-py_version=39
+py_version = 39

 [tool.mypy]
 pretty = true
@ -158,18 +179,19 @@ python_version = "3.10"

 [[tool.mypy.overrides]]
 module = [
-    "docling_parse.*",
-    "pypdfium2.*",
-    "networkx.*",
-    "scipy.*",
-    "filetype.*",
-    "tesserocr.*",
-    "docling_ibm_models.*",
-    "easyocr.*",
-    "ocrmac.*",
-    "lxml.*",
-    "huggingface_hub.*",
-    "transformers.*",
+  "docling_parse.*",
+  "pypdfium2.*",
+  "networkx.*",
+  "scipy.*",
+  "filetype.*",
+  "tesserocr.*",
+  "docling_ibm_models.*",
+  "easyocr.*",
+  "ocrmac.*",
+  "lxml.*",
+  "huggingface_hub.*",
+  "transformers.*",
+  "pylatexenc.*",
 ]
 ignore_missing_imports = true

--- a/tests/data/groundtruth/docling_v2/equations.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt
@ -1,31 +1,40 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
-  item-2 at level 1: paragraph: 
-  item-3 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
-  item-4 at level 1: paragraph: And that is an equation by itself. Cheers!
+  item-1 at level 1: inline: group group
+    item-2 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-3 at level 2: formula: A= \pi r^{2} 
+    item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
  item-5 at level 1: paragraph: 
-  item-6 at level 1: paragraph: This is another equation:
-  item-7 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
+  item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
+  item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
  item-8 at level 1: paragraph: 
-  item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
-  item-10 at level 1: paragraph: 
+  item-9 at level 1: paragraph: This is another equation:
+  item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
  item-11 at level 1: paragraph: 
-  item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
+  item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
  item-13 at level 1: paragraph: 
-  item-14 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
-  item-15 at level 1: paragraph: 
-  item-16 at level 1: paragraph: And that is an equation by itself. Cheers!
-  item-17 at level 1: paragraph: 
-  item-18 at level 1: paragraph: This is another equation:
+  item-14 at level 1: paragraph: 
+  item-15 at level 1: inline: group group
+    item-16 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-17 at level 2: formula: A= \pi r^{2} 
+    item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
  item-19 at level 1: paragraph: 
-  item-20 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ht)x^{2}}{2!}+ \text{ \textellipsis } 
+  item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
  item-21 at level 1: paragraph: 
-  item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
+  item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
  item-23 at level 1: paragraph: 
-  item-24 at level 1: paragraph: 
-  item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
-  item-26 at level 1: paragraph: 
-  item-27 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... tellipsis } , - \infty  < x <  \infty 
-  item-28 at level 1: paragraph: 
-  item-29 at level 1: paragraph: And that is an equation by itself. Cheers!
-  item-30 at level 1: paragraph: 
+  item-24 at level 1: paragraph: This is another equation:
+  item-25 at level 1: paragraph: 
+  item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
+  item-27 at level 1: paragraph: 
+  item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
+  item-29 at level 1: paragraph: 
+  item-30 at level 1: paragraph: 
+  item-31 at level 1: inline: group group
+    item-32 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-33 at level 2: formula: A= \pi r^{2} 
+    item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
+  item-35 at level 1: paragraph: 
+  item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty  < x <  \infty
+  item-37 at level 1: paragraph: 
+  item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
+  item-39 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/equations.docx.json
+++ b/tests/data/groundtruth/docling_v2/equations.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.0.0",
+  "version": "1.2.0",
  "name": "equations",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -10,6 +10,7 @@
  "furniture": {
    "self_ref": "#/furniture",
    "children": [],
+    "content_layer": "furniture",
    "name": "_root_",
    "label": "unspecified"
  },
@ -17,13 +18,7 @@
    "self_ref": "#/body",
    "children": [
      {
-        "$ref": "#/texts/0"
-      },
-      {
-        "$ref": "#/texts/1"
-      },
-      {
-        "$ref": "#/texts/2"
+        "$ref": "#/groups/0"
      },
      {
        "$ref": "#/texts/3"
@ -56,13 +51,7 @@
        "$ref": "#/texts/12"
      },
      {
-        "$ref": "#/texts/13"
-      },
-      {
-        "$ref": "#/texts/14"
-      },
-      {
-        "$ref": "#/texts/15"
+        "$ref": "#/groups/1"
      },
      {
        "$ref": "#/texts/16"
@ -101,49 +90,126 @@
        "$ref": "#/texts/27"
      },
      {
-        "$ref": "#/texts/28"
+        "$ref": "#/groups/2"
      },
      {
-        "$ref": "#/texts/29"
+        "$ref": "#/texts/31"
+      },
+      {
+        "$ref": "#/texts/32"
+      },
+      {
+        "$ref": "#/texts/33"
+      },
+      {
+        "$ref": "#/texts/34"
+      },
+      {
+        "$ref": "#/texts/35"
      }
    ],
+    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
-  "groups": [],
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/0"
+        },
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/13"
+        },
+        {
+          "$ref": "#/texts/14"
+        },
+        {
+          "$ref": "#/texts/15"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/28"
+        },
+        {
+          "$ref": "#/texts/29"
+        },
+        {
+          "$ref": "#/texts/30"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    }
+  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/0"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
-      "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
+      "orig": "This is a word document and this is an inline equation: ",
+      "text": "This is a word document and this is an inline equation: "
    },
    {
      "self_ref": "#/texts/1",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/0"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "A= \\pi r^{2} ",
+      "text": "A= \\pi r^{2} "
    },
    {
      "self_ref": "#/texts/2",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/0"
      },
      "children": [],
-      "label": "formula",
+      "content_layer": "body",
+      "label": "paragraph",
      "prov": [],
-      "orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
-      "text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
+      "orig": ". If instead, I want an equation by line, I can do this:",
+      "text": ". If instead, I want an equation by line, I can do this:"
    },
    {
      "self_ref": "#/texts/3",
@ -151,10 +217,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "And that is an equation by itself. Cheers!",
-      "text": "And that is an equation by itself. Cheers!"
+      "orig": "",
+      "text": ""
    },
    {
      "self_ref": "#/texts/4",
@ -162,10 +229,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
+      "text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
    },
    {
      "self_ref": "#/texts/5",
@ -173,10 +241,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "This is another equation:",
-      "text": "This is another equation:"
+      "orig": "And that is an equation by itself. Cheers!",
+      "text": "And that is an equation by itself. Cheers!"
    },
    {
      "self_ref": "#/texts/6",
@ -184,10 +253,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "formula",
+      "content_layer": "body",
+      "label": "paragraph",
      "prov": [],
-      "orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
-      "text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
+      "orig": "",
+      "text": ""
    },
    {
      "self_ref": "#/texts/7",
@ -195,10 +265,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "This is another equation:",
+      "text": "This is another equation:"
    },
    {
      "self_ref": "#/texts/8",
@ -206,10 +277,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
-      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
+      "orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
+      "text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
    },
    {
      "self_ref": "#/texts/9",
@ -217,6 +289,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -228,10 +301,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
+      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
    },
    {
      "self_ref": "#/texts/11",
@ -239,10 +313,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
-      "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
+      "orig": "",
+      "text": ""
    },
    {
      "self_ref": "#/texts/12",
@ -250,6 +325,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -258,35 +334,38 @@
    {
      "self_ref": "#/texts/13",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/1"
      },
      "children": [],
-      "label": "formula",
+      "content_layer": "body",
+      "label": "paragraph",
      "prov": [],
-      "orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
-      "text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
+      "orig": "This is a word document and this is an inline equation: ",
+      "text": "This is a word document and this is an inline equation: "
    },
    {
      "self_ref": "#/texts/14",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/1"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "A= \\pi r^{2} ",
+      "text": "A= \\pi r^{2} "
    },
    {
      "self_ref": "#/texts/15",
      "parent": {
-        "$ref": "#/body"
+        "$ref": "#/groups/1"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "And that is an equation by itself. Cheers!",
-      "text": "And that is an equation by itself. Cheers!"
+      "orig": ". If instead, I want an equation by line, I can do this:",
+      "text": ". If instead, I want an equation by line, I can do this:"
    },
    {
      "self_ref": "#/texts/16",
@ -294,6 +373,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -305,10 +385,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "This is another equation:",
-      "text": "This is another equation:"
+      "orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
+      "text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
    },
    {
      "self_ref": "#/texts/18",
@ -316,6 +397,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -327,10 +409,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "formula",
+      "content_layer": "body",
+      "label": "paragraph",
      "prov": [],
-      "orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } ",
-      "text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } "
+      "orig": "And that is an equation by itself. Cheers!",
+      "text": "And that is an equation by itself. Cheers!"
    },
    {
      "self_ref": "#/texts/20",
@ -338,6 +421,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -349,10 +433,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
-      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
+      "orig": "This is another equation:",
+      "text": "This is another equation:"
    },
    {
      "self_ref": "#/texts/22",
@ -360,6 +445,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -371,10 +457,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "paragraph",
+      "content_layer": "body",
+      "label": "formula",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }",
+      "text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }"
    },
    {
      "self_ref": "#/texts/24",
@ -382,10 +469,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
-      "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
+      "orig": "",
+      "text": ""
    },
    {
      "self_ref": "#/texts/25",
@ -393,10 +481,11 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
-      "orig": "",
-      "text": ""
+      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
+      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
    },
    {
      "self_ref": "#/texts/26",
@ -404,10 +493,11 @@
        "$ref": "#/body"
      },
      "children": [],
-      "label": "formula",
+      "content_layer": "body",
+      "label": "paragraph",
      "prov": [],
-      "orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty  < x <  \\infty ",
-      "text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty  < x <  \\infty "
+      "orig": "",
+      "text": ""
    },
    {
      "self_ref": "#/texts/27",
@ -415,6 +505,7 @@
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -422,21 +513,95 @@
    },
    {
      "self_ref": "#/texts/28",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "This is a word document and this is an inline equation: ",
+      "text": "This is a word document and this is an inline equation: "
+    },
+    {
+      "self_ref": "#/texts/29",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "formula",
+      "prov": [],
+      "orig": "A= \\pi r^{2} ",
+      "text": "A= \\pi r^{2} "
+    },
+    {
+      "self_ref": "#/texts/30",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": ". If instead, I want an equation by line, I can do this:",
+      "text": ". If instead, I want an equation by line, I can do this:"
+    },
+    {
+      "self_ref": "#/texts/31",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/32",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "formula",
+      "prov": [],
+      "orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty  < x <  \\infty",
+      "text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty  < x <  \\infty"
+    },
+    {
+      "self_ref": "#/texts/33",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/34",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "And that is an equation by itself. Cheers!",
      "text": "And that is an equation by itself. Cheers!"
    },
    {
-      "self_ref": "#/texts/29",
+      "self_ref": "#/texts/35",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
+      "content_layer": "body",
      "label": "paragraph",
      "prov": [],
      "orig": "",
@ -446,5 +611,6 @@
  "pictures": [],
  "tables": [],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/equations.docx.md
+++ b/tests/data/groundtruth/docling_v2/equations.docx.md
@ -1,4 +1,4 @@
-This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation:  $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:

 $$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$

@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+

 This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.

-This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation:  $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:

 $$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$

@ -18,12 +18,12 @@ And that is an equation by itself. Cheers!

 This is another equation:

-$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $$
+$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis }$$

 This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.

-This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
+This is a word document and this is an inline equation:  $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:

-$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty  < x <  \infty $$
+$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty  < x <  \infty$$

 And that is an equation by itself. Cheers!
--- a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
+++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "lorem_ipsum",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -163,5 +163,6 @@
  "pictures": [],
  "tables": [],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/tablecell.docx.json
+++ b/tests/data/groundtruth/docling_v2/tablecell.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "tablecell",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -399,5 +399,6 @@
    }
  ],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json
+++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json
--- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "unit_test_headers",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -748,5 +748,6 @@
  "pictures": [],
  "tables": [],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "unit_test_headers_numbered",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -802,5 +802,6 @@
  "pictures": [],
  "tables": [],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "unit_test_lists",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -979,5 +979,6 @@
  "pictures": [],
  "tables": [],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
@ -3,7 +3,7 @@ item-0 at level 0: unspecified: group _root_
  item-2 at level 1: title: Swimming in the lake
    item-3 at level 2: paragraph: Duck
    item-4 at level 2: picture
-    item-5 at level 2: text: Figure 1: This is a cute duckling
+    item-5 at level 2: paragraph: Figure 1: This is a cute duckling
    item-6 at level 2: section_header: Let’s swim!
      item-7 at level 3: paragraph: To get started with swimming, fi ...  down in a water and try not to drown:
      item-8 at level 3: list: group list
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json
--- a/tests/data/groundtruth/docling_v2/word_tables.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json
@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "name": "word_tables",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -2372,5 +2372,6 @@
    }
  ],
  "key_value_items": [],
+  "form_items": [],
  "pages": {}
 }