Merge branch 'main' of https://github.com/docling-project/docling

2025-12-12 14:48:21 +00:00 · 2025-09-29 12:56:11 +05:30
parent fbdbf53aa8 a873200c9d
commit 9b3065ba2b
59 changed files with 5859 additions and 828 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,19 @@
 ## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22
 ### Feature
 * Rich tables for MSWord backend ([#2291](https://github.com/docling-project/docling/issues/2291)) ([`e2482a2`](https://github.com/docling-project/docling/commit/e2482a2ada52b2b8a41c4402b27e125adbe4385f))
 * Add a backend parser for WebVTT files ([#2288](https://github.com/docling-project/docling/issues/2288)) ([`46efaae`](https://github.com/docling-project/docling/commit/46efaaefee17a6b83e02a050f9f3c8a51afbbd53))
 ### Fix
 * Correct y-axis scaling in draw_table_cells ([#2287](https://github.com/docling-project/docling/issues/2287)) ([`b5628f1`](https://github.com/docling-project/docling/commit/b5628f12273297d9db1393f4b734cfa337caa8c9))
 ### Documentation
 * Update API VLM example with granite-docling ([#2294](https://github.com/docling-project/docling/issues/2294)) ([`8b7e83a`](https://github.com/docling-project/docling/commit/8b7e83a8c7b9e333c31d5ae0b96213e3c70c6bf3))
 * Fix examples rendering ([#2281](https://github.com/docling-project/docling/issues/2281)) ([`8322c2e`](https://github.com/docling-project/docling/commit/8322c2ea9b4fbb1625bcbf1ec1b3dea6c1cd3ed0))
 ## [v2.53.0](https://github.com/docling-project/docling/releases/tag/v2.53.0) - 2025-09-17
 ### Feature
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
 ## Installation
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -3,6 +3,7 @@ import re
 import warnings
 from copy import deepcopy
 from enum import Enum
 from html import unescape
 from io import BytesIO
 from pathlib import Path
 from typing import Literal, Optional, Union, cast
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            fig_caption: Optional[TextItem] = None
            if element.title is not None and element.title != "":
                title = unescape(element.title)
                fig_caption = doc.add_text(
                    label=DocItemLabel.CAPTION,
-                    text=element.title,
+                    text=title,
                    formatting=formatting,
                    hyperlink=hyperlink,
                )
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            snippet_text = (
                element.children.strip() if isinstance(element.children, str) else ""
            )
            snippet_text = unescape(snippet_text)
            # Detect start of the table:
            if "|" in snippet_text or self.in_table:
                # most likely part of the markdown table
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -12,8 +12,11 @@ from docling_core.types.doc import (
    ImageRef,
    ListGroup,
    NodeItem,
    RefItem,
    RichTableCell,
    TableCell,
    TableData,
    TextItem,
 )
 from docling_core.types.doc.document import Formatting
 from docx import Document
@@ -128,7 +131,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        if self.is_valid():
            assert self.docx_obj is not None
-            doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
+            doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
            # doc, _ = doc_info
            return doc
        else:
            raise RuntimeError(
@@ -172,7 +176,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        body: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> DoclingDocument:
+        # parent:
    ) -> tuple[DoclingDocument, list[RefItem]]:
        added_elements = []
        for element in body:
            tag_name = etree.QName(element).localname
            # Check for Inline Images (blip elements)
@@ -230,8 +236,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                                    parent=self.parents[level - 1],
                                    name="shape-text",
                                )
                                added_elements.append(shape_group.get_ref())
                                doc.add_text(
-                                    label=DocItemLabel.PARAGRAPH,
+                                    label=DocItemLabel.TEXT,
                                    parent=shape_group,
                                    text=text_content,
                                )
@@ -246,23 +253,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    _log.debug(
                        f"Found textbox content with {len(textbox_elements)} elements"
                    )
-                    self._handle_textbox_content(textbox_elements, docx_obj, doc)
+                    tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
                    added_elements.extend(tbc)
            # Check for Tables
            if element.tag.endswith("tbl"):
                try:
-                    self._handle_tables(element, docx_obj, doc)
+                    t = self._handle_tables(element, docx_obj, doc)
                    added_elements.extend(t)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
            # Check for Image
            elif drawing_blip:
-                self._handle_pictures(docx_obj, drawing_blip, doc)
+                pics = self._handle_pictures(docx_obj, drawing_blip, doc)
                added_elements.extend(pics)
                # Check for Text after the Image
                if (
                    tag_name in ["p"]
                    and element.find(".//w:t", namespaces=namespaces) is not None
                ):
-                    self._handle_text_elements(element, docx_obj, doc)
+                    te1 = self._handle_text_elements(element, docx_obj, doc)
                    added_elements.extend(te1)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -270,15 +281,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
                    paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
                    for p in paragraphs:
-                        self._handle_text_elements(p, docx_obj, doc)
+                        te = self._handle_text_elements(p, docx_obj, doc)
                        added_elements.extend(te)
            # Check for Text
            elif tag_name in ["p"]:
                # "tcPr", "sectPr"
-                self._handle_text_elements(element, docx_obj, doc)
+                te = self._handle_text_elements(element, docx_obj, doc)
                added_elements.extend(te)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
-        return doc
+        return doc, added_elements
    def _str_to_int(
        self, s: Optional[str], default: Optional[int] = 0
@@ -674,14 +687,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        textbox_elements: list,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> None:
+    ) -> List[RefItem]:
        elem_ref: List[RefItem] = []
        """Process textbox content and add it to the document structure."""
        level = self._get_level()
        # Create a textbox group to contain all text from the textbox
        textbox_group = doc.add_group(
            label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
        )
-
+        elem_ref.append(textbox_group.get_ref())
        # Set this as the current parent to ensure textbox content
        # is properly nested in document structure
        original_parent = self.parents[level]
@@ -729,11 +743,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Mark this paragraph as processed
            processed_paragraphs.add(paragraph_id)
-            self._handle_text_elements(p, docx_obj, doc)
+            elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
        # Restore original parent
        self.parents[level] = original_parent
-        return
+        return elem_ref
    def _handle_equations_in_text(self, element, text):
        only_texts = []
@@ -803,7 +817,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> None:
+    ) -> List[RefItem]:
        elem_ref: List[RefItem] = []
        paragraph = Paragraph(element, docx_obj)
        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
@@ -811,7 +826,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        )
        if text is None:
-            return
+            return elem_ref
        text = text.strip()
        # Common styles for bullet and numbered lists.
@@ -832,15 +847,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Check if this is actually a numbered list by examining the numFmt
            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
-            self._add_list_item(
+            li = self._add_list_item(
                doc=doc,
                numid=numid,
                ilevel=ilevel,
                elements=paragraph_elements,
                is_numbered=is_numbered,
            )
            elem_ref.extend(li)  # MUST BE REF!!!
            self._update_history(p_style_id, p_level, numid, ilevel)
-            return
+            return elem_ref
        elif (
            numid is None
            and self._prev_numid() is not None
@@ -860,9 +876,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        if p_style_id in ["Title"]:
            for key in range(len(self.parents)):
                self.parents[key] = None
-            self.parents[0] = doc.add_text(
+            te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
-                parent=None, label=DocItemLabel.TITLE, text=text
+            self.parents[0] = te
-            )
+            elem_ref.append(te.get_ref())
        elif "Heading" in p_style_id:
            style_element = getattr(paragraph.style, "element", None)
            if style_element is not None:
@@ -871,7 +887,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                )
            else:
                is_numbered_style = False
-            self._add_header(doc, p_level, text, is_numbered_style)
+            h1 = self._add_header(doc, p_level, text, is_numbered_style)
            elem_ref.extend(h1)
        elif len(equations) > 0:
            if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
@@ -879,15 +896,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            ) > 0:
                # Standalone equation
                level = self._get_level()
-                doc.add_text(
+                t1 = doc.add_text(
                    label=DocItemLabel.FORMULA,
                    parent=self.parents[level - 1],
                    text=text.replace("<eq>", "").replace("</eq>", ""),
                )
                elem_ref.append(t1.get_ref())
            else:
                # Inline equation
                level = self._get_level()
                inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
                elem_ref.append(inline_equation.get_ref())
                text_tmp = text
                for eq in equations:
                    if len(text_tmp) == 0:
@@ -899,23 +918,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
                    if len(pre_eq_text) > 0:
-                        doc.add_text(
+                        e1 = doc.add_text(
-                            label=DocItemLabel.PARAGRAPH,
+                            label=DocItemLabel.TEXT,
                            parent=inline_equation,
                            text=pre_eq_text,
                        )
-                    doc.add_text(
+                        elem_ref.append(e1.get_ref())
                    e2 = doc.add_text(
                        label=DocItemLabel.FORMULA,
                        parent=inline_equation,
                        text=eq.replace("<eq>", "").replace("</eq>", ""),
                    )
                    elem_ref.append(e2.get_ref())
                if len(text_tmp) > 0:
-                    doc.add_text(
+                    e3 = doc.add_text(
-                        label=DocItemLabel.PARAGRAPH,
+                        label=DocItemLabel.TEXT,
                        parent=inline_equation,
                        text=text_tmp.strip(),
                    )
                    elem_ref.append(e3.get_ref())
        elif p_style_id in [
            "Paragraph",
@@ -934,13 +956,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                paragraph_elements=paragraph_elements,
            )
            for text, format, hyperlink in paragraph_elements:
-                doc.add_text(
+                t2 = doc.add_text(
-                    label=DocItemLabel.PARAGRAPH,
+                    label=DocItemLabel.TEXT,
                    parent=parent,
                    text=text,
                    formatting=format,
                    hyperlink=hyperlink,
                )
                elem_ref.append(t2.get_ref())
        else:
            # Text style names can, and will have, not only default values but user values too
@@ -952,16 +975,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                paragraph_elements=paragraph_elements,
            )
            for text, format, hyperlink in paragraph_elements:
-                doc.add_text(
+                t3 = doc.add_text(
-                    label=DocItemLabel.PARAGRAPH,
+                    label=DocItemLabel.TEXT,
                    parent=parent,
                    text=text,
                    formatting=format,
                    hyperlink=hyperlink,
                )
                elem_ref.append(t3.get_ref())
        self._update_history(p_style_id, p_level, numid, ilevel)
-        return
+        return elem_ref
    def _add_header(
        self,
@@ -969,17 +993,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        curr_level: Optional[int],
        text: str,
        is_numbered_style: bool = False,
-    ) -> None:
+    ) -> List[RefItem]:
        elem_ref: List[RefItem] = []
        level = self._get_level()
        if isinstance(curr_level, int):
            if curr_level > level:
                # add invisible group
                for i in range(level, curr_level):
-                    self.parents[i] = doc.add_group(
+                    gr1 = doc.add_group(
                        parent=self.parents[i - 1],
                        label=GroupLabel.SECTION,
                        name=f"header-{i}",
                    )
                    elem_ref.append(gr1.get_ref())
                    self.parents[i] = gr1
            elif curr_level < level:
                # remove the tail
                for key in range(len(self.parents)):
@@ -1019,12 +1047,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                text = f"{self.numbered_headers[previous_level]}.{text}"
                previous_level -= 1
-        self.parents[current_level] = doc.add_heading(
+        hd = doc.add_heading(
            parent=self.parents[parent_level],
            text=text,
            level=add_level,
        )
-        return
+        self.parents[current_level] = hd
        elem_ref.append(hd.get_ref())
        return elem_ref
    def _add_formatted_list_item(
        self,
@@ -1033,12 +1063,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        marker: str,
        enumerated: bool,
        level: int,
-    ) -> None:
+    ) -> List[RefItem]:
        elem_ref: List[RefItem] = []
        # This should not happen by construction
        if not isinstance(self.parents[level], ListGroup):
-            return
+            return elem_ref
        if not elements:
-            return
+            return elem_ref
        if len(elements) == 1:
            text, format, hyperlink = elements[0]
@@ -1068,6 +1099,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                        formatting=format,
                        hyperlink=hyperlink,
                    )
        return elem_ref
    def _add_list_item(
        self,
@@ -1077,10 +1109,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        ilevel: int,
        elements: list,
        is_numbered: bool = False,
-    ) -> None:
+    ) -> List[RefItem]:
-        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        elem_ref: List[RefItem] = []
        # this method is always called with is_numbered. Numbered lists should be properly addressed.
        if not elements:
-            return None
+            return elem_ref
        enum_marker = ""
        level = self._get_level()
@@ -1091,9 +1124,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # Reset counters for the new numbering sequence
            self._reset_list_counters_for_new_sequence(numid)
-            self.parents[level] = doc.add_list_group(
+            list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
-                name="list", parent=self.parents[level - 1]
+            self.parents[level] = list_gr
-            )
+            elem_ref.append(list_gr.get_ref())
            # Set marker and enumerated arguments if this is an enumeration element.
            if is_numbered:
@@ -1114,9 +1147,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self.level_at_new_list + prev_indent + 1,
                self.level_at_new_list + ilevel + 1,
            ):
-                self.parents[i] = doc.add_list_group(
+                list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
-                    name="list", parent=self.parents[i - 1]
+                self.parents[i] = list_gr1
-                )
+                elem_ref.append(list_gr1.get_ref())
            # TODO: Set marker and enumerated arguments if this is an enumeration element.
            if is_numbered:
@@ -1156,7 +1189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            )
        elif self._prev_numid() == numid or prev_indent == ilevel:
-            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            # Set marker and enumerated arguments if this is an enumeration element.
            if is_numbered:
                counter = self._get_list_counter(numid, ilevel)
                enum_marker = str(counter) + "."
@@ -1165,15 +1198,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            self._add_formatted_list_item(
                doc, elements, enum_marker, is_numbered, level - 1
            )
-
+        return elem_ref
        return
    def _handle_tables(
        self,
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
        doc: DoclingDocument,
-    ) -> None:
+    ) -> List[RefItem]:
        elem_ref: List[RefItem] = []
        table: Table = Table(element, docx_obj)
        num_rows = len(table.rows)
        num_cols = len(table.columns)
@@ -1184,9 +1217,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            # In case we have a table of only 1 cell, we consider it furniture
            # And proceed processing the content of the cell as though it's in the document body
            self._walk_linear(cell_element._element, docx_obj, doc)
-            return
+            return elem_ref
        data = TableData(num_rows=num_rows, num_cols=num_cols)
        level = self._get_level()
        docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
        elem_ref.append(docling_table.get_ref())
        cell_set: set[CT_Tc] = set()
        for row_idx, row in enumerate(table.rows):
            _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
@@ -1223,27 +1260,87 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                else:
                    text = text.replace("<eq>", "$").replace("</eq>", "$")
-                table_cell = TableCell(
+                provs_in_cell: List[RefItem] = []
-                    text=text,
+                _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
-                    row_span=spanned_idx - row_idx,
+                ref_for_rich_cell = provs_in_cell[0]
-                    col_span=cell.grid_span,
+                rich_table_cell = False
                    start_row_offset_idx=row.grid_cols_before + row_idx,
                    end_row_offset_idx=row.grid_cols_before + spanned_idx,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + cell.grid_span,
                    column_header=row.grid_cols_before + row_idx == 0,
                    row_header=False,
                )
                data.table_cells.append(table_cell)
                col_idx += cell.grid_span
-        level = self._get_level()
+                def group_cell_elements(
-        doc.add_table(data=data, parent=self.parents[level - 1])
+                    group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
-        return
+                ) -> RefItem:
                    group_element = doc.add_group(
                        label=GroupLabel.UNSPECIFIED,
                        name=group_name,
                        parent=docling_table,
                    )
                    for prov in provs_in_cell:
                        group_element.children.append(prov)
                        pr_item = prov.resolve(doc)
                        item_parent = pr_item.parent.resolve(doc)
                        if pr_item.get_ref() in item_parent.children:
                            item_parent.children.remove(pr_item.get_ref())
                        pr_item.parent = group_element.get_ref()
                    ref_for_rich_cell = group_element.get_ref()
                    return ref_for_rich_cell
                if len(provs_in_cell) > 1:
                    # Cell has multiple elements, we need to group them
                    rich_table_cell = True
                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
                    ref_for_rich_cell = group_cell_elements(
                        group_name, doc, provs_in_cell
                    )
                elif len(provs_in_cell) == 1:
                    item_ref = provs_in_cell[0]
                    pr_item = item_ref.resolve(doc)
                    if isinstance(pr_item, TextItem):
                        # Cell has only one element and it's just a text
                        rich_table_cell = False
                        doc.delete_items(node_items=[pr_item])
                    else:
                        rich_table_cell = True
                        group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
                        ref_for_rich_cell = group_cell_elements(
                            group_name, doc, provs_in_cell
                        )
                else:
                    rich_table_cell = False
                if rich_table_cell:
                    rich_cell = RichTableCell(
                        text=text,
                        row_span=spanned_idx - row_idx,
                        col_span=cell.grid_span,
                        start_row_offset_idx=row.grid_cols_before + row_idx,
                        end_row_offset_idx=row.grid_cols_before + spanned_idx,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + cell.grid_span,
                        column_header=row.grid_cols_before + row_idx == 0,
                        row_header=False,
                        ref=ref_for_rich_cell,  # points to an artificial group around children
                    )
                    doc.add_table_cell(table_item=docling_table, cell=rich_cell)
                    col_idx += cell.grid_span
                else:
                    simple_cell = TableCell(
                        text=text,
                        row_span=spanned_idx - row_idx,
                        col_span=cell.grid_span,
                        start_row_offset_idx=row.grid_cols_before + row_idx,
                        end_row_offset_idx=row.grid_cols_before + spanned_idx,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + cell.grid_span,
                        column_header=row.grid_cols_before + row_idx == 0,
                        row_header=False,
                    )
                    doc.add_table_cell(table_item=docling_table, cell=simple_cell)
                    col_idx += cell.grid_span
        return elem_ref
    def _handle_pictures(
        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
-    ) -> None:
+    ) -> List[RefItem]:
        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
@@ -1255,28 +1352,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                image_data = image_part.blob  # Get the binary image data
            return image_data
        elem_ref: List[RefItem] = []
        level = self._get_level()
        # Open the BytesIO object with PIL to create an Image
        image_data: Optional[bytes] = get_docx_image(drawing_blip)
        if image_data is None:
            _log.warning("Warning: image cannot be found")
-            doc.add_picture(
+            p1 = doc.add_picture(
                parent=self.parents[level - 1],
                caption=None,
            )
            elem_ref.append(p1.get_ref())
        else:
            try:
                image_bytes = BytesIO(image_data)
                pil_image = Image.open(image_bytes)
-                doc.add_picture(
+                p2 = doc.add_picture(
                    parent=self.parents[level - 1],
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                )
                elem_ref.append(p2.get_ref())
            except (UnidentifiedImageError, OSError):
                _log.warning("Warning: image cannot be loaded by Pillow")
-                doc.add_picture(
+                p3 = doc.add_picture(
                    parent=self.parents[level - 1],
                    caption=None,
                )
-        return
+                elem_ref.append(p3.get_ref())
        return elem_ref
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -0,0 +1,572 @@
 import logging
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import Annotated, ClassVar, Literal, Optional, Union, cast
 from docling_core.types.doc import (
    ContentLayer,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    Formatting,
    GroupLabel,
    NodeItem,
 )
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
 from typing_extensions import Self, override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
 class _WebVTTTimestamp(BaseModel):
    """A single WebVTT timestamp, kept as text and decomposed into numeric parts.
    The raw string is matched against ``[hh+:]mm:ss.ttt`` after model validation;
    the hour, minute, second and millisecond parts are stored as private
    attributes and exposed together through :attr:`seconds`.
    """
    model_config = ConfigDict(regex_engine="python-re")
    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]
    # Optional hours (two or more digits), then minutes, seconds and milliseconds.
    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int
    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` and populate the private numeric parts.
        Raises:
            ValueError: If ``raw`` does not match the timestamp pattern or a
                minute/second part falls outside 0-59.
        """
        parsed = self._pattern.match(self.raw)
        if parsed is None:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        hours, minutes, seconds, millis = parsed.groups()
        # The hours group is absent for the short ``mm:ss.ttt`` form.
        self._hours = int(hours) if hours else 0
        self._minutes = int(minutes)
        self._seconds = int(seconds)
        self._millis = int(millis)
        if not 0 <= self._minutes <= 59:
            raise ValueError("Minutes must be between 0 and 59")
        if not 0 <= self._seconds <= 59:
            raise ValueError("Seconds must be between 0 and 59")
        return self
    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        whole_seconds = self._hours * 3600 + self._minutes * 60 + self._seconds
        return whole_seconds + self._millis / 1000.0
    @override
    def __str__(self) -> str:
        return self.raw
 # String type for cue identifiers. The pattern requires one or more characters
 # other than CR/LF and rejects any value in which the "-->" substring appears.
 _WebVTTCueIdentifier = Annotated[
    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
 ]
 class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings (a start and an end timestamp)."""
    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Ensure the cue ends strictly after it starts.
        Raises:
            ValueError: If the end offset is not greater than the start offset.
        """
        if self.start and self.end and self.start.seconds >= self.end.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self
    @override
    def __str__(self):
        return " --> ".join((str(self.start), str(self.end)))
 class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span (plain text without markup)."""
    text: str
    span_type: Literal["text"] = "text"
    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Reject text that is empty or holds a newline, '&' or '<' character.
        Raises:
            ValueError: If the text is empty or contains a forbidden character.
        """
        forbidden = {"\n", "\r", "&", "<"}
        if not forbidden.isdisjoint(value):
            raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value
    @override
    def __str__(self):
        return self.text
 class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.
    A voice span carries the annotation of its start tag (the voice name), an
    optional list of classes, and the components nested inside the span. Its
    string form is ``<v[.class...] annotation>...</v>``.
    """
    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the "
                "voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"
    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Reject annotations that are empty or hold a newline, '&' or '>'.
        Raises:
            ValueError: If the annotation is empty or contains a forbidden
                character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # The message names the annotation, which is the field checked here.
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value
    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Reject classes that are empty or hold whitespace, '&', '<', '>' or '.'.
        Raises:
            ValueError: If any class is empty or contains a forbidden character.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value
    @override
    def __str__(self):
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
 class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span, serialized as ``<c>...</c>``."""
    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        return "<c>" + "".join(map(str, self.components)) + "</c>"
 class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span, serialized as ``<i>...</i>``."""
    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        return "<i>" + "".join(map(str, self.components)) + "</i>"
 class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span, serialized as ``<b>...</b>``."""
    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        return "<b>" + "".join(map(str, self.components)) + "</b>"
 class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span, serialized as ``<u>...</u>``."""
    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        return "<u>" + "".join(map(str, self.components)) + "</u>"
 # Union of every supported cue span model. The ``span_type`` field declared on
 # each member is used as the discriminator when validating a component.
 _WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
 ]
 class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.
    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """
    model_config = ConfigDict(regex_engine="python-re")
    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
    # Matches any start or end tag of the supported spans (i, b, c, u, v).
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    # Splits a voice start tag into its optional classes and its annotation.
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )
    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized form contains the '-->' substring.
        Raises:
            ValueError: If any component renders to text containing '-->'.
        """
        for voice in payload:
            if "-->" in str(voice):
                raise ValueError("Cue payload must not contain '-->'")
        return payload
    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Build a cue block from the raw text of one block of a WebVTT file.
        The first line is taken as the identifier when it holds no '-->' and
        more lines follow; the next line must hold the timings. Anything after
        the end timestamp on the timings line (the settings list) is ignored.
        The remaining lines are joined with spaces and parsed into spans.
        End tags that do not match the innermost open span are ignored, and
        spans still open at the end of the cue text are closed implicitly, so
        unbalanced markup neither raises ``IndexError`` nor loses text.
        Args:
            raw: The text of a single cue block.
        Returns:
            The parsed cue block.
        Raises:
            ValueError: If the block is empty, has no timings line, or any part
                fails model validation.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]
        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")
        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )
        cue_text = " ".join(cue_lines).strip()
        simple_spans: dict[str, type[BaseModel]] = {
            "i": _WebVTTCueItalicSpan,
            "b": _WebVTTCueBoldSpan,
            "u": _WebVTTCueUnderlineSpan,
            "c": _WebVTTCueClassSpan,
        }
        # ``stack[0]`` collects the top-level payload; every open span pushes one
        # child list, so ``len(stack) == len(tag_stack) + 1`` holds throughout.
        stack: list[list[_WebVTTCueComponent]] = [[]]
        tag_stack: list[Union[str, tuple]] = []
        def close_innermost() -> None:
            """Wrap the innermost children in their span and add it to the parent."""
            children = stack.pop()
            opened = tag_stack.pop()
            if not isinstance(opened, tuple):
                stack[-1].append(simple_spans[opened](components=children))
                return
            voice_tag = opened[1]
            voice_match = cls._pattern_voice_tag.match(voice_tag)
            if voice_match is None:
                # A voice start tag without annotation cannot be modelled.
                _log.warning(
                    f"Dropping WebVTT voice span with invalid start tag: {voice_tag}"
                )
                return
            class_string = voice_match.group("class")
            classes: list[str] = (
                [c for c in class_string.split(".") if c] if class_string else []
            )
            stack[-1].append(
                _WebVTTCueVoiceSpan(
                    annotation=voice_match.group("annotation").strip(),
                    classes=classes,
                    components=children,
                )
            )
        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            pos = match.end()
            tag = match.group(0)
            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                tag_stack.append(tag[1:2])
                stack.append([])
            elif tag in ("</i>", "</b>", "</u>", "</c>"):
                # Only close when the end tag matches the innermost open span.
                if tag_stack and tag_stack[-1] == tag[2:3]:
                    close_innermost()
            elif tag.startswith("<v"):
                tag_stack.append(("v", tag))
                stack.append([])
            elif tag.startswith("</v"):
                if tag_stack and isinstance(tag_stack[-1], tuple):
                    close_innermost()
        if pos < len(cue_text):
            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
        # Spans without an end tag (e.g. a voice span, whose end tag may be
        # omitted) are closed at the end of the cue text.
        while tag_stack:
            close_innermost()
        return cls(
            identifier=identifier,
            timings=timings,
            payload=stack[0],
        )
    def __str__(self):
        """Serialize the block as identifier line, timings line and payload."""
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))
        return "".join(parts)
 class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file as the list of its cue blocks."""
    cue_blocks: list[_WebVTTCueBlock]
    # First line of a comment (NOTE), STYLE or REGION block; these hold no cue.
    _pattern_non_cue: ClassVar[re.Pattern] = re.compile(
        r"^(?:NOTE(?:[ \t]|$)|(?:STYLE|REGION)[ \t]*$)"
    )
    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check that ``content`` starts with a WebVTT signature.
        The signature is the string ``WEBVTT``, optionally preceded by a byte
        order mark, and either ending the content or followed by a space, a tab
        or a line terminator (LF or CR, which also covers CRLF).
        Args:
            content: The text of the file, with or without normalized newlines.
        Returns:
            True if the content carries a valid signature, False otherwise.
        """
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")
    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into cue blocks.
        Comment, style and region blocks are skipped. Cue blocks that fail to
        parse are logged and skipped as well, so one malformed cue does not
        discard the rest of the file.
        Args:
            raw: The full text of the file.
        Returns:
            The parsed file.
        Raises:
            ValueError: If the text does not start with a WebVTT signature.
        """
        # Drop an optional byte order mark and normalize newlines to LF
        raw = raw.removeprefix("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")
        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")
        cues: list[_WebVTTCueBlock] = []
        # Blocks are separated by one or more blank lines
        for block in re.split(r"\n\s*\n", body.strip()):
            if not block:
                continue
            first_line = block.split("\n", 1)[0]
            # NOTE/STYLE/REGION blocks never contain "-->"; deciding per block
            # keeps cue payload lines that merely start with such a keyword.
            if "-->" not in block and cls._pattern_non_cue.match(first_line):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
        return cls(cue_blocks=cues)
    def __iter__(self):
        return iter(self.cue_blocks)
    def __getitem__(self, idx):
        return self.cue_blocks[idx]
    def __len__(self):
        return len(self.cue_blocks)
 class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.
    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1
    Each cue block becomes a section group holding its identifier (if any), its
    timings and the text of its payload; groups are added in file order.
    """
    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read the whole file into ``self.content`` as text.
        The content is decoded as UTF-8 with an optional byte order mark removed.
        Newlines of in-memory streams are normalized to LF, matching what the
        text-mode read of the path branch already does.
        Raises:
            RuntimeError: If the content cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)
        self.content: str = ""
        try:
            if isinstance(self.path_or_stream, BytesIO):
                text = self.path_or_stream.getvalue().decode("utf-8-sig")
                self.content = text.replace("\r\n", "\n").replace("\r", "\n")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8-sig") as f:
                    self.content = f.read()
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e
    @override
    def is_valid(self) -> bool:
        """Return whether the loaded content starts with a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)
    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False
    @override
    def unload(self):
        """Close the underlying stream, if any, and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.VTT}
    @staticmethod
    def _collect_text(item: _WebVTTCueComponent) -> str:
        """Return the text of a component, including that of nested spans."""
        if isinstance(item, _WebVTTCueTextSpan):
            return item.text
        return "".join(
            WebVTTDocumentBackend._collect_text(child) for child in item.components
        )
    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.
        The text of nested spans is concatenated in document order so that it is
        not lost. Only the formatting of ``item`` itself (italic, bold or
        underline) is applied; nothing is added when the text is blank.
        TODO: preserve the formatting of nested spans
        """
        formatting = Formatting()
        if isinstance(item, _WebVTTCueItalicSpan):
            formatting.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            formatting.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            formatting.underline = True
        if text := WebVTTDocumentBackend._collect_text(item).strip():
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=text,
                parent=parent,
                content_layer=ContentLayer.BODY,
                formatting=formatting,
            )
    @override
    def convert(self) -> DoclingDocument:
        """Convert the loaded WebVTT content into a DoclingDocument.
        Returns:
            The document, with one section group per cue block.
        Raises:
            RuntimeError: If the content does not carry a WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if isinstance(cue_span, _WebVTTCueVoiceSpan):
                    # A voice span gets its own inline group: a "name (classes): "
                    # label followed by the text of its components.
                    voice_group = doc.add_group(
                        label=GroupLabel.INLINE,
                        name="WebVTT cue voice span",
                        parent=block_group,
                        content_layer=ContentLayer.BODY,
                    )
                    voice = cue_span.annotation
                    if classes := cue_span.classes:
                        voice += f" ({', '.join(classes)})"
                    voice += ": "
                    doc.add_text(
                        label=DocItemLabel.TEXT,
                        text=voice,
                        parent=voice_group,
                        content_layer=ContentLayer.BODY,
                    )
                    for item in cue_span.components:
                        WebVTTDocumentBackend._add_text_from_component(
                            doc, item, voice_group
                        )
                else:
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, cue_span, block_group
                    )
        return doc
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,7 +1,6 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union
 import numpy as np
 from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
 )
 from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (
+from docling_core.types.io import DocumentStream
    DocumentStream,
 )
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
    METS_GBS = "mets_gbs"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
    VTT = "vtt"
 class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
    DOCTAGS = "doctags"
-FormatToExtensions: Dict[InputFormat, List[str]] = {
+FormatToExtensions: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["tar.gz"],
    InputFormat.JSON_DOCLING: ["json"],
    InputFormat.AUDIO: ["wav", "mp3"],
    InputFormat.VTT: ["vtt"],
 }
-FormatToMimeType: Dict[InputFormat, List[str]] = {
+FormatToMimeType: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["application/mets+xml"],
    InputFormat.JSON_DOCLING: ["application/json"],
    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
    InputFormat.VTT: ["text/vtt"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
-    cells: List[TextCell] = []
+    cells: list[TextCell] = []
-    children: List["Cluster"] = []  # Add child cluster support
+    children: list["Cluster"] = []  # Add child cluster support
    @field_serializer("confidence")
    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
 class LayoutPrediction(BaseModel):
-    clusters: List[Cluster] = []
+    clusters: list[Cluster] = []
 class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
 class Table(BasePageElement):
-    otsl_seq: List[str]
+    otsl_seq: list[str]
    num_rows: int = 0
    num_cols: int = 0
-    table_cells: List[TableCell]
+    table_cells: list[TableCell]
 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, Table] = {}
+    table_map: dict[int, Table] = {}
 class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
 class FigureElement(BasePageElement):
-    annotations: List[PictureDataType] = []
+    annotations: list[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
 class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
-    figure_map: Dict[int, FigureElement] = {}
+    figure_map: dict[int, FigureElement] = {}
 class EquationPrediction(BaseModel):
    equation_count: int = 0
-    equation_map: Dict[int, TextElement] = {}
+    equation_map: dict[int, TextElement] = {}
 class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
 class AssembledUnit(BaseModel):
-    elements: List[PageElement] = []
+    elements: list[PageElement] = []
-    body: List[PageElement] = []
+    body: list[PageElement] = []
-    headers: List[PageElement] = []
+    headers: list[PageElement] = []
 class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[
+    _image_cache: dict[
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
    @property
-    def cells(self) -> List[TextCell]:
+    def cells(self) -> list[TextCell]:
        """Return text cells as a read-only view of parsed_page.textline_cells."""
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
    id: str
    model: Optional[str] = None  # returned by openai
-    choices: List[OpenAiResponseChoice]
+    choices: list[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
 class ConfidenceReport(PageConfidenceScores):
-    pages: Dict[int, PageConfidenceScores] = Field(
+    pages: dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
            mime = FormatToMimeType[InputFormat.PPTX][0]
        elif ext in FormatToExtensions[InputFormat.XLSX]:
            mime = FormatToMimeType[InputFormat.XLSX][0]
        elif ext in FormatToExtensions[InputFormat.VTT]:
            mime = FormatToMimeType[InputFormat.VTT][0]
        return mime
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
        InputFormat.VTT: FormatOption(
            pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
        ),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -3,7 +3,7 @@
 #
 # What this example does
 # - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
-# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX.
+# - Shows two setups: default (Transformers/GraniteDocling) and macOS MPS/MLX.
 #
 # Prerequisites
 # - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
@@ -15,7 +15,7 @@
 #
 # Notes
 # - `source` may be a local path or a URL to a PDF.
-# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`).
+# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.GRANITEDOCLING_MLX`).
 # - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.
 # %%
--- a/docs/index.md
+++ b/docs/index.md
@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
 * 💬 Parsing of Web Video Text Tracks (WebVTT) files
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
 * 📝 Parsing of Web Video Text Tracks (WebVTT) files
 ## Get started
--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
 | PDF | |
 | DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
 | Markdown | |
-| AsciiDoc | |
+| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
 | HTML, XHTML | |
 | CSV | |
 | PNG, JPEG, TIFF, BMP, WEBP | Image formats |
 | WebVTT | Web Video Text Tracks format for displaying timed text |
 Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
 | Markdown | |
 | JSON | Lossless serialization of Docling Document |
 | Text | Plain text, i.e. without Markdown markers |
-| Doctags | |
+| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.53.0"  # DO NOT EDIT, updated automatically
+version = "2.54.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
@@ -44,7 +44,7 @@ authors = [
 requires-python = '>=3.9,<4.0'
 dependencies = [
  'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.48.0,<3.0.0)',
+  'docling-core[chunking] (>=2.48.2,<3.0.0)',
  'docling-parse (>=4.4.0,<5.0.0)',
  "docling-ibm-models>=3.9.1,<4",
  'filetype (>=1.2.0,<2.0.0)',
--- a/tests/data/groundtruth/docling_v2/equations.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt
@@ -1,40 +1,40 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: inline: group group
-    item-2 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-2 at level 2: text: This is a word document and this is an inline equation: 
    item-3 at level 2: formula: A= \pi r^{2}
-    item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
+    item-4 at level 2: text: . If instead, I want an equation by line, I can do this:
-  item-5 at level 1: paragraph: 
+  item-5 at level 1: text: 
  item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
-  item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
+  item-7 at level 1: text: And that is an equation by itself. Cheers!
-  item-8 at level 1: paragraph: 
+  item-8 at level 1: text: 
-  item-9 at level 1: paragraph: This is another equation:
+  item-9 at level 1: text: This is another equation:
  item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
-  item-11 at level 1: paragraph: 
+  item-11 at level 1: text: 
-  item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
+  item-12 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
-  item-13 at level 1: paragraph: 
+  item-13 at level 1: text: 
-  item-14 at level 1: paragraph: 
+  item-14 at level 1: text: 
  item-15 at level 1: inline: group group
-    item-16 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-16 at level 2: text: This is a word document and this is an inline equation: 
    item-17 at level 2: formula: A= \pi r^{2}
-    item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
+    item-18 at level 2: text: . If instead, I want an equation by line, I can do this:
-  item-19 at level 1: paragraph: 
+  item-19 at level 1: text: 
  item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
-  item-21 at level 1: paragraph: 
+  item-21 at level 1: text: 
-  item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
+  item-22 at level 1: text: And that is an equation by itself. Cheers!
-  item-23 at level 1: paragraph: 
+  item-23 at level 1: text: 
-  item-24 at level 1: paragraph: This is another equation:
+  item-24 at level 1: text: This is another equation:
-  item-25 at level 1: paragraph: 
+  item-25 at level 1: text: 
  item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
-  item-27 at level 1: paragraph: 
+  item-27 at level 1: text: 
-  item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
+  item-28 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
-  item-29 at level 1: paragraph: 
+  item-29 at level 1: text: 
-  item-30 at level 1: paragraph: 
+  item-30 at level 1: text: 
  item-31 at level 1: inline: group group
-    item-32 at level 2: paragraph: This is a word document and this is an inline equation: 
+    item-32 at level 2: text: This is a word document and this is an inline equation: 
    item-33 at level 2: formula: A= \pi r^{2}
-    item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
+    item-34 at level 2: text: . If instead, I want an equation by line, I can do this:
-  item-35 at level 1: paragraph: 
+  item-35 at level 1: text: 
  item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty  < x <  \infty
-  item-37 at level 1: paragraph: 
+  item-37 at level 1: text: 
-  item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
+  item-38 at level 1: text: And that is an equation by itself. Cheers!
-  item-39 at level 1: paragraph: 
+  item-39 at level 1: text: 
--- a/tests/data/groundtruth/docling_v2/equations.docx.json
+++ b/tests/data/groundtruth/docling_v2/equations.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "equations",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -182,7 +182,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is a word document and this is an inline equation: ",
      "text": "This is a word document and this is an inline equation: "
@@ -206,7 +206,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": ". If instead, I want an equation by line, I can do this:",
      "text": ". If instead, I want an equation by line, I can do this:"
@@ -218,7 +218,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -242,7 +242,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "And that is an equation by itself. Cheers!",
      "text": "And that is an equation by itself. Cheers!",
@@ -261,7 +261,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -273,7 +273,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is another equation:",
      "text": "This is another equation:",
@@ -304,7 +304,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -316,7 +316,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -335,7 +335,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -347,7 +347,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -359,7 +359,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is a word document and this is an inline equation: ",
      "text": "This is a word document and this is an inline equation: "
@@ -383,7 +383,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": ". If instead, I want an equation by line, I can do this:",
      "text": ". If instead, I want an equation by line, I can do this:"
@@ -395,7 +395,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -419,7 +419,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -431,7 +431,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "And that is an equation by itself. Cheers!",
      "text": "And that is an equation by itself. Cheers!",
@@ -450,7 +450,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -462,7 +462,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is another equation:",
      "text": "This is another equation:",
@@ -481,7 +481,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -505,7 +505,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -517,7 +517,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
      "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -536,7 +536,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -548,7 +548,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -560,7 +560,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is a word document and this is an inline equation: ",
      "text": "This is a word document and this is an inline equation: "
@@ -584,7 +584,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": ". If instead, I want an equation by line, I can do this:",
      "text": ". If instead, I want an equation by line, I can do this:"
@@ -596,7 +596,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -620,7 +620,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -632,7 +632,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "And that is an equation by itself. Cheers!",
      "text": "And that is an equation by itself. Cheers!",
@@ -651,7 +651,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/escaped_characters.md.json
+++ b/tests/data/groundtruth/docling_v2/escaped_characters.md.json
@@ -0,0 +1,675 @@
 {
  "schema_name": "DoclingDocument",
  "version": "1.7.0",
  "name": "escaped_characters",
  "origin": {
    "mimetype": "text/html",
    "binary_hash": 10682185258371912110,
    "filename": "escaped_characters.md"
  },
  "furniture": {
    "self_ref": "#/furniture",
    "children": [],
    "content_layer": "furniture",
    "name": "_root_",
    "label": "unspecified"
  },
  "body": {
    "self_ref": "#/body",
    "children": [
      {
        "$ref": "#/texts/0"
      },
      {
        "$ref": "#/texts/1"
      },
      {
        "$ref": "#/texts/4"
      },
      {
        "$ref": "#/texts/7"
      },
      {
        "$ref": "#/texts/9"
      },
      {
        "$ref": "#/texts/11"
      },
      {
        "$ref": "#/texts/12"
      }
    ],
    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
  "groups": [
    {
      "self_ref": "#/groups/0",
      "parent": {
        "$ref": "#/texts/4"
      },
      "children": [
        {
          "$ref": "#/texts/5"
        }
      ],
      "content_layer": "body",
      "name": "ordered list",
      "label": "list"
    },
    {
      "self_ref": "#/groups/1",
      "parent": {
        "$ref": "#/texts/4"
      },
      "children": [
        {
          "$ref": "#/texts/6"
        }
      ],
      "content_layer": "body",
      "name": "list",
      "label": "list"
    }
  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "furniture",
      "label": "title",
      "prov": [],
      "orig": "escaped_characters",
      "text": "escaped_characters"
    },
    {
      "self_ref": "#/texts/1",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/2"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Headers:",
      "text": "Headers:"
    },
    {
      "self_ref": "#/texts/2",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/3"
        }
      ],
      "content_layer": "body",
      "label": "section_header",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "level": 1
    },
    {
      "self_ref": "#/texts/3",
      "parent": {
        "$ref": "#/texts/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
      "text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
    },
    {
      "self_ref": "#/texts/4",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/groups/0"
        },
        {
          "$ref": "#/groups/1"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Lists",
      "text": "Lists"
    },
    {
      "self_ref": "#/texts/5",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "enumerated": true,
      "marker": ""
    },
    {
      "self_ref": "#/texts/6",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "enumerated": false,
      "marker": ""
    },
    {
      "self_ref": "#/texts/7",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/8"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Inline code",
      "text": "Inline code"
    },
    {
      "self_ref": "#/texts/8",
      "parent": {
        "$ref": "#/texts/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/9",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/10"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Code block",
      "text": "Code block"
    },
    {
      "self_ref": "#/texts/10",
      "parent": {
        "$ref": "#/texts/9"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/11",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/tables/0"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Table",
      "text": "Table"
    },
    {
      "self_ref": "#/texts/12",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/13"
        },
        {
          "$ref": "#/texts/14"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Raw HTML",
      "text": "Raw HTML"
    },
    {
      "self_ref": "#/texts/13",
      "parent": {
        "$ref": "#/texts/12"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "& < > \" '/div>",
      "text": "& < > \" '/div>"
    },
    {
      "self_ref": "#/texts/14",
      "parent": {
        "$ref": "#/texts/12"
      },
      "children": [
        {
          "$ref": "#/texts/15"
        }
      ],
      "content_layer": "body",
      "label": "section_header",
      "prov": [],
      "orig": "Link",
      "text": "Link",
      "level": 1
    },
    {
      "self_ref": "#/texts/15",
      "parent": {
        "$ref": "#/texts/14"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "& < > \" '",
      "text": "& < > \" '",
      "hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
    }
  ],
  "pictures": [],
  "tables": [
    {
      "self_ref": "#/tables/0",
      "parent": {
        "$ref": "#/texts/11"
      },
      "children": [],
      "content_layer": "body",
      "label": "table",
      "prov": [],
      "captions": [],
      "references": [],
      "footnotes": [],
      "data": {
        "table_cells": [
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 0,
            "end_row_offset_idx": 1,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Key",
            "column_header": true,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 0,
            "end_row_offset_idx": 1,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": "Example",
            "column_header": true,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 1,
            "end_row_offset_idx": 2,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Ampersand",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 1,
            "end_row_offset_idx": 2,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": "&",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 2,
            "end_row_offset_idx": 3,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Less-than",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 2,
            "end_row_offset_idx": 3,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": "<",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 3,
            "end_row_offset_idx": 4,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Greater-than",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 3,
            "end_row_offset_idx": 4,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": ">",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 4,
            "end_row_offset_idx": 5,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Quotes",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 4,
            "end_row_offset_idx": 5,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": "\"",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 5,
            "end_row_offset_idx": 6,
            "start_col_offset_idx": 0,
            "end_col_offset_idx": 1,
            "text": "Apostrophes",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
            "col_span": 1,
            "start_row_offset_idx": 5,
            "end_row_offset_idx": 6,
            "start_col_offset_idx": 1,
            "end_col_offset_idx": 2,
            "text": "'",
            "column_header": false,
            "row_header": false,
            "row_section": false,
            "fillable": false
          }
        ],
        "num_rows": 6,
        "num_cols": 2,
        "grid": [
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 0,
              "end_row_offset_idx": 1,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Key",
              "column_header": true,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 0,
              "end_row_offset_idx": 1,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": "Example",
              "column_header": true,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ],
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 1,
              "end_row_offset_idx": 2,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Ampersand",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 1,
              "end_row_offset_idx": 2,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": "&",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ],
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 2,
              "end_row_offset_idx": 3,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Less-than",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 2,
              "end_row_offset_idx": 3,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": "<",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ],
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 3,
              "end_row_offset_idx": 4,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Greater-than",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 3,
              "end_row_offset_idx": 4,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": ">",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ],
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 4,
              "end_row_offset_idx": 5,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Quotes",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 4,
              "end_row_offset_idx": 5,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": "\"",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ],
          [
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 5,
              "end_row_offset_idx": 6,
              "start_col_offset_idx": 0,
              "end_col_offset_idx": 1,
              "text": "Apostrophes",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
              "col_span": 1,
              "start_row_offset_idx": 5,
              "end_row_offset_idx": 6,
              "start_col_offset_idx": 1,
              "end_col_offset_idx": 2,
              "text": "'",
              "column_header": false,
              "row_header": false,
              "row_section": false,
              "fillable": false
            }
          ]
        ]
      },
      "annotations": []
    }
  ],
  "key_value_items": [],
  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/escaped_characters.md.md
+++ b/tests/data/groundtruth/docling_v2/escaped_characters.md.md
@@ -0,0 +1,41 @@
 # Headers:
 ## &amp; &lt; &gt; " '
 Text: 00:16.000 ----&gt; 00:18.000 &amp; &lt; &gt; " '
 # Lists
 1. &amp; &lt; &gt; " '
 - &amp; &lt; &gt; " '
 # Inline code
 ```
 & < > " '
 ```
 # Code block
 ```
 & < > " '
 ```
 # Table
 | Key          | Example   |
 |--------------|-----------|
 | Ampersand    | &         |
 | Less-than    | <         |
 | Greater-than | >         |
 | Quotes       | "         |
 | Apostrophes  | '         |
 # Raw HTML
 &amp; &lt; &gt; " '/div&gt;
 ## Link
 [&amp; &lt; &gt; " '](https://en.wikipedia.org/wiki/Albert_Einstein)
--- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
+++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
@@ -186,6 +186,7 @@ tables:
        column_header: true
        end_col_offset_idx: 1
        end_row_offset_idx: 1
        fillable: false
        row_header: false
        row_section: false
        row_span: 1
@@ -196,6 +197,7 @@ tables:
        column_header: true
        end_col_offset_idx: 2
        end_row_offset_idx: 1
        fillable: false
        row_header: false
        row_section: false
        row_span: 1
@@ -206,6 +208,7 @@ tables:
        column_header: false
        end_col_offset_idx: 1
        end_row_offset_idx: 2
        fillable: false
        row_header: false
        row_section: false
        row_span: 1
@@ -216,6 +219,7 @@ tables:
        column_header: false
        end_col_offset_idx: 2
        end_row_offset_idx: 2
        fillable: false
        row_header: false
        row_section: false
        row_span: 1
@@ -229,6 +233,7 @@ tables:
      column_header: true
      end_col_offset_idx: 1
      end_row_offset_idx: 1
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -239,6 +244,7 @@ tables:
      column_header: true
      end_col_offset_idx: 2
      end_row_offset_idx: 1
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -249,6 +255,7 @@ tables:
      column_header: false
      end_col_offset_idx: 1
      end_row_offset_idx: 2
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -259,6 +266,7 @@ tables:
      column_header: false
      end_col_offset_idx: 2
      end_row_offset_idx: 2
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -269,6 +277,7 @@ tables:
      column_header: true
      end_col_offset_idx: 1
      end_row_offset_idx: 1
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -279,6 +288,7 @@ tables:
      column_header: true
      end_col_offset_idx: 2
      end_row_offset_idx: 1
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -289,6 +299,7 @@ tables:
      column_header: false
      end_col_offset_idx: 1
      end_row_offset_idx: 2
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -299,6 +310,7 @@ tables:
      column_header: false
      end_col_offset_idx: 2
      end_row_offset_idx: 2
      fillable: false
      row_header: false
      row_section: false
      row_span: 1
@@ -878,4 +890,4 @@ texts:
  prov: []
  self_ref: '#/texts/48'
  text: Table Heading
-version: 1.6.0
+version: 1.7.0
--- a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt
@@ -1,10 +1,10 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
+  item-1 at level 1: text: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
-  item-2 at level 1: paragraph: 
+  item-2 at level 1: text: 
-  item-3 at level 1: paragraph: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
+  item-3 at level 1: text: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
-  item-4 at level 1: paragraph: 
+  item-4 at level 1: text: 
-  item-5 at level 1: paragraph: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
+  item-5 at level 1: text: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
-  item-6 at level 1: paragraph: 
+  item-6 at level 1: text: 
-  item-7 at level 1: paragraph: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
+  item-7 at level 1: text: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
-  item-8 at level 1: paragraph: 
+  item-8 at level 1: text: 
-  item-9 at level 1: paragraph: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.
+  item-9 at level 1: text: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.
--- a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
+++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "lorem_ipsum",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -58,7 +58,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
      "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
@@ -77,7 +77,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -89,7 +89,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
      "text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
@@ -108,7 +108,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -120,7 +120,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
      "text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
@@ -139,7 +139,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -151,7 +151,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
      "text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
@@ -170,7 +170,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -182,7 +182,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
      "text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
--- a/tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
+++ b/tests/data/groundtruth/docling_v2/mixed_without_h1.md.yaml
@@ -136,4 +136,4 @@ texts:
  prov: []
  self_ref: '#/texts/7'
  text: The end!
-version: 1.6.0
+version: 1.7.0
--- a/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt
@@ -1,3 +1,3 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: table with [2x2]
-  item-2 at level 1: paragraph: 
+  item-2 at level 1: text: 
--- a/tests/data/groundtruth/docling_v2/table_with_equations.docx.json
+++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "table_with_equations",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -37,7 +37,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -69,7 +69,8 @@
            "text": "The next cell has an equation",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -81,7 +82,8 @@
            "text": "$A= \\pi r^{2}$",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -93,7 +95,8 @@
            "text": "The next cell has another equation",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -105,7 +108,8 @@
            "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          }
        ],
        "num_rows": 2,
@@ -122,7 +126,8 @@
              "text": "The next cell has an equation",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -134,7 +139,8 @@
              "text": "$A= \\pi r^{2}$",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -148,7 +154,8 @@
              "text": "The next cell has another equation",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -160,7 +167,8 @@
              "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ]
        ]
--- a/tests/data/groundtruth/docling_v2/tablecell.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/tablecell.docx.itxt
@@ -2,9 +2,9 @@ item-0 at level 0: unspecified: group _root_
  item-1 at level 1: list: group list
    item-2 at level 2: list_item: Hello world1
    item-3 at level 2: list_item: Hello2
-  item-4 at level 1: paragraph: 
+  item-4 at level 1: text: 
-  item-5 at level 1: paragraph: Some text before
+  item-5 at level 1: text: Some text before
  item-6 at level 1: table with [3x3]
-  item-7 at level 1: paragraph: 
+  item-7 at level 1: text: 
-  item-8 at level 1: paragraph: 
+  item-8 at level 1: text: 
-  item-9 at level 1: paragraph: Some text after
+  item-9 at level 1: text: Some text after
--- a/tests/data/groundtruth/docling_v2/tablecell.docx.json
+++ b/tests/data/groundtruth/docling_v2/tablecell.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "tablecell",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -112,7 +112,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -124,7 +124,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Some text before",
      "text": "Some text before",
@@ -143,7 +143,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -155,7 +155,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -167,7 +167,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Some text after",
      "text": "Some text after",
@@ -206,7 +206,8 @@
            "text": "Tab1",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -218,7 +219,8 @@
            "text": "Tab2",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -230,7 +232,8 @@
            "text": "Tab3",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -242,7 +245,8 @@
            "text": "A",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -254,7 +258,8 @@
            "text": "B",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -266,7 +271,8 @@
            "text": "C",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -278,7 +284,8 @@
            "text": "D",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -290,7 +297,8 @@
            "text": "E",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -302,7 +310,8 @@
            "text": "F",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          }
        ],
        "num_rows": 3,
@@ -319,7 +328,8 @@
              "text": "Tab1",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -331,7 +341,8 @@
              "text": "Tab2",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -343,7 +354,8 @@
              "text": "Tab3",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -357,7 +369,8 @@
              "text": "A",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -369,7 +382,8 @@
              "text": "B",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -381,7 +395,8 @@
              "text": "C",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -395,7 +410,8 @@
              "text": "D",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -407,7 +423,8 @@
              "text": "E",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -419,7 +436,8 @@
              "text": "F",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ]
        ]
--- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt
@@ -1,8 +1,8 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: Test with three images in unusual formats
+  item-1 at level 1: text: Test with three images in unusual formats
-  item-2 at level 1: paragraph: Raster in emf:
+  item-2 at level 1: text: Raster in emf:
  item-3 at level 1: picture
-  item-4 at level 1: paragraph: Vector in emf:
+  item-4 at level 1: text: Vector in emf:
  item-5 at level 1: picture
-  item-6 at level 1: paragraph: Raster in webp:
+  item-6 at level 1: text: Raster in webp:
  item-7 at level 1: picture
--- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json
+++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "test_emf_docx",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -52,7 +52,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Test with three images in unusual formats",
      "text": "Test with three images in unusual formats",
@@ -71,7 +71,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Raster in emf:",
      "text": "Raster in emf:",
@@ -90,7 +90,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Vector in emf:",
      "text": "Vector in emf:",
@@ -109,7 +109,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Raster in webp:",
      "text": "Raster in webp:",
--- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt
@@ -1,90 +1,90 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
+  item-1 at level 1: text: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
-  item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
+  item-2 at level 1: text: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
-  item-3 at level 1: paragraph: 
+  item-3 at level 1: text: 
  item-4 at level 1: section: group textbox
-    item-5 at level 2: paragraph: Student falls ill
+    item-5 at level 2: text: Student falls ill
-    item-6 at level 2: paragraph: 
+    item-6 at level 2: text: 
    item-7 at level 2: list: group list
      item-8 at level 3: list_item: Suggested Reportable Symptoms:
 ＊ ... sh
 ＊ Blisters
 ＊ Headache
 ＊ Sore throat
-  item-9 at level 1: paragraph: 
+  item-9 at level 1: text: 
-  item-10 at level 1: paragraph: 
+  item-10 at level 1: text: 
  item-11 at level 1: section: group textbox
-    item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
+    item-12 at level 2: text: If a caregiver suspects that wit ... the same suggested reportable symptoms
-  item-13 at level 1: paragraph: 
+  item-13 at level 1: text: 
-  item-14 at level 1: paragraph: 
+  item-14 at level 1: text: 
-  item-15 at level 1: paragraph: 
+  item-15 at level 1: text: 
-  item-16 at level 1: paragraph: 
+  item-16 at level 1: text: 
  item-17 at level 1: section: group textbox
-    item-18 at level 2: paragraph: Yes
+    item-18 at level 2: text: Yes
-  item-19 at level 1: paragraph: 
+  item-19 at level 1: text: 
-  item-20 at level 1: paragraph: 
+  item-20 at level 1: text: 
  item-21 at level 1: section: group textbox
    item-22 at level 2: list: group list
      item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
      item-24 at level 3: list_item: A report must also be submitted  ... d Infectious Disease Reporting System.
-    item-25 at level 2: paragraph: 
+    item-25 at level 2: text: 
  item-26 at level 1: list: group list
-  item-27 at level 1: paragraph: 
+  item-27 at level 1: text: 
-  item-28 at level 1: paragraph: 
+  item-28 at level 1: text: 
-  item-29 at level 1: paragraph: 
+  item-29 at level 1: text: 
-  item-30 at level 1: paragraph: 
+  item-30 at level 1: text: 
-  item-31 at level 1: paragraph: 
+  item-31 at level 1: text: 
  item-32 at level 1: section: group textbox
-    item-33 at level 2: paragraph: Health Bureau:
+    item-33 at level 2: text: Health Bureau:
-    item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
+    item-34 at level 2: text: Upon receiving a report from the ... rt to the Centers for Disease Control.
    item-35 at level 2: list: group list
      item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
      item-37 at level 3: list_item: Implement appropriate epidemic p ...  the Communicable Disease Control Act.
-    item-38 at level 2: paragraph: 
+    item-38 at level 2: text: 
  item-39 at level 1: list: group list
-  item-40 at level 1: paragraph: 
+  item-40 at level 1: text: 
  item-41 at level 1: section: group textbox
-    item-42 at level 2: paragraph: Department of Education:
+    item-42 at level 2: text: Department of Education:
 Collabo ... vention measures at all school levels.
-  item-43 at level 1: paragraph: 
+  item-43 at level 1: text: 
-  item-44 at level 1: paragraph: 
+  item-44 at level 1: text: 
-  item-45 at level 1: paragraph: 
+  item-45 at level 1: text: 
-  item-46 at level 1: paragraph: 
+  item-46 at level 1: text: 
-  item-47 at level 1: paragraph: 
+  item-47 at level 1: text: 
-  item-48 at level 1: paragraph: 
+  item-48 at level 1: text: 
-  item-49 at level 1: paragraph: 
+  item-49 at level 1: text: 
  item-50 at level 1: section: group textbox
    item-51 at level 2: inline: group group
-      item-52 at level 3: paragraph: The Health Bureau will handle
+      item-52 at level 3: text: The Health Bureau will handle
-      item-53 at level 3: paragraph: reporting and specimen collection
+      item-53 at level 3: text: reporting and specimen collection
-      item-54 at level 3: paragraph: .
+      item-54 at level 3: text: .
-    item-55 at level 2: paragraph: 
+    item-55 at level 2: text: 
-  item-56 at level 1: paragraph: 
+  item-56 at level 1: text: 
-  item-57 at level 1: paragraph: 
+  item-57 at level 1: text: 
-  item-58 at level 1: paragraph: 
+  item-58 at level 1: text: 
  item-59 at level 1: section: group textbox
-    item-60 at level 2: paragraph: Whether the epidemic has eased.
+    item-60 at level 2: text: Whether the epidemic has eased.
-    item-61 at level 2: paragraph: 
+    item-61 at level 2: text: 
-  item-62 at level 1: paragraph: 
+  item-62 at level 1: text: 
  item-63 at level 1: section: group textbox
-    item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
+    item-64 at level 2: text: Whether the test results are pos ... legally designated infectious disease.
-    item-65 at level 2: paragraph: No
+    item-65 at level 2: text: No
-  item-66 at level 1: paragraph: 
+  item-66 at level 1: text: 
-  item-67 at level 1: paragraph: 
+  item-67 at level 1: text: 
  item-68 at level 1: section: group textbox
-    item-69 at level 2: paragraph: Yes
+    item-69 at level 2: text: Yes
-  item-70 at level 1: paragraph: 
+  item-70 at level 1: text: 
  item-71 at level 1: section: group textbox
-    item-72 at level 2: paragraph: Yes
+    item-72 at level 2: text: Yes
-  item-73 at level 1: paragraph: 
+  item-73 at level 1: text: 
-  item-74 at level 1: paragraph: 
+  item-74 at level 1: text: 
  item-75 at level 1: section: group textbox
-    item-76 at level 2: paragraph: Case closed.
+    item-76 at level 2: text: Case closed.
-    item-77 at level 2: paragraph: 
+    item-77 at level 2: text: 
-    item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
+    item-78 at level 2: text: The Health Bureau will carry out ... ters for Disease Control if necessary.
-  item-79 at level 1: paragraph: 
+  item-79 at level 1: text: 
  item-80 at level 1: section: group textbox
-    item-81 at level 2: paragraph: No
+    item-81 at level 2: text: No
-  item-82 at level 1: paragraph: 
+  item-82 at level 1: text: 
-  item-83 at level 1: paragraph: 
+  item-83 at level 1: text: 
-  item-84 at level 1: paragraph: 
+  item-84 at level 1: text: 
--- a/tests/data/groundtruth/docling_v2/textbox.docx.json
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "textbox",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -491,7 +491,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
      "text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
@@ -510,7 +510,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
      "text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
@@ -529,7 +529,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -541,7 +541,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Student falls ill",
      "text": "Student falls ill",
@@ -560,7 +560,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -593,7 +593,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -605,7 +605,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -617,7 +617,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
      "text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
@@ -636,7 +636,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -648,7 +648,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -660,7 +660,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -672,7 +672,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -684,7 +684,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Yes",
      "text": "Yes",
@@ -703,7 +703,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -715,7 +715,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -769,7 +769,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -781,7 +781,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -793,7 +793,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -805,7 +805,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -817,7 +817,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -829,7 +829,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -841,7 +841,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Health Bureau:",
      "text": "Health Bureau:",
@@ -860,7 +860,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
      "text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
@@ -921,7 +921,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -933,7 +933,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -945,7 +945,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
      "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
@@ -964,7 +964,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -976,7 +976,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -988,7 +988,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1000,7 +1000,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1012,7 +1012,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1024,7 +1024,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1036,7 +1036,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1048,7 +1048,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "The Health Bureau will handle",
      "text": "The Health Bureau will handle",
@@ -1067,7 +1067,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "reporting and specimen collection",
      "text": "reporting and specimen collection",
@@ -1086,7 +1086,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": ".",
      "text": ".",
@@ -1105,7 +1105,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1117,7 +1117,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1129,7 +1129,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1141,7 +1141,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1153,7 +1153,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Whether the epidemic has eased.",
      "text": "Whether the epidemic has eased.",
@@ -1172,7 +1172,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1184,7 +1184,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1196,7 +1196,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Whether the test results are positive for a legally designated infectious disease.",
      "text": "Whether the test results are positive for a legally designated infectious disease.",
@@ -1215,7 +1215,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "No",
      "text": "No",
@@ -1234,7 +1234,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1246,7 +1246,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1258,7 +1258,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Yes",
      "text": "Yes",
@@ -1277,7 +1277,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1289,7 +1289,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Yes",
      "text": "Yes",
@@ -1308,7 +1308,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1320,7 +1320,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1332,7 +1332,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Case closed.",
      "text": "Case closed.",
@@ -1351,7 +1351,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1363,7 +1363,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
      "text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
@@ -1382,7 +1382,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1394,7 +1394,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "No",
      "text": "No",
@@ -1413,7 +1413,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1425,7 +1425,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1437,7 +1437,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
@@ -1,18 +1,18 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: italic
+  item-1 at level 1: text: italic
-  item-2 at level 1: paragraph: bold
+  item-2 at level 1: text: bold
-  item-3 at level 1: paragraph: underline
+  item-3 at level 1: text: underline
-  item-4 at level 1: paragraph: hyperlink
+  item-4 at level 1: text: hyperlink
-  item-5 at level 1: paragraph: italic and bold hyperlink
+  item-5 at level 1: text: italic and bold hyperlink
  item-6 at level 1: inline: group group
-    item-7 at level 2: paragraph: Normal
+    item-7 at level 2: text: Normal
-    item-8 at level 2: paragraph: italic
+    item-8 at level 2: text: italic
-    item-9 at level 2: paragraph: bold
+    item-9 at level 2: text: bold
-    item-10 at level 2: paragraph: underline
+    item-10 at level 2: text: underline
-    item-11 at level 2: paragraph: and
+    item-11 at level 2: text: and
-    item-12 at level 2: paragraph: hyperlink
+    item-12 at level 2: text: hyperlink
-    item-13 at level 2: paragraph: on the same line
+    item-13 at level 2: text: on the same line
-  item-14 at level 1: paragraph: 
+  item-14 at level 1: text: 
  item-15 at level 1: list: group list
    item-16 at level 2: list_item: Italic bullet 1
    item-17 at level 2: list_item: Bold bullet 2
@@ -29,4 +29,4 @@ item-0 at level 0: unspecified: group _root_
          item-28 at level 5: text: Nested
          item-29 at level 5: text: italic
          item-30 at level 5: text: bold
-  item-31 at level 1: paragraph: 
+  item-31 at level 1: text: 
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "unit_test_formatting",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -174,7 +174,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "italic",
      "text": "italic",
@@ -193,7 +193,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "bold",
      "text": "bold",
@@ -212,7 +212,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "underline",
      "text": "underline",
@@ -231,7 +231,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "hyperlink",
      "text": "hyperlink",
@@ -251,7 +251,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "italic and bold hyperlink",
      "text": "italic and bold hyperlink",
@@ -271,7 +271,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Normal",
      "text": "Normal",
@@ -290,7 +290,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "italic",
      "text": "italic",
@@ -309,7 +309,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "bold",
      "text": "bold",
@@ -328,7 +328,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "underline",
      "text": "underline",
@@ -347,7 +347,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "and",
      "text": "and",
@@ -366,7 +366,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "hyperlink",
      "text": "hyperlink",
@@ -386,7 +386,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "on the same line",
      "text": "on the same line",
@@ -405,7 +405,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -649,7 +649,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt
@@ -1,48 +1,48 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: Test Document
-    item-2 at level 2: paragraph: 
+    item-2 at level 2: text: 
    item-3 at level 2: section_header: Section 1
-      item-4 at level 3: paragraph: 
+      item-4 at level 3: text: 
-      item-5 at level 3: paragraph: Paragraph 1.1
+      item-5 at level 3: text: Paragraph 1.1
-      item-6 at level 3: paragraph: 
+      item-6 at level 3: text: 
-      item-7 at level 3: paragraph: Paragraph 1.2
+      item-7 at level 3: text: Paragraph 1.2
-      item-8 at level 3: paragraph: 
+      item-8 at level 3: text: 
      item-9 at level 3: section_header: Section 1.1
-        item-10 at level 4: paragraph: 
+        item-10 at level 4: text: 
-        item-11 at level 4: paragraph: Paragraph 1.1.1
+        item-11 at level 4: text: Paragraph 1.1.1
-        item-12 at level 4: paragraph: 
+        item-12 at level 4: text: 
-        item-13 at level 4: paragraph: Paragraph 1.1.2
+        item-13 at level 4: text: Paragraph 1.1.2
-        item-14 at level 4: paragraph: 
+        item-14 at level 4: text: 
      item-15 at level 3: section_header: Section 1.2
-        item-16 at level 4: paragraph: 
+        item-16 at level 4: text: 
-        item-17 at level 4: paragraph: Paragraph 1.1.1
+        item-17 at level 4: text: Paragraph 1.1.1
-        item-18 at level 4: paragraph: 
+        item-18 at level 4: text: 
-        item-19 at level 4: paragraph: Paragraph 1.1.2
+        item-19 at level 4: text: Paragraph 1.1.2
-        item-20 at level 4: paragraph: 
+        item-20 at level 4: text: 
        item-21 at level 4: section_header: Section 1.2.3
-          item-22 at level 5: paragraph: 
+          item-22 at level 5: text: 
-          item-23 at level 5: paragraph: Paragraph 1.2.3.1
+          item-23 at level 5: text: Paragraph 1.2.3.1
-          item-24 at level 5: paragraph: 
+          item-24 at level 5: text: 
-          item-25 at level 5: paragraph: Paragraph 1.2.3.1
+          item-25 at level 5: text: Paragraph 1.2.3.1
-          item-26 at level 5: paragraph: 
+          item-26 at level 5: text: 
-          item-27 at level 5: paragraph: 
+          item-27 at level 5: text: 
    item-28 at level 2: section_header: Section 2
-      item-29 at level 3: paragraph: 
+      item-29 at level 3: text: 
-      item-30 at level 3: paragraph: Paragraph 2.1
+      item-30 at level 3: text: Paragraph 2.1
-      item-31 at level 3: paragraph: 
+      item-31 at level 3: text: 
-      item-32 at level 3: paragraph: Paragraph 2.2
+      item-32 at level 3: text: Paragraph 2.2
-      item-33 at level 3: paragraph: 
+      item-33 at level 3: text: 
      item-34 at level 3: section: group header-2
        item-35 at level 4: section_header: Section 2.1.1
-          item-36 at level 5: paragraph: 
+          item-36 at level 5: text: 
-          item-37 at level 5: paragraph: Paragraph 2.1.1.1
+          item-37 at level 5: text: Paragraph 2.1.1.1
-          item-38 at level 5: paragraph: 
+          item-38 at level 5: text: 
-          item-39 at level 5: paragraph: Paragraph 2.1.1.1
+          item-39 at level 5: text: Paragraph 2.1.1.1
-          item-40 at level 5: paragraph: 
+          item-40 at level 5: text: 
      item-41 at level 3: section_header: Section 2.1
-        item-42 at level 4: paragraph: 
+        item-42 at level 4: text: 
-        item-43 at level 4: paragraph: Paragraph 2.1.1
+        item-43 at level 4: text: Paragraph 2.1.1
-        item-44 at level 4: paragraph: 
+        item-44 at level 4: text: 
-        item-45 at level 4: paragraph: Paragraph 2.1.2
+        item-45 at level 4: text: Paragraph 2.1.2
-        item-46 at level 4: paragraph: 
+        item-46 at level 4: text: 
-        item-47 at level 4: paragraph: 
+        item-47 at level 4: text: 
--- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "unit_test_headers",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -71,7 +71,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -118,7 +118,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -130,7 +130,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1",
      "text": "Paragraph 1.1",
@@ -149,7 +149,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -161,7 +161,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2",
      "text": "Paragraph 1.2",
@@ -180,7 +180,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -221,7 +221,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -233,7 +233,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.1",
      "text": "Paragraph 1.1.1",
@@ -252,7 +252,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -264,7 +264,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.2",
      "text": "Paragraph 1.1.2",
@@ -283,7 +283,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -327,7 +327,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -339,7 +339,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.1",
      "text": "Paragraph 1.1.1",
@@ -358,7 +358,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -370,7 +370,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.2",
      "text": "Paragraph 1.1.2",
@@ -389,7 +389,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -433,7 +433,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -445,7 +445,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2.3.1",
      "text": "Paragraph 1.2.3.1",
@@ -464,7 +464,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -476,7 +476,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2.3.1",
      "text": "Paragraph 1.2.3.1",
@@ -495,7 +495,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -507,7 +507,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -554,7 +554,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -566,7 +566,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1",
      "text": "Paragraph 2.1",
@@ -585,7 +585,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -597,7 +597,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.2",
      "text": "Paragraph 2.2",
@@ -616,7 +616,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -657,7 +657,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -669,7 +669,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1.1",
      "text": "Paragraph 2.1.1.1",
@@ -688,7 +688,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -700,7 +700,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1.1",
      "text": "Paragraph 2.1.1.1",
@@ -719,7 +719,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -763,7 +763,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -775,7 +775,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1",
      "text": "Paragraph 2.1.1",
@@ -794,7 +794,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -806,7 +806,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.2",
      "text": "Paragraph 2.1.2",
@@ -825,7 +825,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -837,7 +837,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt
@@ -1,52 +1,52 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: Test Document
-    item-2 at level 2: paragraph: 
+    item-2 at level 2: text: 
    item-3 at level 2: section_header: 1 Section 1
-  item-4 at level 1: paragraph: 
+  item-4 at level 1: text: 
-  item-5 at level 1: paragraph: Paragraph 1.1
+  item-5 at level 1: text: Paragraph 1.1
-  item-6 at level 1: paragraph: 
+  item-6 at level 1: text: 
-  item-7 at level 1: paragraph: Paragraph 1.2
+  item-7 at level 1: text: Paragraph 1.2
-  item-8 at level 1: paragraph: 
+  item-8 at level 1: text: 
  item-9 at level 1: section: group header-0
    item-10 at level 2: section: group header-1
      item-11 at level 3: section_header: 1.1 Section 1.1
-        item-12 at level 4: paragraph: 
+        item-12 at level 4: text: 
-        item-13 at level 4: paragraph: Paragraph 1.1.1
+        item-13 at level 4: text: Paragraph 1.1.1
-        item-14 at level 4: paragraph: 
+        item-14 at level 4: text: 
-        item-15 at level 4: paragraph: Paragraph 1.1.2
+        item-15 at level 4: text: Paragraph 1.1.2
-        item-16 at level 4: paragraph: 
+        item-16 at level 4: text: 
      item-17 at level 3: section_header: 1.2 Section 1.2
-        item-18 at level 4: paragraph: 
+        item-18 at level 4: text: 
-        item-19 at level 4: paragraph: Paragraph 1.1.1
+        item-19 at level 4: text: Paragraph 1.1.1
-        item-20 at level 4: paragraph: 
+        item-20 at level 4: text: 
-        item-21 at level 4: paragraph: Paragraph 1.1.2
+        item-21 at level 4: text: Paragraph 1.1.2
-        item-22 at level 4: paragraph: 
+        item-22 at level 4: text: 
        item-23 at level 4: section_header: 1.2.1 Section 1.2.3
-          item-24 at level 5: paragraph: 
+          item-24 at level 5: text: 
-          item-25 at level 5: paragraph: Paragraph 1.2.3.1
+          item-25 at level 5: text: Paragraph 1.2.3.1
-          item-26 at level 5: paragraph: 
+          item-26 at level 5: text: 
-          item-27 at level 5: paragraph: Paragraph 1.2.3.1
+          item-27 at level 5: text: Paragraph 1.2.3.1
-          item-28 at level 5: paragraph: 
+          item-28 at level 5: text: 
-          item-29 at level 5: paragraph: 
+          item-29 at level 5: text: 
    item-30 at level 2: section_header: 2 Section 2
-  item-31 at level 1: paragraph: 
+  item-31 at level 1: text: 
-  item-32 at level 1: paragraph: Paragraph 2.1
+  item-32 at level 1: text: Paragraph 2.1
-  item-33 at level 1: paragraph: 
+  item-33 at level 1: text: 
-  item-34 at level 1: paragraph: Paragraph 2.2
+  item-34 at level 1: text: Paragraph 2.2
-  item-35 at level 1: paragraph: 
+  item-35 at level 1: text: 
  item-36 at level 1: section: group header-0
    item-37 at level 2: section: group header-1
      item-38 at level 3: section: group header-2
        item-39 at level 4: section_header: 2.1.1 Section 2.1.1
-          item-40 at level 5: paragraph: 
+          item-40 at level 5: text: 
-          item-41 at level 5: paragraph: Paragraph 2.1.1.1
+          item-41 at level 5: text: Paragraph 2.1.1.1
-          item-42 at level 5: paragraph: 
+          item-42 at level 5: text: 
-          item-43 at level 5: paragraph: Paragraph 2.1.1.1
+          item-43 at level 5: text: Paragraph 2.1.1.1
-          item-44 at level 5: paragraph: 
+          item-44 at level 5: text: 
      item-45 at level 3: section_header: 2.2 Section 2.1
-        item-46 at level 4: paragraph: 
+        item-46 at level 4: text: 
-        item-47 at level 4: paragraph: Paragraph 2.1.1
+        item-47 at level 4: text: Paragraph 2.1.1
-        item-48 at level 4: paragraph: 
+        item-48 at level 4: text: 
-        item-49 at level 4: paragraph: Paragraph 2.1.2
+        item-49 at level 4: text: Paragraph 2.1.2
-        item-50 at level 4: paragraph: 
+        item-50 at level 4: text: 
-        item-51 at level 4: paragraph: 
+        item-51 at level 4: text: 
--- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "unit_test_headers_numbered",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -169,7 +169,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -194,7 +194,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -206,7 +206,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1",
      "text": "Paragraph 1.1",
@@ -225,7 +225,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -237,7 +237,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2",
      "text": "Paragraph 1.2",
@@ -256,7 +256,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -297,7 +297,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -309,7 +309,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.1",
      "text": "Paragraph 1.1.1",
@@ -328,7 +328,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -340,7 +340,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.2",
      "text": "Paragraph 1.1.2",
@@ -359,7 +359,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -403,7 +403,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -415,7 +415,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.1",
      "text": "Paragraph 1.1.1",
@@ -434,7 +434,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -446,7 +446,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.1.2",
      "text": "Paragraph 1.1.2",
@@ -465,7 +465,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -509,7 +509,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -521,7 +521,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2.3.1",
      "text": "Paragraph 1.2.3.1",
@@ -540,7 +540,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -552,7 +552,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 1.2.3.1",
      "text": "Paragraph 1.2.3.1",
@@ -571,7 +571,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -583,7 +583,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -608,7 +608,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -620,7 +620,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1",
      "text": "Paragraph 2.1",
@@ -639,7 +639,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -651,7 +651,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.2",
      "text": "Paragraph 2.2",
@@ -670,7 +670,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -711,7 +711,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -723,7 +723,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1.1",
      "text": "Paragraph 2.1.1.1",
@@ -742,7 +742,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -754,7 +754,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1.1",
      "text": "Paragraph 2.1.1.1",
@@ -773,7 +773,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -817,7 +817,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -829,7 +829,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1",
      "text": "Paragraph 2.1.1",
@@ -848,7 +848,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -860,7 +860,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.2",
      "text": "Paragraph 2.1.2",
@@ -879,7 +879,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -891,7 +891,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt
@@ -1,25 +1,25 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group header-0
    item-2 at level 2: section_header: Test Document
-      item-3 at level 3: paragraph: 
+      item-3 at level 3: text: 
-      item-4 at level 3: paragraph: 
+      item-4 at level 3: text: 
-      item-5 at level 3: paragraph: Paragraph 2.1.1
+      item-5 at level 3: text: Paragraph 2.1.1
-      item-6 at level 3: paragraph: 
+      item-6 at level 3: text: 
-      item-7 at level 3: paragraph: Paragraph 2.1.2
+      item-7 at level 3: text: Paragraph 2.1.2
-      item-8 at level 3: paragraph: 
+      item-8 at level 3: text: 
      item-9 at level 3: section: group header-2
        item-10 at level 4: section_header: Test 1:
          item-11 at level 5: list: group list
            item-12 at level 6: list_item: List item 1
            item-13 at level 6: list_item: List item 2
            item-14 at level 6: list_item: List item 3
-          item-15 at level 5: paragraph: 
+          item-15 at level 5: text: 
        item-16 at level 4: section_header: Test 2:
          item-17 at level 5: list: group list
            item-18 at level 6: list_item: List item a
            item-19 at level 6: list_item: List item b
            item-20 at level 6: list_item: List item c
-          item-21 at level 5: paragraph: 
+          item-21 at level 5: text: 
        item-22 at level 4: section_header: Test 3:
          item-23 at level 5: list: group list
            item-24 at level 6: list_item: List item 1
@@ -29,14 +29,14 @@ item-0 at level 0: unspecified: group _root_
              item-28 at level 7: list_item: List item 1.2
              item-29 at level 7: list_item: List item 1.3
            item-30 at level 6: list_item: List item 3
-          item-31 at level 5: paragraph: 
+          item-31 at level 5: text: 
        item-32 at level 4: section_header: Test 4:
          item-33 at level 5: list: group list
            item-34 at level 6: list_item: List item 1
            item-35 at level 6: list: group list
              item-36 at level 7: list_item: List item 1.1
            item-37 at level 6: list_item: List item 2
-          item-38 at level 5: paragraph: 
+          item-38 at level 5: text: 
        item-39 at level 4: section_header: Test 5:
          item-40 at level 5: list: group list
            item-41 at level 6: list_item: List item 1
@@ -45,7 +45,7 @@ item-0 at level 0: unspecified: group _root_
              item-44 at level 7: list: group list
                item-45 at level 8: list_item: List item 1.1.1
            item-46 at level 6: list_item: List item 3
-          item-47 at level 5: paragraph: 
+          item-47 at level 5: text: 
        item-48 at level 4: section_header: Test 6:
          item-49 at level 5: list: group list
            item-50 at level 6: list_item: List item 1
@@ -56,6 +56,6 @@ item-0 at level 0: unspecified: group _root_
              item-55 at level 7: list: group list
                item-56 at level 8: list_item: List item 1.2.1
            item-57 at level 6: list_item: List item 3
-          item-58 at level 5: paragraph: 
+          item-58 at level 5: text: 
-          item-59 at level 5: paragraph: 
+          item-59 at level 5: text: 
-          item-60 at level 5: paragraph: 
+          item-60 at level 5: text: 
--- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "unit_test_lists",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -338,7 +338,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -350,7 +350,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -362,7 +362,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.1",
      "text": "Paragraph 2.1.1",
@@ -381,7 +381,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -393,7 +393,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Paragraph 2.1.2",
      "text": "Paragraph 2.1.2",
@@ -412,7 +412,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -507,7 +507,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -602,7 +602,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -760,7 +760,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -855,7 +855,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -971,7 +971,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1135,7 +1135,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1147,7 +1147,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -1159,7 +1159,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -0,0 +1,66 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 00:11.000 --> 00:13.000
    item-3 at level 2: inline: group WebVTT cue voice span
      item-4 at level 3: text: Roger Bingham: 
      item-5 at level 3: text: We are in New York City
  item-6 at level 1: section: group WebVTT cue block
    item-7 at level 2: text: 00:13.000 --> 00:16.000
    item-8 at level 2: inline: group WebVTT cue voice span
      item-9 at level 3: text: Roger Bingham: 
      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
  item-11 at level 1: section: group WebVTT cue block
    item-12 at level 2: text: 00:16.000 --> 00:18.000
    item-13 at level 2: inline: group WebVTT cue voice span
      item-14 at level 3: text: Roger Bingham: 
      item-15 at level 3: text: from the American Museum of Natural History
  item-16 at level 1: section: group WebVTT cue block
    item-17 at level 2: text: 00:18.000 --> 00:20.000
    item-18 at level 2: inline: group WebVTT cue voice span
      item-19 at level 3: text: Roger Bingham: 
      item-20 at level 3: text: And with me is Neil deGrasse Tyson
  item-21 at level 1: section: group WebVTT cue block
    item-22 at level 2: text: 00:20.000 --> 00:22.000
    item-23 at level 2: inline: group WebVTT cue voice span
      item-24 at level 3: text: Roger Bingham: 
      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
  item-26 at level 1: section: group WebVTT cue block
    item-27 at level 2: text: 00:22.000 --> 00:24.000
    item-28 at level 2: inline: group WebVTT cue voice span
      item-29 at level 3: text: Roger Bingham: 
      item-30 at level 3: text: at the AMNH.
  item-31 at level 1: section: group WebVTT cue block
    item-32 at level 2: text: 00:24.000 --> 00:26.000
    item-33 at level 2: inline: group WebVTT cue voice span
      item-34 at level 3: text: Roger Bingham: 
      item-35 at level 3: text: Thank you for walking down here.
  item-36 at level 1: section: group WebVTT cue block
    item-37 at level 2: text: 00:27.000 --> 00:30.000
    item-38 at level 2: inline: group WebVTT cue voice span
      item-39 at level 3: text: Roger Bingham: 
      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
  item-41 at level 1: section: group WebVTT cue block
    item-42 at level 2: text: 00:30.000 --> 00:31.500
    item-43 at level 2: inline: group WebVTT cue voice span
      item-44 at level 3: text: Roger Bingham: 
      item-45 at level 3: text: When we e-mailed—
  item-46 at level 1: section: group WebVTT cue block
    item-47 at level 2: text: 00:30.500 --> 00:32.500
    item-48 at level 2: inline: group WebVTT cue voice span
      item-49 at level 3: text: Neil deGrasse Tyson: 
      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
  item-51 at level 1: section: group WebVTT cue block
    item-52 at level 2: text: 00:32.000 --> 00:35.500
    item-53 at level 2: inline: group WebVTT cue voice span
      item-54 at level 3: text: Roger Bingham: 
      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
  item-56 at level 1: section: group WebVTT cue block
    item-57 at level 2: text: 00:32.500 --> 00:33.500
    item-58 at level 2: inline: group WebVTT cue voice span
      item-59 at level 3: text: Neil deGrasse Tyson: 
      item-60 at level 3: text: Laughs
  item-61 at level 1: section: group WebVTT cue block
    item-62 at level 2: text: 00:35.500 --> 00:38.000
    item-63 at level 2: inline: group WebVTT cue voice span
      item-64 at level 3: text: Roger Bingham: 
      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -0,0 +1,51 @@
 00:11.000 --> 00:13.000
 Roger Bingham:  We are in New York City
 00:13.000 --> 00:16.000
 Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
 00:16.000 --> 00:18.000
 Roger Bingham:  from the American Museum of Natural History
 00:18.000 --> 00:20.000
 Roger Bingham:  And with me is Neil deGrasse Tyson
 00:20.000 --> 00:22.000
 Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
 00:22.000 --> 00:24.000
 Roger Bingham:  at the AMNH.
 00:24.000 --> 00:26.000
 Roger Bingham:  Thank you for walking down here.
 00:27.000 --> 00:30.000
 Roger Bingham:  And I want to do a follow-up on the last conversation we did.
 00:30.000 --> 00:31.500
 Roger Bingham:  When we e-mailed—
 00:30.500 --> 00:32.500
 Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
 00:32.000 --> 00:35.500
 Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
 00:32.500 --> 00:33.500
 Neil deGrasse Tyson:  *Laughs*
 00:35.500 --> 00:38.000
 Roger Bingham:  You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -0,0 +1,22 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 00:00.000 --> 00:02.000
    item-3 at level 2: inline: group WebVTT cue voice span
      item-4 at level 3: text: Esme (first, loud): 
      item-5 at level 3: text: It’s a blue apple tree!
  item-6 at level 1: section: group WebVTT cue block
    item-7 at level 2: text: 00:02.000 --> 00:04.000
    item-8 at level 2: inline: group WebVTT cue voice span
      item-9 at level 3: text: Mary: 
      item-10 at level 3: text: No way!
  item-11 at level 1: section: group WebVTT cue block
    item-12 at level 2: text: 00:04.000 --> 00:06.000
    item-13 at level 2: inline: group WebVTT cue voice span
      item-14 at level 3: text: Esme: 
      item-15 at level 3: text: Hee!
    item-16 at level 2: text: laughter
  item-17 at level 1: section: group WebVTT cue block
    item-18 at level 2: text: 00:06.000 --> 00:08.000
    item-19 at level 2: inline: group WebVTT cue voice span
      item-20 at level 3: text: Mary (loud): 
      item-21 at level 3: text: That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -0,0 +1,376 @@
 {
  "schema_name": "DoclingDocument",
  "version": "1.6.0",
  "name": "webvtt_example_02",
  "origin": {
    "mimetype": "text/vtt",
    "binary_hash": 12867774546881601731,
    "filename": "webvtt_example_02.vtt"
  },
  "furniture": {
    "self_ref": "#/furniture",
    "children": [],
    "content_layer": "furniture",
    "name": "_root_",
    "label": "unspecified"
  },
  "body": {
    "self_ref": "#/body",
    "children": [
      {
        "$ref": "#/groups/0"
      },
      {
        "$ref": "#/groups/2"
      },
      {
        "$ref": "#/groups/4"
      },
      {
        "$ref": "#/groups/6"
      }
    ],
    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
  "groups": [
    {
      "self_ref": "#/groups/0",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/0"
        },
        {
          "$ref": "#/groups/1"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/1",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [
        {
          "$ref": "#/texts/1"
        },
        {
          "$ref": "#/texts/2"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/2",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/3"
        },
        {
          "$ref": "#/groups/3"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/3",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [
        {
          "$ref": "#/texts/4"
        },
        {
          "$ref": "#/texts/5"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/4",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/6"
        },
        {
          "$ref": "#/groups/5"
        },
        {
          "$ref": "#/texts/9"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/5",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/texts/7"
        },
        {
          "$ref": "#/texts/8"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/6",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/10"
        },
        {
          "$ref": "#/groups/7"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/7",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [
        {
          "$ref": "#/texts/11"
        },
        {
          "$ref": "#/texts/12"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    }
  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:00.000 --> 00:02.000",
      "text": "00:00.000 --> 00:02.000"
    },
    {
      "self_ref": "#/texts/1",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Esme (first, loud): ",
      "text": "Esme (first, loud): "
    },
    {
      "self_ref": "#/texts/2",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "It’s a blue apple tree!",
      "text": "It’s a blue apple tree!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/3",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:02.000 --> 00:04.000",
      "text": "00:02.000 --> 00:04.000"
    },
    {
      "self_ref": "#/texts/4",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Mary: ",
      "text": "Mary: "
    },
    {
      "self_ref": "#/texts/5",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "No way!",
      "text": "No way!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/6",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:04.000 --> 00:06.000",
      "text": "00:04.000 --> 00:06.000"
    },
    {
      "self_ref": "#/texts/7",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Esme: ",
      "text": "Esme: "
    },
    {
      "self_ref": "#/texts/8",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Hee!",
      "text": "Hee!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/9",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "laughter",
      "text": "laughter",
      "formatting": {
        "bold": false,
        "italic": true,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/10",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:06.000 --> 00:08.000",
      "text": "00:06.000 --> 00:08.000"
    },
    {
      "self_ref": "#/texts/11",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Mary (loud): ",
      "text": "Mary (loud): "
    },
    {
      "self_ref": "#/texts/12",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "That’s awesome!",
      "text": "That’s awesome!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    }
  ],
  "pictures": [],
  "tables": [],
  "key_value_items": [],
  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -0,0 +1,17 @@
 00:00.000 --> 00:02.000
 Esme (first, loud):  It’s a blue apple tree!
 00:02.000 --> 00:04.000
 Mary:  No way!
 00:04.000 --> 00:06.000
 Esme:  Hee!
 *laughter*
 00:06.000 --> 00:08.000
 Mary (loud):  That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -0,0 +1,77 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
    item-4 at level 2: inline: group WebVTT cue voice span
      item-5 at level 3: text: Speaker A: 
      item-6 at level 3: text: OK, I think now we should be recording
  item-7 at level 1: section: group WebVTT cue block
    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
    item-10 at level 2: inline: group WebVTT cue voice span
      item-11 at level 3: text: Speaker A: 
      item-12 at level 3: text: properly.
  item-13 at level 1: section: group WebVTT cue block
    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
    item-16 at level 2: text: Good.
  item-17 at level 1: section: group WebVTT cue block
    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
    item-20 at level 2: inline: group WebVTT cue voice span
      item-21 at level 3: text: Speaker A: 
      item-22 at level 3: text: Yeah.
  item-23 at level 1: section: group WebVTT cue block
    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
    item-26 at level 2: inline: group WebVTT cue voice span
      item-27 at level 3: text: Speaker B: 
      item-28 at level 3: text: I was also thinking.
  item-29 at level 1: section: group WebVTT cue block
    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
    item-32 at level 2: inline: group WebVTT cue voice span
      item-33 at level 3: text: Speaker B: 
      item-34 at level 3: text: Would be maybe good to create items,
  item-35 at level 1: section: group WebVTT cue block
    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
    item-38 at level 2: inline: group WebVTT cue voice span
      item-39 at level 3: text: Speaker B: 
      item-40 at level 3: text: some metadata, some options that can be specific.
  item-41 at level 1: section: group WebVTT cue block
    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
    item-44 at level 2: inline: group WebVTT cue voice span
      item-45 at level 3: text: Speaker A: 
      item-46 at level 3: text: Yeah, I mean I think you went even more than
  item-47 at level 1: section: group WebVTT cue block
    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
    item-50 at level 2: inline: group WebVTT cue voice span
      item-51 at level 3: text: Speaker B: 
      item-52 at level 3: text: But we preserved the atoms.
  item-53 at level 1: section: group WebVTT cue block
    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
    item-56 at level 2: inline: group WebVTT cue voice span
      item-57 at level 3: text: Speaker A: 
      item-58 at level 3: text: than me. I just opened the format.
  item-59 at level 1: section: group WebVTT cue block
    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
    item-62 at level 2: inline: group WebVTT cue voice span
      item-63 at level 3: text: Speaker A: 
      item-64 at level 3: text: give it a try, yeah.
  item-65 at level 1: section: group WebVTT cue block
    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
    item-68 at level 2: inline: group WebVTT cue voice span
      item-69 at level 3: text: Speaker B: 
      item-70 at level 3: text: Okay, talk to you later.
  item-71 at level 1: section: group WebVTT cue block
    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
    item-74 at level 2: inline: group WebVTT cue voice span
      item-75 at level 3: text: Speaker A: 
      item-76 at level 3: text: See you.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -0,0 +1,77 @@
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
 00:00:04.963 --> 00:00:08.571
 Speaker A:  OK, I think now we should be recording
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
 00:00:08.571 --> 00:00:09.403
 Speaker A:  properly.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
 00:00:10.683 --> 00:00:11.563
 Good.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
 00:00:13.363 --> 00:00:13.803
 Speaker A:  Yeah.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
 00:00:49.603 --> 00:00:53.363
 Speaker B:  I was also thinking.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
 00:00:54.963 --> 00:01:02.072
 Speaker B:  Would be maybe good to create items,
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
 00:01:02.072 --> 00:01:06.811
 Speaker B:  some metadata, some options that can be specific.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
 00:01:10.243 --> 00:01:13.014
 Speaker A:  Yeah, I mean I think you went even more than
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
 00:01:10.563 --> 00:01:12.643
 Speaker B:  But we preserved the atoms.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
 00:01:13.014 --> 00:01:15.907
 Speaker A:  than me. I just opened the format.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
 00:01:50.222 --> 00:01:51.643
 Speaker A:  give it a try, yeah.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
 00:01:52.043 --> 00:01:55.043
 Speaker B:  Okay, talk to you later.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
 00:01:54.603 --> 00:01:55.283
 Speaker A:  See you.
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
@@ -1,16 +1,16 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: Transcript
+  item-1 at level 1: text: Transcript
-  item-2 at level 1: paragraph: February 20, 2025, 8:32PM
+  item-2 at level 1: text: February 20, 2025, 8:32PM
  item-3 at level 1: picture
  item-4 at level 1: inline: group group
-    item-5 at level 2: paragraph: This is test 1
+    item-5 at level 2: text: This is test 1
-    item-6 at level 2: paragraph: 0:08
+    item-6 at level 2: text: 0:08
 Correct, he is not.
-  item-7 at level 1: paragraph: 
+  item-7 at level 1: text: 
  item-8 at level 1: picture
  item-9 at level 1: inline: group group
-    item-10 at level 2: paragraph: This is test 2
+    item-10 at level 2: text: This is test 2
-    item-11 at level 2: paragraph: 0:16
+    item-11 at level 2: text: 0:16
 Yeah, exactly.
-  item-12 at level 1: paragraph: 
+  item-12 at level 1: text: 
-  item-13 at level 1: paragraph: 
+  item-13 at level 1: text: 
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "word_image_anchors",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -93,7 +93,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Transcript",
      "text": "Transcript",
@@ -112,7 +112,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "February 20, 2025, 8:32PM",
      "text": "February 20, 2025, 8:32PM",
@@ -131,7 +131,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is test 1",
      "text": "This is test 1",
@@ -150,7 +150,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "0:08\nCorrect, he is not.",
      "text": "0:08\nCorrect, he is not.",
@@ -169,7 +169,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -181,7 +181,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "This is test 2",
      "text": "This is test 2",
@@ -200,7 +200,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "0:16\nYeah, exactly.",
      "text": "0:16\nYeah, exactly.",
@@ -219,7 +219,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -231,7 +231,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
@@ -1,28 +1,28 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: Summer activities
+  item-1 at level 1: text: Summer activities
  item-2 at level 1: title: Swimming in the lake
-    item-3 at level 2: paragraph: Duck
+    item-3 at level 2: text: Duck
    item-4 at level 2: picture
-    item-5 at level 2: paragraph: Figure 1: This is a cute duckling
+    item-5 at level 2: text: Figure 1: This is a cute duckling
    item-6 at level 2: section_header: Let’s swim!
-      item-7 at level 3: paragraph: To get started with swimming, fi ...  down in a water and try not to drown:
+      item-7 at level 3: text: To get started with swimming, fi ...  down in a water and try not to drown:
      item-8 at level 3: list: group list
        item-9 at level 4: list_item: You can relax and look around
        item-10 at level 4: list_item: Paddle about
        item-11 at level 4: list_item: Enjoy summer warmth
-      item-12 at level 3: paragraph: Also, don’t forget:
+      item-12 at level 3: text: Also, don’t forget:
      item-13 at level 3: list: group list
        item-14 at level 4: list_item: Wear sunglasses
        item-15 at level 4: list_item: Don’t forget to drink water
        item-16 at level 4: list_item: Use sun cream
-      item-17 at level 3: paragraph: Hmm, what else…
+      item-17 at level 3: text: Hmm, what else…
      item-18 at level 3: section_header: Let’s eat
-        item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice
+        item-19 at level 4: text: After we had a good day of swimm ... , it’s important to eat something nice
-        item-20 at level 4: paragraph: I like to eat leaves
+        item-20 at level 4: text: I like to eat leaves
-        item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat:
+        item-21 at level 4: text: Here are some interesting things a respectful duck could eat:
        item-22 at level 4: table with [4x3]
-        item-23 at level 4: paragraph: 
+        item-23 at level 4: text: 
-        item-24 at level 4: paragraph: And let’s add another list in the end:
+        item-24 at level 4: text: And let’s add another list in the end:
        item-25 at level 4: list: group list
          item-26 at level 5: list_item: Leaves
          item-27 at level 5: list_item: Berries
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json
@@ -1,6 +1,6 @@
 {
  "schema_name": "DoclingDocument",
-  "version": "1.6.0",
+  "version": "1.7.0",
  "name": "word_sample",
  "origin": {
    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -98,7 +98,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Summer activities",
      "text": "Summer activities",
@@ -142,7 +142,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Duck",
      "text": "Duck",
@@ -161,7 +161,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Figure 1: This is a cute duckling",
      "text": "Figure 1: This is a cute duckling",
@@ -212,7 +212,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "To get started with swimming, first lay down in a water and try not to drown:",
      "text": "To get started with swimming, first lay down in a water and try not to drown:",
@@ -294,7 +294,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Also, don’t forget:",
      "text": "Also, don’t forget:",
@@ -376,7 +376,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Hmm, what else…",
      "text": "Hmm, what else…",
@@ -430,7 +430,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "After we had a good day of swimming in the lake, it’s important to eat something nice",
      "text": "After we had a good day of swimming in the lake, it’s important to eat something nice",
@@ -449,7 +449,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "I like to eat leaves",
      "text": "I like to eat leaves",
@@ -468,7 +468,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "Here are some interesting things a respectful duck could eat:",
      "text": "Here are some interesting things a respectful duck could eat:",
@@ -487,7 +487,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "",
      "text": ""
@@ -499,7 +499,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
      "prov": [],
      "orig": "And let’s add another list in the end:",
      "text": "And let’s add another list in the end:",
@@ -625,7 +625,8 @@
            "text": "",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -637,7 +638,8 @@
            "text": "Food",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -649,7 +651,8 @@
            "text": "Calories per portion",
            "column_header": true,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -661,7 +664,8 @@
            "text": "Leaves",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -673,7 +677,8 @@
            "text": "Ash, Elm, Maple",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -685,7 +690,8 @@
            "text": "50",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -697,7 +703,8 @@
            "text": "Berries",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -709,7 +716,8 @@
            "text": "Blueberry, Strawberry, Cranberry",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -721,7 +729,8 @@
            "text": "150",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -733,7 +742,8 @@
            "text": "Grain",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -745,7 +755,8 @@
            "text": "Corn, Buckwheat, Barley",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          },
          {
            "row_span": 1,
@@ -757,7 +768,8 @@
            "text": "200",
            "column_header": false,
            "row_header": false,
-            "row_section": false
+            "row_section": false,
            "fillable": false
          }
        ],
        "num_rows": 4,
@@ -774,7 +786,8 @@
              "text": "",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -786,7 +799,8 @@
              "text": "Food",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -798,7 +812,8 @@
              "text": "Calories per portion",
              "column_header": true,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -812,7 +827,8 @@
              "text": "Leaves",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -824,7 +840,8 @@
              "text": "Ash, Elm, Maple",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -836,7 +853,8 @@
              "text": "50",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -850,7 +868,8 @@
              "text": "Berries",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -862,7 +881,8 @@
              "text": "Blueberry, Strawberry, Cranberry",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -874,7 +894,8 @@
              "text": "150",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ],
          [
@@ -888,7 +909,8 @@
              "text": "Grain",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -900,7 +922,8 @@
              "text": "Corn, Buckwheat, Barley",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            },
            {
              "row_span": 1,
@@ -912,7 +935,8 @@
              "text": "200",
              "column_header": false,
              "row_header": false,
-              "row_section": false
+              "row_section": false,
              "fillable": false
            }
          ]
        ]
--- a/tests/data/groundtruth/docling_v2/word_tables.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt
@@ -1,19 +1,19 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group header-0
    item-2 at level 2: section_header: Test with tables
-      item-3 at level 3: paragraph: A uniform table
+      item-3 at level 3: text: A uniform table
      item-4 at level 3: table with [3x3]
-      item-5 at level 3: paragraph: 
+      item-5 at level 3: text: 
-      item-6 at level 3: paragraph: A non-uniform table with horizontal spans
+      item-6 at level 3: text: A non-uniform table with horizontal spans
      item-7 at level 3: table with [3x3]
-      item-8 at level 3: paragraph: 
+      item-8 at level 3: text: 
-      item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
+      item-9 at level 3: text: A non-uniform table with horizontal spans in inner columns
      item-10 at level 3: table with [3x4]
-      item-11 at level 3: paragraph: 
+      item-11 at level 3: text: 
-      item-12 at level 3: paragraph: A non-uniform table with vertical spans
+      item-12 at level 3: text: A non-uniform table with vertical spans
      item-13 at level 3: table with [5x3]
-      item-14 at level 3: paragraph: 
+      item-14 at level 3: text: 
-      item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
+      item-15 at level 3: text: A non-uniform table with all kinds of spans and empty cells
      item-16 at level 3: table with [9x5]
-      item-17 at level 3: paragraph: 
+      item-17 at level 3: text: 
-      item-18 at level 3: paragraph: 
+      item-18 at level 3: text: 
--- a/tests/data/groundtruth/docling_v2/word_tables.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json
--- a/tests/data/md/escaped_characters.md
+++ b/tests/data/md/escaped_characters.md
@@ -0,0 +1,33 @@
 # Headers: 
 ## &amp; &lt; &gt; &quot; &#39; 
 Text:
 00:16.000 ----&gt; 00:18.000
 &amp; &lt; &gt; &quot; &#39; 
 # Lists
 1. &amp; &lt; &gt; &quot; &#39; 
 - &amp; &lt; &gt; &quot; &#39;
 # Inline code
 `&amp; &lt; &gt; &quot; &#39; `
 # Code block
 ```
 &amp; &lt; &gt; &quot; &#39; 
 ```
 # Table
 | Key                 | Example           |
 | ------------------- | ----------------- |         
 | Ampersand           | &amp;             |
 | Less-than           | &lt;              |
 | Greater-than        | &gt;              |
 | Quotes              | &quot;            |
 | Apostrophes         | &#39;             |
 # Raw HTML
 <div title="">&amp; &lt; &gt; &quot; &#39;/div> 
 ## Link
 [&amp; &lt; &gt; &quot; &#39;](https://en.wikipedia.org/wiki/Albert_Einstein)
--- a/tests/data/webvtt/webvtt_example_01.vtt
+++ b/tests/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
 WEBVTT
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 00:11.000 --> 00:13.000
 <v Roger Bingham>We are in New York City
 00:13.000 --> 00:16.000
 <v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
 00:16.000 --> 00:18.000
 <v Roger Bingham>from the American Museum of Natural History
 00:18.000 --> 00:20.000
 <v Roger Bingham>And with me is Neil deGrasse Tyson
 00:20.000 --> 00:22.000
 <v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
 00:22.000 --> 00:24.000
 <v Roger Bingham>at the AMNH.
 00:24.000 --> 00:26.000
 <v Roger Bingham>Thank you for walking down here.
 00:27.000 --> 00:30.000
 <v Roger Bingham>And I want to do a follow-up on the last conversation we did.
 00:30.000 --> 00:31.500 align:right size:50%
 <v Roger Bingham>When we e-mailed—
 00:30.500 --> 00:32.500 align:left size:50%
 <v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
 00:32.000 --> 00:35.500 align:right size:50%
 <v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
 00:32.500 --> 00:33.500 align:left size:50%
 <v Neil deGrasse Tyson><i>Laughs</i>
 00:35.500 --> 00:38.000
 <v Roger Bingham>You know I’m so excited my glasses are falling off here.
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
 WEBVTT
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 00:00.000 --> 00:02.000
 <v.first.loud Esme>It’s a blue apple tree!
 00:02.000 --> 00:04.000
 <v Mary>No way!
 00:04.000 --> 00:06.000
 <v Esme>Hee!</v> <i>laughter</i>
 00:06.000 --> 00:08.000
 <v.loud Mary>That’s awesome!
--- a/tests/data/webvtt/webvtt_example_03.vtt
+++ b/tests/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
 WEBVTT
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
 00:00:04.963 --> 00:00:08.571
 <v Speaker A>OK,
 I think now we should be recording</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
 00:00:08.571 --> 00:00:09.403
 <v Speaker A>properly.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
 00:00:10.683 --> 00:00:11.563
 Good.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
 00:00:13.363 --> 00:00:13.803
 <v Speaker A>Yeah.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
 00:00:49.603 --> 00:00:53.363
 <v Speaker B>I was also thinking.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
 00:00:54.963 --> 00:01:02.072
 <v Speaker B>Would be maybe good to create items,</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
 00:01:02.072 --> 00:01:06.811
 <v Speaker B>some metadata,
 some options that can be specific.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
 00:01:10.243 --> 00:01:13.014
 <v Speaker A>Yeah,
 I mean I think you went even more than</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
 00:01:10.563 --> 00:01:12.643
 <v Speaker B>But we preserved the atoms.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
 00:01:13.014 --> 00:01:15.907
 <v Speaker A>than me.
 I just opened the format.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
 00:01:50.222 --> 00:01:51.643
 <v Speaker A>give it a try, yeah.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
 00:01:52.043 --> 00:01:55.043
 <v Speaker B>Okay, talk to you later.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
 00:01:54.603 --> 00:01:55.283
 <v Speaker A>See you.</v>
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -26,10 +26,12 @@ def test_convert_valid():
    assert len(relevant_paths) > 0
    yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
    json_filter = ["escaped_characters"]
    for in_path in relevant_paths:
        md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
        yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
        json_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.json"
        in_doc = InputDocument(
            path_or_stream=in_path,
@@ -45,6 +47,9 @@ def test_convert_valid():
        act_doc = backend.convert()
        act_data = act_doc.export_to_markdown()
        if in_path.stem in json_filter:
            assert verify_document(act_doc, json_gt_path, GENERATE), "export to json"
        if GEN_TEST_DATA:
            with open(md_gt_path, mode="w", encoding="utf-8") as f:
                f.write(f"{act_data}\n")
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,232 @@
 # Assisted by watsonx Code Assistant
 from pathlib import Path
 import pytest
 from docling_core.types.doc import DoclingDocument
 from pydantic import ValidationError
 from docling.backend.webvtt_backend import (
    _WebVTTCueItalicSpan,
    _WebVTTCueTextSpan,
    _WebVTTCueTimings,
    _WebVTTCueVoiceSpan,
    _WebVTTFile,
    _WebVTTTimestamp,
 )
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter
 from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export
 GENERATE = GEN_TEST_DATA
 def test_vtt_cue_commponents():
    """Validate the WebVTT cue models: timestamps, timings, text and voice spans."""
    # Valid timestamps paired with the total number of seconds they represent.
    valid_timestamps = [
        ("00:01:02.345", 1 * 60 + 2.345),
        ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
        ("02:34.567", 2 * 60 + 34.567),
        ("00:00:00.000", 0.0),
    ]
    for raw, expected_seconds in valid_timestamps:
        model = _WebVTTTimestamp(raw=raw)
        # The expected values are float sums, so compare with a tolerance
        # instead of relying on bit-exact equality.
        assert model.seconds == pytest.approx(expected_seconds)
    # Invalid WebVTT timestamps must be rejected by validation.
    invalid_timestamps = [
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    ]
    for ts in invalid_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=ts)
    # The timestamp __str__ method returns the raw representation.
    model = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(model) == "00:01:02.345"
    # Valid cue timings.
    start = _WebVTTTimestamp(raw="00:10.005")
    end = _WebVTTTimestamp(raw="00:14.007")
    cue_timings = _WebVTTCueTimings(start=start, end=end)
    assert cue_timings.start == start
    assert cue_timings.end == end
    assert str(cue_timings) == "00:10.005 --> 00:14.007"
    # Invalid cue timings: end timestamp before start.
    start = _WebVTTTimestamp(raw="00:10.700")
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start, end=end)
    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
    # Invalid cue timings: missing end.
    start = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start)
    assert "Field required" in str(excinfo.value)
    # Invalid cue timings: missing start.
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(end=end)
    assert "Field required" in str(excinfo.value)
    # Text span with valid text.
    valid_text = "This is a valid cue text span."
    span = _WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text
    assert str(span) == valid_text
    # Text span containing newline characters.
    invalid_text = "This cue text span\ncontains a newline."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)
    # Text span containing an ampersand.
    invalid_text = "This cue text span contains &."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)
    # Text span containing a less-than sign.
    invalid_text = "This cue text span contains <."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)
    # Text span with empty text.
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text="")
    # Voice span: annotation validation.
    valid_annotation = "valid-annotation"
    invalid_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
    # Voice span: classes validation.
    annotation = "speaker name"
    valid_classes = ["class1", "class2"]
    invalid_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
    # Voice span: components validation.
    annotation = "speaker name"
    valid_components = [_WebVTTCueTextSpan(text="random text")]
    invalid_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
    # Valid voice spans serialize back to their WebVTT markup, with classes...
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v.loud.clear speaker>random text</v>"
    assert str(cue_span) == expected_str
    # ... and without classes.
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v speaker>random text</v>"
    assert str(cue_span) == expected_str
 def test_webvtt_file():
    """Parse the three sample WebVTT files and check the resulting cue blocks."""
    data_dir = Path("./tests/data/webvtt")
    # Sample 1: the twelfth cue holds a voice span wrapping an italic span.
    content = (data_dir / "webvtt_example_01.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"
    # Sample 2: serializing the parsed cue blocks must reproduce the file.
    content = (data_dir / "webvtt_example_02.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse
    # Sample 3: every cue block carries an identifier.
    content = (data_dir / "webvtt_example_03.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
    # The third cue has no voice tag, so its payload is a plain text span.
    # (The assertions below inspect this block's own payload rather than a
    # span left over from an earlier sample file.)
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
 def test_e2e_vtt_conversions():
    """Convert each sample WebVTT file and compare its exports to ground truth.

    For every ``*.vtt`` file found under ``./tests/data/webvtt/`` the Markdown
    export, the indented-text export and the document itself are verified
    against the files in the sibling ``groundtruth/docling_v2`` directory.
    """
    sources = sorted(Path("./tests/data/webvtt/").rglob("*.vtt"))
    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
    for source in sources:
        # Ground-truth files share the input file name plus an export suffix.
        gt_dir = source.parent.parent / "groundtruth" / "docling_v2"
        gt_base = str(gt_dir / source.name)
        result: ConversionResult = converter.convert(source)
        document: DoclingDocument = result.document
        markdown: str = document.export_to_markdown(escape_html=False)
        md_matches = verify_export(markdown, gt_base + ".md", generate=GENERATE)
        assert md_matches, "export to md"
        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_matches = verify_export(indented, gt_base + ".itxt", generate=GENERATE)
        assert itxt_matches, "export to indented-text"
        assert verify_document(document, gt_base + ".json", GENERATE)
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None
    # Valid WebVTT
    buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
    stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.VTT
    # Valid Docling JSON
    test_str = '{"name": ""}'
    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
--- a/uv.lock
+++ b/uv.lock
@@ -1049,7 +1049,7 @@ wheels = [
 [[package]]
 name = "docling"
-version = "2.53.0"
+version = "2.54.0"
 source = { editable = "." }
 dependencies = [
    { name = "accelerate" },
@@ -1154,7 +1154,7 @@ requires-dist = [
    { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
    { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
    { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
    { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
    { name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
    { name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
 [[package]]
 name = "docling-core"
-version = "2.48.1"
+version = "2.48.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
    { name = "typer" },
    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
+    { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
 ]
 [package.optional-dependencies]