feat: add a backend parser for WebVTT files (#2288)

* feat: add a backend parser for WebVTT files Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: update README with VTT support Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: add description to supported formats Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade docling-core to unescape WebVTT in markdown Pin the new release of docling-core 2.48.2. Do not escape HTML reserved characters when exporting WebVTT documents to markdown. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test: add missing copyright notice Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-09-22 15:24:34 +02:00
parent b5628f1227
commit 46efaaefee
23 changed files with 3969 additions and 34 deletions
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad

 ## Features

-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files

 ### Coming soon

 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files

 ## Installation

--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -0,0 +1,572 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+
+from docling_core.types.doc import (
+    ContentLayer,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    Formatting,
+    GroupLabel,
+    NodeItem,
+)
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic.types import StringConstraints
+from typing_extensions import Self, override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class _WebVTTTimestamp(BaseModel):
+    """Model representing a WebVTT timestamp.
+
+    A WebVTT timestamp is always interpreted relative to the current playback position
+    of the media data that the WebVTT file is to be synchronized with.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    raw: Annotated[
+        str,
+        Field(
+            description="A representation of the WebVTT Timestamp as a single string"
+        ),
+    ]
+
+    _pattern: ClassVar[re.Pattern] = re.compile(
+        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
+    )
+    _hours: int
+    _minutes: int
+    _seconds: int
+    _millis: int
+
+    @model_validator(mode="after")
+    def validate_raw(self) -> Self:
+        m = self._pattern.match(self.raw)
+        if not m:
+            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
+        self._hours = int(m.group(1)) if m.group(1) else 0
+        self._minutes = int(m.group(2))
+        self._seconds = int(m.group(3))
+        self._millis = int(m.group(4))
+
+        if self._minutes < 0 or self._minutes > 59:
+            raise ValueError("Minutes must be between 0 and 59")
+        if self._seconds < 0 or self._seconds > 59:
+            raise ValueError("Seconds must be between 0 and 59")
+
+        return self
+
+    @property
+    def seconds(self) -> float:
+        """A representation of the WebVTT Timestamp in seconds"""
+        return (
+            self._hours * 3600
+            + self._minutes * 60
+            + self._seconds
+            + self._millis / 1000.0
+        )
+
+    @override
+    def __str__(self) -> str:
+        return self.raw
+
+
+# A cue identifier is a non-empty, single-line string (no CR or LF characters)
+# that does not contain the substring "-->".
+_WebVTTCueIdentifier = Annotated[
+    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
+class _WebVTTCueTimings(BaseModel):
+    """Model representating WebVTT cue timings."""
+
+    start: Annotated[
+        _WebVTTTimestamp, Field(description="Start time offset of the cue")
+    ]
+    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        if self.start and self.end:
+            if self.end.seconds <= self.start.seconds:
+                raise ValueError("End timestamp must be greater than start timestamp")
+        return self
+
+    @override
+    def __str__(self):
+        return f"{self.start} --> {self.end}"
+
+
+class _WebVTTCueTextSpan(BaseModel):
+    """Model representing a WebVTT cue text span."""
+
+    text: str
+    span_type: Literal["text"] = "text"
+
+    @field_validator("text", mode="after")
+    @classmethod
+    def validate_text(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+            raise ValueError("Cue text span contains invalid characters")
+        if len(value) == 0:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        return self.text
+
+
+class _WebVTTCueVoiceSpan(BaseModel):
+    """Model representing a WebVTT cue voice span."""
+
+    annotation: Annotated[
+        str,
+        Field(
+            description=(
+                "Cue span start tag annotation text representing the name of thevoice"
+            )
+        ),
+    ]
+    classes: Annotated[
+        list[str],
+        Field(description="List of classes representing the cue span's significance"),
+    ] = []
+    components: Annotated[
+        list["_WebVTTCueComponent"],
+        Field(description="The components representing the cue internal text"),
+    ] = []
+    span_type: Literal["v"] = "v"
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def validate_annotation(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
+            raise ValueError(
+                "Cue span start tag annotation contains invalid characters"
+            )
+        if not value:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @field_validator("classes", mode="after")
+    @classmethod
+    def validate_classes(cls, value: list[str]) -> list[str]:
+        for item in value:
+            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
+                raise ValueError(
+                    "A cue span start tag class contains invalid characters"
+                )
+            if not item:
+                raise ValueError("Cue span start tag classes cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
+        inner = "".join(str(span) for span in self.components)
+        return f"<{tag} {self.annotation}>{inner}</v>"
+
+
+class _WebVTTCueClassSpan(BaseModel):
+    span_type: Literal["c"] = "c"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<c>{inner}</c>"
+
+
+class _WebVTTCueItalicSpan(BaseModel):
+    span_type: Literal["i"] = "i"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<i>{inner}</i>"
+
+
+class _WebVTTCueBoldSpan(BaseModel):
+    span_type: Literal["b"] = "b"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<b>{inner}</b>"
+
+
+class _WebVTTCueUnderlineSpan(BaseModel):
+    span_type: Literal["u"] = "u"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<u>{inner}</u>"
+
+
+# Union of all supported cue components. The concrete model is selected through
+# the ``span_type`` literal that every component model declares.
+_WebVTTCueComponent = Annotated[
+    Union[
+        _WebVTTCueTextSpan,
+        _WebVTTCueClassSpan,
+        _WebVTTCueItalicSpan,
+        _WebVTTCueBoldSpan,
+        _WebVTTCueUnderlineSpan,
+        _WebVTTCueVoiceSpan,
+    ],
+    Field(discriminator="span_type", description="The WebVTT cue component"),
+]
+
+
+class _WebVTTCueBlock(BaseModel):
+    """Model representing a WebVTT cue block.
+
+    The optional WebVTT cue settings list is not supported.
+    The cue payload is limited to the following spans: text, class, italic, bold,
+    underline, and voice.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    identifier: Optional[_WebVTTCueIdentifier] = Field(
+        None, description="The WebVTT cue identifier"
+    )
+    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+
+    _pattern_block: ClassVar[re.Pattern] = re.compile(
+        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
+    )
+    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
+        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
+        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
+    )
+
+    @field_validator("payload", mode="after")
+    @classmethod
+    def validate_payload(cls, payload):
+        for voice in payload:
+            if "-->" in str(voice):
+                raise ValueError("Cue payload must not contain '-->'")
+        return payload
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTCueBlock":
+        lines = raw.strip().splitlines()
+        if not lines:
+            raise ValueError("Cue block must have at least one line")
+        identifier: Optional[_WebVTTCueIdentifier] = None
+        timing_line = lines[0]
+        if "-->" not in timing_line and len(lines) > 1:
+            identifier = timing_line
+            timing_line = lines[1]
+            cue_lines = lines[2:]
+        else:
+            cue_lines = lines[1:]
+
+        if "-->" not in timing_line:
+            raise ValueError("Cue block must contain WebVTT cue timings")
+
+        start, end = [t.strip() for t in timing_line.split("-->")]
+        end = re.split(" |\t", end)[0]  # ignore the cue settings list
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(
+            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+        )
+        cue_text = " ".join(cue_lines).strip()
+        if cue_text.startswith("<v") and "</v>" not in cue_text:
+            # adding close tag for cue voice spans without end tag
+            cue_text += "</v>"
+
+        stack: list[list[_WebVTTCueComponent]] = [[]]
+        tag_stack: list[Union[str, tuple]] = []
+
+        pos = 0
+        matches = list(cls._pattern_block.finditer(cue_text))
+        i = 0
+        while i < len(matches):
+            match = matches[i]
+            if match.start() > pos:
+                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+            tag = match.group(0)
+
+            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
+                tag_type = tag[1:2]
+                tag_stack.append(tag_type)
+                stack.append([])
+            elif tag == "</i>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueItalicSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</b>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueBoldSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</u>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</c>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueClassSpan(components=children))
+                tag_stack.pop()
+            elif tag.startswith("<v"):
+                tag_stack.append(("v", tag))
+                stack.append([])
+            elif tag.startswith("</v"):
+                children = stack.pop() if stack else []
+                if (
+                    tag_stack
+                    and isinstance(tag_stack[-1], tuple)
+                    and tag_stack[-1][0] == "v"
+                ):
+                    _, voice = cast(tuple, tag_stack.pop())
+                    voice_match = cls._pattern_voice_tag.match(voice)
+                    if voice_match:
+                        class_string = voice_match.group("class")
+                        annotation = voice_match.group("annotation")
+                        if annotation:
+                            classes: list[str] = []
+                            if class_string:
+                                classes = [c for c in class_string.split(".") if c]
+                            stack[-1].append(
+                                _WebVTTCueVoiceSpan(
+                                    annotation=annotation.strip(),
+                                    classes=classes,
+                                    components=children,
+                                )
+                            )
+
+            pos = match.end()
+            i += 1
+
+        if pos < len(cue_text):
+            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
+
+        return cls(
+            identifier=identifier,
+            timings=timings,
+            payload=stack[0],
+        )
+
+    def __str__(self):
+        parts = []
+        if self.identifier:
+            parts.append(f"{self.identifier}\n")
+        timings_line = str(self.timings)
+        parts.append(timings_line + "\n")
+        for idx, span in enumerate(self.payload):
+            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
+                # the end tag may be omitted for brevity
+                parts.append(str(span).removesuffix("</v>"))
+            else:
+                parts.append(str(span))
+
+        return "".join(parts)
+
+
+class _WebVTTFile(BaseModel):
+    """A model representing a WebVTT file."""
+
+    cue_blocks: list[_WebVTTCueBlock]
+
+    @staticmethod
+    def verify_signature(content: str) -> bool:
+        if not content:
+            return False
+        elif len(content) == 6:
+            return content == "WEBVTT"
+        elif len(content) > 6 and content.startswith("WEBVTT"):
+            return content[6] in (" ", "\t", "\n")
+        else:
+            return False
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTFile":
+        # Normalize newlines to LF
+        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
+
+        # Check WebVTT signature
+        if not cls.verify_signature(raw):
+            raise ValueError("Invalid WebVTT file signature")
+
+        # Strip "WEBVTT" header line
+        lines = raw.split("\n", 1)
+        body = lines[1] if len(lines) > 1 else ""
+
+        # Remove NOTE/STYLE/REGION blocks
+        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
+        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
+
+        # Split into cue blocks
+        raw_blocks = re.split(r"\n\s*\n", body.strip())
+        cues: list[_WebVTTCueBlock] = []
+        for block in raw_blocks:
+            try:
+                cues.append(_WebVTTCueBlock.parse(block))
+            except ValueError as e:
+                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
+
+        return cls(cue_blocks=cues)
+
+    def __iter__(self):
+        return iter(self.cue_blocks)
+
+    def __getitem__(self, idx):
+        return self.cue_blocks[idx]
+
+    def __len__(self):
+        return len(self.cue_blocks)
+
+
+class WebVTTDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend for WebVTT (.vtt) files.
+
+    This parser reads the content of a WebVTT file and converts
+    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1
+
+    Each cue becomes a TextItem and the items are appended to the
+    document body by the cue's start time.
+    """
+
+    @override
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self.content: str = ""
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.content = self.path_or_stream.getvalue().decode("utf-8")
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as f:
+                    self.content = f.read()
+        except Exception as e:
+            raise RuntimeError(
+                "Could not initialize the WebVTT backend for file with hash "
+                f"{self.document_hash}."
+            ) from e
+
+    @override
+    def is_valid(self) -> bool:
+        return _WebVTTFile.verify_signature(self.content)
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @override
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.VTT}
+
+    @staticmethod
+    def _add_text_from_component(
+        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
+    ) -> None:
+        """Adds a TextItem to a document by extracting text from a cue span component.
+
+        TODO: address nesting
+        """
+        formatting = Formatting()
+        text = ""
+        if isinstance(item, _WebVTTCueItalicSpan):
+            formatting.italic = True
+        elif isinstance(item, _WebVTTCueBoldSpan):
+            formatting.bold = True
+        elif isinstance(item, _WebVTTCueUnderlineSpan):
+            formatting.underline = True
+        if isinstance(item, _WebVTTCueTextSpan):
+            text = item.text
+        else:
+            # TODO: address nesting
+            text = "".join(
+                [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
+            )
+        if text := text.strip():
+            doc.add_text(
+                label=DocItemLabel.TEXT,
+                text=text,
+                parent=parent,
+                content_layer=ContentLayer.BODY,
+                formatting=formatting,
+            )
+
+    @override
+    def convert(self) -> DoclingDocument:
+        _log.debug("Starting WebVTT conversion...")
+        if not self.is_valid():
+            raise RuntimeError("Invalid WebVTT document.")
+
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="text/vtt",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
+        for block in vtt.cue_blocks:
+            block_group = doc.add_group(
+                label=GroupLabel.SECTION,
+                name="WebVTT cue block",
+                parent=None,
+                content_layer=ContentLayer.BODY,
+            )
+            if block.identifier:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    text=str(block.identifier),
+                    parent=block_group,
+                    content_layer=ContentLayer.BODY,
+                )
+            doc.add_text(
+                label=DocItemLabel.TEXT,
+                text=str(block.timings),
+                parent=block_group,
+                content_layer=ContentLayer.BODY,
+            )
+            for cue_span in block.payload:
+                if isinstance(cue_span, _WebVTTCueVoiceSpan):
+                    voice_group = doc.add_group(
+                        label=GroupLabel.INLINE,
+                        name="WebVTT cue voice span",
+                        parent=block_group,
+                        content_layer=ContentLayer.BODY,
+                    )
+                    voice = cue_span.annotation
+                    if classes := cue_span.classes:
+                        voice += f" ({', '.join(classes)})"
+                    voice += ": "
+                    doc.add_text(
+                        label=DocItemLabel.TEXT,
+                        text=voice,
+                        parent=voice_group,
+                        content_layer=ContentLayer.BODY,
+                    )
+                    for item in cue_span.components:
+                        WebVTTDocumentBackend._add_text_from_component(
+                            doc, item, voice_group
+                        )
+                else:
+                    WebVTTDocumentBackend._add_text_from_component(
+                        doc, cue_span, block_group
+                    )
+
+        return doc
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,7 +1,6 @@
-import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union

 import numpy as np
 from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
 )
 from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (
-    DocumentStream,
-)
+from docling_core.types.io import DocumentStream

 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
    METS_GBS = "mets_gbs"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
+    VTT = "vtt"


 class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
    DOCTAGS = "doctags"


-FormatToExtensions: Dict[InputFormat, List[str]] = {
+FormatToExtensions: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["tar.gz"],
    InputFormat.JSON_DOCLING: ["json"],
    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.VTT: ["vtt"],
 }

-FormatToMimeType: Dict[InputFormat, List[str]] = {
+FormatToMimeType: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["application/mets+xml"],
    InputFormat.JSON_DOCLING: ["application/json"],
    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
+    InputFormat.VTT: ["text/vtt"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
-    cells: List[TextCell] = []
-    children: List["Cluster"] = []  # Add child cluster support
+    cells: list[TextCell] = []
+    children: list["Cluster"] = []  # Add child cluster support

    @field_serializer("confidence")
    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):


 class LayoutPrediction(BaseModel):
-    clusters: List[Cluster] = []
+    clusters: list[Cluster] = []


 class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(


 class Table(BasePageElement):
-    otsl_seq: List[str]
+    otsl_seq: list[str]
    num_rows: int = 0
    num_cols: int = 0
-    table_cells: List[TableCell]
+    table_cells: list[TableCell]


 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, Table] = {}
+    table_map: dict[int, Table] = {}


 class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):


 class FigureElement(BasePageElement):
-    annotations: List[PictureDataType] = []
+    annotations: list[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):

 class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
-    figure_map: Dict[int, FigureElement] = {}
+    figure_map: dict[int, FigureElement] = {}


 class EquationPrediction(BaseModel):
    equation_count: int = 0
-    equation_map: Dict[int, TextElement] = {}
+    equation_map: dict[int, TextElement] = {}


 class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]


 class AssembledUnit(BaseModel):
-    elements: List[PageElement] = []
-    body: List[PageElement] = []
-    headers: List[PageElement] = []
+    elements: list[PageElement] = []
+    body: list[PageElement] = []
+    headers: list[PageElement] = []


 class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[
+    _image_cache: dict[
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

    @property
-    def cells(self) -> List[TextCell]:
+    def cells(self) -> list[TextCell]:
        """Return text cells as a read-only view of parsed_page.textline_cells."""
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):

    id: str
    model: Optional[str] = None  # returned by openai
-    choices: List[OpenAiResponseChoice]
+    choices: list[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage

@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):


 class ConfidenceReport(PageConfidenceScores):
-    pages: Dict[int, PageConfidenceScores] = Field(
+    pages: dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
            mime = FormatToMimeType[InputFormat.PPTX][0]
        elif ext in FormatToExtensions[InputFormat.XLSX]:
            mime = FormatToMimeType[InputFormat.XLSX][0]
+        elif ext in FormatToExtensions[InputFormat.VTT]:
+            mime = FormatToMimeType[InputFormat.VTT][0]

        return mime

--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
+from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
+        InputFormat.VTT: FormatOption(
+            pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
+        ),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docs/index.md
+++ b/docs/index.md
@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad

 ## Features

-* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files

 ### Coming soon

 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files

 ## Get started

--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
 | PDF | |
 | DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
 | Markdown | |
-| AsciiDoc | |
+| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
 | HTML, XHTML | |
 | CSV | |
 | PNG, JPEG, TIFF, BMP, WEBP | Image formats |
+| WebVTT | Web Video Text Tracks format for displaying timed text |

 Schema-specific support:

@@ -32,4 +33,4 @@ Schema-specific support:
 | Markdown | |
 | JSON | Lossless serialization of Docling Document |
 | Text | Plain text, i.e. without Markdown markers |
-| Doctags | |
+| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
 requires-python = '>=3.9,<4.0'
 dependencies = [
  'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.48.0,<3.0.0)',
+  'docling-core[chunking] (>=2.48.2,<3.0.0)',
  'docling-parse (>=4.4.0,<5.0.0)',
  "docling-ibm-models>=3.9.1,<4",
  'filetype (>=1.2.0,<2.0.0)',
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -0,0 +1,66 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 00:11.000 --> 00:13.000
+    item-3 at level 2: inline: group WebVTT cue voice span
+      item-4 at level 3: text: Roger Bingham: 
+      item-5 at level 3: text: We are in New York City
+  item-6 at level 1: section: group WebVTT cue block
+    item-7 at level 2: text: 00:13.000 --> 00:16.000
+    item-8 at level 2: inline: group WebVTT cue voice span
+      item-9 at level 3: text: Roger Bingham: 
+      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
+  item-11 at level 1: section: group WebVTT cue block
+    item-12 at level 2: text: 00:16.000 --> 00:18.000
+    item-13 at level 2: inline: group WebVTT cue voice span
+      item-14 at level 3: text: Roger Bingham: 
+      item-15 at level 3: text: from the American Museum of Natural History
+  item-16 at level 1: section: group WebVTT cue block
+    item-17 at level 2: text: 00:18.000 --> 00:20.000
+    item-18 at level 2: inline: group WebVTT cue voice span
+      item-19 at level 3: text: Roger Bingham: 
+      item-20 at level 3: text: And with me is Neil deGrasse Tyson
+  item-21 at level 1: section: group WebVTT cue block
+    item-22 at level 2: text: 00:20.000 --> 00:22.000
+    item-23 at level 2: inline: group WebVTT cue voice span
+      item-24 at level 3: text: Roger Bingham: 
+      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
+  item-26 at level 1: section: group WebVTT cue block
+    item-27 at level 2: text: 00:22.000 --> 00:24.000
+    item-28 at level 2: inline: group WebVTT cue voice span
+      item-29 at level 3: text: Roger Bingham: 
+      item-30 at level 3: text: at the AMNH.
+  item-31 at level 1: section: group WebVTT cue block
+    item-32 at level 2: text: 00:24.000 --> 00:26.000
+    item-33 at level 2: inline: group WebVTT cue voice span
+      item-34 at level 3: text: Roger Bingham: 
+      item-35 at level 3: text: Thank you for walking down here.
+  item-36 at level 1: section: group WebVTT cue block
+    item-37 at level 2: text: 00:27.000 --> 00:30.000
+    item-38 at level 2: inline: group WebVTT cue voice span
+      item-39 at level 3: text: Roger Bingham: 
+      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
+  item-41 at level 1: section: group WebVTT cue block
+    item-42 at level 2: text: 00:30.000 --> 00:31.500
+    item-43 at level 2: inline: group WebVTT cue voice span
+      item-44 at level 3: text: Roger Bingham: 
+      item-45 at level 3: text: When we e-mailed—
+  item-46 at level 1: section: group WebVTT cue block
+    item-47 at level 2: text: 00:30.500 --> 00:32.500
+    item-48 at level 2: inline: group WebVTT cue voice span
+      item-49 at level 3: text: Neil deGrasse Tyson: 
+      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
+  item-51 at level 1: section: group WebVTT cue block
+    item-52 at level 2: text: 00:32.000 --> 00:35.500
+    item-53 at level 2: inline: group WebVTT cue voice span
+      item-54 at level 3: text: Roger Bingham: 
+      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
+  item-56 at level 1: section: group WebVTT cue block
+    item-57 at level 2: text: 00:32.500 --> 00:33.500
+    item-58 at level 2: inline: group WebVTT cue voice span
+      item-59 at level 3: text: Neil deGrasse Tyson: 
+      item-60 at level 3: text: Laughs
+  item-61 at level 1: section: group WebVTT cue block
+    item-62 at level 2: text: 00:35.500 --> 00:38.000
+    item-63 at level 2: inline: group WebVTT cue voice span
+      item-64 at level 3: text: Roger Bingham: 
+      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -0,0 +1,51 @@
+00:11.000 --> 00:13.000
+
+Roger Bingham:  We are in New York City
+
+00:13.000 --> 00:16.000
+
+Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+
+Roger Bingham:  from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+
+Roger Bingham:  And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+
+Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+
+Roger Bingham:  at the AMNH.
+
+00:24.000 --> 00:26.000
+
+Roger Bingham:  Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+
+Roger Bingham:  And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500
+
+Roger Bingham:  When we e-mailed—
+
+00:30.500 --> 00:32.500
+
+Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500
+
+Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500
+
+Neil deGrasse Tyson:  *Laughs*
+
+00:35.500 --> 00:38.000
+
+Roger Bingham:  You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -0,0 +1,22 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 00:00.000 --> 00:02.000
+    item-3 at level 2: inline: group WebVTT cue voice span
+      item-4 at level 3: text: Esme (first, loud): 
+      item-5 at level 3: text: It’s a blue apple tree!
+  item-6 at level 1: section: group WebVTT cue block
+    item-7 at level 2: text: 00:02.000 --> 00:04.000
+    item-8 at level 2: inline: group WebVTT cue voice span
+      item-9 at level 3: text: Mary: 
+      item-10 at level 3: text: No way!
+  item-11 at level 1: section: group WebVTT cue block
+    item-12 at level 2: text: 00:04.000 --> 00:06.000
+    item-13 at level 2: inline: group WebVTT cue voice span
+      item-14 at level 3: text: Esme: 
+      item-15 at level 3: text: Hee!
+    item-16 at level 2: text: laughter
+  item-17 at level 1: section: group WebVTT cue block
+    item-18 at level 2: text: 00:06.000 --> 00:08.000
+    item-19 at level 2: inline: group WebVTT cue voice span
+      item-20 at level 3: text: Mary (loud): 
+      item-21 at level 3: text: That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -0,0 +1,376 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.6.0",
+  "name": "webvtt_example_02",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 12867774546881601731,
+    "filename": "webvtt_example_02.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/2"
+      },
+      {
+        "$ref": "#/groups/4"
+      },
+      {
+        "$ref": "#/groups/6"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/0"
+        },
+        {
+          "$ref": "#/groups/1"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/groups/3"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/3",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/groups/5"
+        },
+        {
+          "$ref": "#/texts/9"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/5",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/10"
+        },
+        {
+          "$ref": "#/groups/7"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/7",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/11"
+        },
+        {
+          "$ref": "#/texts/12"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:00.000 --> 00:02.000",
+      "text": "00:00.000 --> 00:02.000"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Esme (first, loud): ",
+      "text": "Esme (first, loud): "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "It’s a blue apple tree!",
+      "text": "It’s a blue apple tree!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:02.000 --> 00:04.000",
+      "text": "00:02.000 --> 00:04.000"
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Mary: ",
+      "text": "Mary: "
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "No way!",
+      "text": "No way!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:04.000 --> 00:06.000",
+      "text": "00:04.000 --> 00:06.000"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Esme: ",
+      "text": "Esme: "
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Hee!",
+      "text": "Hee!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "laughter",
+      "text": "laughter",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:06.000 --> 00:08.000",
+      "text": "00:06.000 --> 00:08.000"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Mary (loud): ",
+      "text": "Mary (loud): "
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "That’s awesome!",
+      "text": "That’s awesome!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -0,0 +1,17 @@
+00:00.000 --> 00:02.000
+
+Esme (first, loud):  It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+
+Mary:  No way!
+
+00:04.000 --> 00:06.000
+
+Esme:  Hee!
+
+*laughter*
+
+00:06.000 --> 00:08.000
+
+Mary (loud):  That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -0,0 +1,77 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
+    item-4 at level 2: inline: group WebVTT cue voice span
+      item-5 at level 3: text: Speaker A: 
+      item-6 at level 3: text: OK, I think now we should be recording
+  item-7 at level 1: section: group WebVTT cue block
+    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
+    item-10 at level 2: inline: group WebVTT cue voice span
+      item-11 at level 3: text: Speaker A: 
+      item-12 at level 3: text: properly.
+  item-13 at level 1: section: group WebVTT cue block
+    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
+    item-16 at level 2: text: Good.
+  item-17 at level 1: section: group WebVTT cue block
+    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
+    item-20 at level 2: inline: group WebVTT cue voice span
+      item-21 at level 3: text: Speaker A: 
+      item-22 at level 3: text: Yeah.
+  item-23 at level 1: section: group WebVTT cue block
+    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
+    item-26 at level 2: inline: group WebVTT cue voice span
+      item-27 at level 3: text: Speaker B: 
+      item-28 at level 3: text: I was also thinking.
+  item-29 at level 1: section: group WebVTT cue block
+    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
+    item-32 at level 2: inline: group WebVTT cue voice span
+      item-33 at level 3: text: Speaker B: 
+      item-34 at level 3: text: Would be maybe good to create items,
+  item-35 at level 1: section: group WebVTT cue block
+    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
+    item-38 at level 2: inline: group WebVTT cue voice span
+      item-39 at level 3: text: Speaker B: 
+      item-40 at level 3: text: some metadata, some options that can be specific.
+  item-41 at level 1: section: group WebVTT cue block
+    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
+    item-44 at level 2: inline: group WebVTT cue voice span
+      item-45 at level 3: text: Speaker A: 
+      item-46 at level 3: text: Yeah, I mean I think you went even more than
+  item-47 at level 1: section: group WebVTT cue block
+    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
+    item-50 at level 2: inline: group WebVTT cue voice span
+      item-51 at level 3: text: Speaker B: 
+      item-52 at level 3: text: But we preserved the atoms.
+  item-53 at level 1: section: group WebVTT cue block
+    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
+    item-56 at level 2: inline: group WebVTT cue voice span
+      item-57 at level 3: text: Speaker A: 
+      item-58 at level 3: text: than me. I just opened the format.
+  item-59 at level 1: section: group WebVTT cue block
+    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
+    item-62 at level 2: inline: group WebVTT cue voice span
+      item-63 at level 3: text: Speaker A: 
+      item-64 at level 3: text: give it a try, yeah.
+  item-65 at level 1: section: group WebVTT cue block
+    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
+    item-68 at level 2: inline: group WebVTT cue voice span
+      item-69 at level 3: text: Speaker B: 
+      item-70 at level 3: text: Okay, talk to you later.
+  item-71 at level 1: section: group WebVTT cue block
+    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
+    item-74 at level 2: inline: group WebVTT cue voice span
+      item-75 at level 3: text: Speaker A: 
+      item-76 at level 3: text: See you.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -0,0 +1,77 @@
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+
+00:00:04.963 --> 00:00:08.571
+
+Speaker A:  OK, I think now we should be recording
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+
+00:00:08.571 --> 00:00:09.403
+
+Speaker A:  properly.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+
+00:00:10.683 --> 00:00:11.563
+
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+
+00:00:13.363 --> 00:00:13.803
+
+Speaker A:  Yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+
+00:00:49.603 --> 00:00:53.363
+
+Speaker B:  I was also thinking.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+
+00:00:54.963 --> 00:01:02.072
+
+Speaker B:  Would be maybe good to create items,
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+
+00:01:02.072 --> 00:01:06.811
+
+Speaker B:  some metadata, some options that can be specific.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+
+00:01:10.243 --> 00:01:13.014
+
+Speaker A:  Yeah, I mean I think you went even more than
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+
+00:01:10.563 --> 00:01:12.643
+
+Speaker B:  But we preserved the atoms.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+
+00:01:13.014 --> 00:01:15.907
+
+Speaker A:  than me. I just opened the format.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+
+00:01:50.222 --> 00:01:51.643
+
+Speaker A:  give it a try, yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+
+00:01:52.043 --> 00:01:55.043
+
+Speaker B:  Okay, talk to you later.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+
+00:01:54.603 --> 00:01:55.283
+
+Speaker A:  See you.
--- a/tests/data/webvtt/webvtt_example_01.vtt
+++ b/tests/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+<v Roger Bingham>We are in New York City
+
+00:13.000 --> 00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+<v Roger Bingham>from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+<v Roger Bingham>at the AMNH.
+
+00:24.000 --> 00:26.000
+<v Roger Bingham>Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+<v Roger Bingham>When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+<v Neil deGrasse Tyson><i>Laughs</i>
+
+00:35.500 --> 00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+<v.first.loud Esme>It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+<v Mary>No way!
+
+00:04.000 --> 00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:06.000 --> 00:08.000
+<v.loud Mary>That’s awesome!
--- a/tests/data/webvtt/webvtt_example_03.vtt
+++ b/tests/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,232 @@
+# Assisted by watsonx Code Assistant
+
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import DoclingDocument
+from pydantic import ValidationError
+
+from docling.backend.webvtt_backend import (
+    _WebVTTCueItalicSpan,
+    _WebVTTCueTextSpan,
+    _WebVTTCueTimings,
+    _WebVTTCueVoiceSpan,
+    _WebVTTFile,
+    _WebVTTTimestamp,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+from .test_data_gen_flag import GEN_TEST_DATA
+from .verify_utils import verify_document, verify_export
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Test WebVTT components."""
+    valid_timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    valid_total_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for idx, ts in enumerate(valid_timestamps):
+        model = _WebVTTTimestamp(raw=ts)
+        assert model.seconds == valid_total_seconds[idx]
+
+    """Test invalid WebVTT timestamps."""
+    invalid_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for ts in invalid_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=ts)
+
+    """Test the timestamp __str__ method."""
+    model = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(model) == "00:01:02.345"
+
+    """Test valid cue timings."""
+    start = _WebVTTTimestamp(raw="00:10.005")
+    end = _WebVTTTimestamp(raw="00:14.007")
+    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    assert cue_timings.start == start
+    assert cue_timings.end == end
+    assert str(cue_timings) == "00:10.005 --> 00:14.007"
+
+    """Test invalid cue timings with end timestamp before start."""
+    start = _WebVTTTimestamp(raw="00:10.700")
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start, end=end)
+    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
+
+    """Test invalid cue timings with missing end."""
+    start = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start)
+    assert "Field required" in str(excinfo.value)
+
+    """Test invalid cue timings with missing start."""
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(end=end)
+    assert "Field required" in str(excinfo.value)
+
+    """Test with valid text."""
+    valid_text = "This is a valid cue text span."
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+    assert str(span) == valid_text
+
+    """Test with text containing newline characters."""
+    invalid_text = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing ampersand."""
+    invalid_text = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing less-than sign."""
+    invalid_text = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with empty text."""
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text="")
+
+    """Test that annotation validation works correctly."""
+    valid_annotation = "valid-annotation"
+    invalid_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
+    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+
+    """Test that classes validation works correctly."""
+    annotation = "speaker name"
+    valid_classes = ["class1", "class2"]
+    invalid_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+
+    """Test that components validation works correctly."""
+    annotation = "speaker name"
+    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    invalid_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+
+    """Test valid cue voice spans."""
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+
+    expected_str = "<v.loud.clear speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    expected_str = "<v speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+
+def test_webvtt_file():
+    """Test WebVTT files."""
+    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse
+
+    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."
+
+
+def test_e2e_vtt_conversions():
+    directory = Path("./tests/data/webvtt/")
+    vtt_paths = sorted(directory.rglob("*.vtt"))
+    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
+
+    for vtt in vtt_paths:
+        gt_path = vtt.parent.parent / "groundtruth" / "docling_v2" / vtt.name
+
+        conv_result: ConversionResult = converter.convert(vtt)
+
+        doc: DoclingDocument = conv_result.document
+
+        pred_md: str = doc.export_to_markdown(escape_html=False)
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
+
+        pred_itxt: str = doc._export_to_indented_text(
+            max_text_len=70, explicit_tables=False
+        )
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
+
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE)
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None

+    # Valid WebVTT
+    buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
+    stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
+    assert dci._guess_format(stream) == InputFormat.VTT
+
    # Valid Docling JSON
    test_str = '{"name": ""}'
    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
--- a/uv.lock
+++ b/uv.lock
@@ -1154,7 +1154,7 @@ requires-dist = [
    { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
    { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
    { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
    { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
    { name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
    { name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [

 [[package]]
 name = "docling-core"
-version = "2.48.1"
+version = "2.48.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
    { name = "typer" },
    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
+    { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
 ]

 [package.optional-dependencies]
@@ -4936,6 +4936,9 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
    { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
    { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
+    { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
    { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
    { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
    { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -4957,6 +4960,8 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
    { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
    { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
+    { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
    { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
    { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
    { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },