feat: add a backend parser for WebVTT files (#2288)

* feat: add a backend parser for WebVTT files Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: update README with VTT support Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: add description to supported formats Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade docling-core to unescape WebVTT in markdown Pin the new release of docling-core 2.48.2. Do not escape HTML reserved characters when exporting WebVTT documents to markdown. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test: add missing copyright notice Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-09-22 15:24:34 +02:00
parent b5628f1227
commit 46efaaefee
23 changed files with 3969 additions and 34 deletions
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
 * 💬 Parsing of Web Video Text Tracks (WebVTT) files
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
 * 📝 Parsing of Web Video Text Tracks (WebVTT) files
 ## Installation
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -0,0 +1,572 @@
 import logging
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import Annotated, ClassVar, Literal, Optional, Union, cast
 from docling_core.types.doc import (
    ContentLayer,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    Formatting,
    GroupLabel,
    NodeItem,
 )
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
 from typing_extensions import Self, override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
 class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.
    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.
    The accepted textual form is ``[hh:]mm:ss.ttt``: an optional hours part of at
    least two digits, two-digit minutes and seconds in the range 00-59, and exactly
    three digits of milliseconds.
    """
    model_config = ConfigDict(regex_engine="python-re")
    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]
    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )
    # Components of ``raw``; populated by ``validate_raw`` after field validation
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int
    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Check ``raw`` against the timestamp pattern and store its components.
        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # fullmatch() is required: with match(), the trailing ``$`` would also accept
        # a timestamp followed by a newline character.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        hours, minutes, seconds, millis = m.groups()
        # The hours group is optional; the pattern already restricts minutes and
        # seconds to 00-59, so no further range check is needed.
        self._hours = int(hours) if hours else 0
        self._minutes = int(minutes)
        self._seconds = int(seconds)
        self._millis = int(millis)
        return self
    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )
    @override
    def __str__(self) -> str:
        return self.raw
 # A cue identifier: a non-empty, single-line string that does not contain "-->".
 # NOTE(review): the negative lookahead presumably requires models using this type
 # to select the "python-re" regex engine - confirm against the pydantic docs.
 _WebVTTCueIdentifier = Annotated[
    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
 ]
 class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings.
    The end time offset must be strictly greater than the start time offset.
    """
    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Ensure that the cue ends after it starts.
        Raises:
            ValueError: If the end offset is not greater than the start offset.
        """
        if not (self.start and self.end):
            return self
        if self.start.seconds >= self.end.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self
    @override
    def __str__(self):
        return " --> ".join((str(self.start), str(self.end)))
 class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span.
    A text span is plain, non-empty text: it holds no line terminator, no ``&``
    and no ``<`` character.
    """
    text: str
    span_type: Literal["text"] = "text"
    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Reject text with forbidden characters as well as empty text.
        Raises:
            ValueError: If the text contains a forbidden character or is empty.
        """
        for forbidden in ("\n", "\r", "&", "<"):
            if forbidden in value:
                raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value
    @override
    def __str__(self):
        return self.text
 class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.
    A voice span is serialized as ``<v.class1.class2 annotation>...</v>``, where the
    classes are optional and the annotation names the voice.
    """
    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"
    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Reject annotations that are empty or contain forbidden characters.
        Raises:
            ValueError: If the annotation is empty or contains a line terminator,
                ``&`` or ``>``.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value
    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Reject class names that are empty or contain forbidden characters.
        Raises:
            ValueError: If a class is empty or contains whitespace, ``&``, ``<``,
                ``>`` or ``.``.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value
    @override
    def __str__(self):
        # Classes, when present, are appended to the tag name separated by dots
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
 class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span, serialized as ``<c>...</c>``."""
    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        rendered = [str(child) for child in self.components]
        return "<c>" + "".join(rendered) + "</c>"
 class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span, serialized as ``<i>...</i>``."""
    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        rendered = [str(child) for child in self.components]
        return "<i>" + "".join(rendered) + "</i>"
 class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span, serialized as ``<b>...</b>``."""
    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        rendered = [str(child) for child in self.components]
        return "<b>" + "".join(rendered) + "</b>"
 class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span, serialized as ``<u>...</u>``."""
    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]
    @override
    def __str__(self):
        rendered = [str(child) for child in self.components]
        return "<u>" + "".join(rendered) + "</u>"
 # Union of every supported cue component model. The ``span_type`` field, a distinct
 # literal on each member, is declared as the discriminator of the union.
 _WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
 ]
 class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.
    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """
    model_config = ConfigDict(regex_engine="python-re")
    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
    # Start or end tag of a supported span. Group 1 is "/" for end tags, group 2 is
    # the tag name (followed by dot-separated classes for voice tags) and group 3 is
    # the optional annotation.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )
    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject a payload whose serialized components contain the string '-->'.
        Raises:
            ValueError: If the string form of any component contains '-->'.
        """
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload
    @classmethod
    def _parse_payload(cls, cue_text: str) -> list[_WebVTTCueComponent]:
        """Convert the cue text into a tree of cue components.
        End tags that do not match the innermost open span are ignored and spans
        still open at the end of the text are closed implicitly, so malformed
        markup neither raises an ``IndexError`` nor drops the enclosed text.
        Args:
            cue_text: The cue payload as a single line of text.
        Returns:
            The top-level components of the cue payload.
        Raises:
            ValueError: If a component fails validation, e.g. text containing '<'.
        """
        simple_spans: dict[str, type] = {
            "i": _WebVTTCueItalicSpan,
            "b": _WebVTTCueBoldSpan,
            "u": _WebVTTCueUnderlineSpan,
            "c": _WebVTTCueClassSpan,
        }
        # levels[0] collects the top-level payload; every open span adds a level
        levels: list[list[_WebVTTCueComponent]] = [[]]
        # Raw start tags of the currently open spans, innermost last
        open_tags: list[str] = []
        def close_innermost() -> None:
            children = levels.pop()
            start_tag = open_tags.pop()
            kind = start_tag[1]
            if kind != "v":
                levels[-1].append(simple_spans[kind](components=children))
                return
            voice_match = cls._pattern_voice_tag.match(start_tag)
            annotation = voice_match.group("annotation") if voice_match else None
            if not voice_match or not annotation:
                # A voice tag without annotation cannot be modeled: keep its
                # content in the parent instead of discarding the text.
                levels[-1].extend(children)
                return
            class_string = voice_match.group("class") or ""
            levels[-1].append(
                _WebVTTCueVoiceSpan(
                    annotation=annotation.strip(),
                    classes=[c for c in class_string.split(".") if c],
                    components=children,
                )
            )
        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                levels[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            pos = match.end()
            tag = match.group(0)
            slash, name = match.group(1), match.group(2)
            kind = name[0]
            if kind != "v" and tag != f"<{slash}{name}>":
                # i, b, u and c tags carrying an annotation are not supported
                continue
            if not slash:
                open_tags.append(tag)
                levels.append([])
            elif open_tags and open_tags[-1][1] == kind:
                close_innermost()
            # any other end tag has no matching open span and is ignored
        if pos < len(cue_text):
            levels[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
        # The end tag may be omitted (typically for voice spans): close what is open
        while open_tags:
            close_innermost()
        return levels[0]
    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Parse the raw text of a single cue block.
        The block consists of an optional identifier line, a timings line
        containing '-->', and zero or more payload lines, which are joined with a
        single space before being parsed.
        Args:
            raw: The text of the cue block.
        Returns:
            The parsed cue block.
        Raises:
            ValueError: If the block is empty, has no valid timings line, or any
                of its parts fails validation.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        cue_lines = lines[1:]
        if "-->" not in timing_line and len(lines) > 1:
            # The first line is a cue identifier, the timings follow on the next one
            identifier, timing_line, cue_lines = lines[0], lines[1], lines[2:]
        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")
        start, _, end = timing_line.partition("-->")
        if "-->" in end:
            raise ValueError("Cue timings must contain exactly one '-->'")
        end = re.split(" |\t", end.strip())[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start.strip()), end=_WebVTTTimestamp(raw=end)
        )
        payload = cls._parse_payload(" ".join(cue_lines).strip())
        return cls(
            identifier=identifier,
            timings=timings,
            payload=payload,
        )
    def __str__(self):
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))
        return "".join(parts)
 class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""
    cue_blocks: list[_WebVTTCueBlock]
    # Start of a block that is not a cue: a NOTE comment (followed by a space, a
    # tab or a line break) or a STYLE / REGION definition line.
    _pattern_non_cue: ClassVar[re.Pattern] = re.compile(
        r"NOTE(?:[ \t\n]|\Z)|(?:STYLE|REGION)[ \t]*(?:\n|\Z)"
    )
    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check that the content starts with the WebVTT file signature.
        The signature is the string ``WEBVTT``, optionally preceded by a byte order
        mark and followed by nothing, a space, a tab or a line terminator.
        Carriage returns are accepted because callers may pass content whose
        newlines have not been normalized.
        Args:
            content: The text of the file.
        Returns:
            True if the content starts with a valid signature, False otherwise.
        """
        if not content:
            return False
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")
    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into cue blocks.
        NOTE, STYLE and REGION blocks are skipped. A block that cannot be parsed as
        a cue is logged as a warning and left out.
        Args:
            raw: The text of the file.
        Returns:
            The parsed file.
        Raises:
            ValueError: If the text does not start with the WebVTT signature.
        """
        # Normalize newlines to LF
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")
        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")
        # Split into blocks first, so that NOTE/STYLE/REGION are only recognized at
        # the start of a block and are dropped as a whole.
        cues: list[_WebVTTCueBlock] = []
        for block in re.split(r"\n\s*\n", body.strip()):
            if not block or cls._pattern_non_cue.match(block):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
        return cls(cue_blocks=cues)
    def __iter__(self):
        return iter(self.cue_blocks)
    def __getitem__(self, idx):
        return self.cue_blocks[idx]
    def __len__(self):
        return len(self.cue_blocks)
 class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.
    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1
    Each cue block becomes a group holding its identifier (if any), its timings and
    its payload as text items; the groups are appended to the document body in the
    order in which the cue blocks appear in the file.
    """
    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read the whole file content as UTF-8 text.
        Raises:
            RuntimeError: If the content cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)
        self.content: str = ""
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.content = self.path_or_stream.getvalue().decode("utf-8")
            elif isinstance(self.path_or_stream, Path):
                self.content = self.path_or_stream.read_text(encoding="utf-8")
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e
    @override
    def is_valid(self) -> bool:
        """Tell whether the content starts with the WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)
    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False
    @override
    def unload(self):
        """Close the input stream, if any, and drop the reference to the input."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.VTT}
    @staticmethod
    def _component_text(item: _WebVTTCueComponent) -> str:
        """Return the text of a cue component, including the text of nested spans."""
        if isinstance(item, _WebVTTCueTextSpan):
            return item.text
        return "".join(
            WebVTTDocumentBackend._component_text(child) for child in item.components
        )
    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.
        The formatting reflects the type of ``item`` only. The text of nested spans
        is included so that no cue text is lost, but their own formatting is not
        represented. Nothing is added if the text is empty after stripping.
        Args:
            doc: The document to which the text item is added.
            item: The cue component providing the text.
            parent: The node under which the text item is added.
        """
        formatting = Formatting()
        if isinstance(item, _WebVTTCueItalicSpan):
            formatting.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            formatting.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            formatting.underline = True
        if text := WebVTTDocumentBackend._component_text(item).strip():
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=text,
                parent=parent,
                content_layer=ContentLayer.BODY,
                formatting=formatting,
            )
    @override
    def convert(self) -> DoclingDocument:
        """Convert the WebVTT content into a DoclingDocument.
        Returns:
            A document with one section group per successfully parsed cue block.
        Raises:
            RuntimeError: If the content does not start with the WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if not isinstance(cue_span, _WebVTTCueVoiceSpan):
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, cue_span, block_group
                    )
                    continue
                # A voice span becomes an inline group: the speaker label first,
                # followed by one text item per component of the span.
                voice_group = doc.add_group(
                    label=GroupLabel.INLINE,
                    name="WebVTT cue voice span",
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
                voice = cue_span.annotation
                if classes := cue_span.classes:
                    voice += f" ({', '.join(classes)})"
                voice += ": "
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=voice,
                    parent=voice_group,
                    content_layer=ContentLayer.BODY,
                )
                for item in cue_span.components:
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, item, voice_group
                    )
        return doc
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,7 +1,6 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union
 import numpy as np
 from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
 )
 from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (
+from docling_core.types.io import DocumentStream
    DocumentStream,
 )
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
    METS_GBS = "mets_gbs"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
    VTT = "vtt"
 class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
    DOCTAGS = "doctags"
-FormatToExtensions: Dict[InputFormat, List[str]] = {
+FormatToExtensions: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["tar.gz"],
    InputFormat.JSON_DOCLING: ["json"],
    InputFormat.AUDIO: ["wav", "mp3"],
    InputFormat.VTT: ["vtt"],
 }
-FormatToMimeType: Dict[InputFormat, List[str]] = {
+FormatToMimeType: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.METS_GBS: ["application/mets+xml"],
    InputFormat.JSON_DOCLING: ["application/json"],
    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
    InputFormat.VTT: ["text/vtt"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
-    cells: List[TextCell] = []
+    cells: list[TextCell] = []
-    children: List["Cluster"] = []  # Add child cluster support
+    children: list["Cluster"] = []  # Add child cluster support
    @field_serializer("confidence")
    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
 class LayoutPrediction(BaseModel):
-    clusters: List[Cluster] = []
+    clusters: list[Cluster] = []
 class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
 class Table(BasePageElement):
-    otsl_seq: List[str]
+    otsl_seq: list[str]
    num_rows: int = 0
    num_cols: int = 0
-    table_cells: List[TableCell]
+    table_cells: list[TableCell]
 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, Table] = {}
+    table_map: dict[int, Table] = {}
 class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
 class FigureElement(BasePageElement):
-    annotations: List[PictureDataType] = []
+    annotations: list[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
 class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
-    figure_map: Dict[int, FigureElement] = {}
+    figure_map: dict[int, FigureElement] = {}
 class EquationPrediction(BaseModel):
    equation_count: int = 0
-    equation_map: Dict[int, TextElement] = {}
+    equation_map: dict[int, TextElement] = {}
 class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
 class AssembledUnit(BaseModel):
-    elements: List[PageElement] = []
+    elements: list[PageElement] = []
-    body: List[PageElement] = []
+    body: list[PageElement] = []
-    headers: List[PageElement] = []
+    headers: list[PageElement] = []
 class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[
+    _image_cache: dict[
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
    @property
-    def cells(self) -> List[TextCell]:
+    def cells(self) -> list[TextCell]:
        """Return text cells as a read-only view of parsed_page.textline_cells."""
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
    id: str
    model: Optional[str] = None  # returned by openai
-    choices: List[OpenAiResponseChoice]
+    choices: list[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
 class ConfidenceReport(PageConfidenceScores):
-    pages: Dict[int, PageConfidenceScores] = Field(
+    pages: dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
            mime = FormatToMimeType[InputFormat.PPTX][0]
        elif ext in FormatToExtensions[InputFormat.XLSX]:
            mime = FormatToMimeType[InputFormat.XLSX][0]
        elif ext in FormatToExtensions[InputFormat.VTT]:
            mime = FormatToMimeType[InputFormat.VTT][0]
        return mime
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
        InputFormat.VTT: FormatOption(
            pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
        ),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docs/index.md
+++ b/docs/index.md
@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
 * 💬 Parsing of Web Video Text Tracks (WebVTT) files
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
 * 📝 Parsing of Web Video Text Tracks (WebVTT) files
 ## Get started
--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
 | PDF | |
 | DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
 | Markdown | |
-| AsciiDoc | |
+| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
 | HTML, XHTML | |
 | CSV | |
 | PNG, JPEG, TIFF, BMP, WEBP | Image formats |
 | WebVTT | Web Video Text Tracks format for displaying timed text |
 Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
 | Markdown | |
 | JSON | Lossless serialization of Docling Document |
 | Text | Plain text, i.e. without Markdown markers |
-| Doctags | |
+| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
 requires-python = '>=3.9,<4.0'
 dependencies = [
  'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.48.0,<3.0.0)',
+  'docling-core[chunking] (>=2.48.2,<3.0.0)',
  'docling-parse (>=4.4.0,<5.0.0)',
  "docling-ibm-models>=3.9.1,<4",
  'filetype (>=1.2.0,<2.0.0)',
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -0,0 +1,66 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 00:11.000 --> 00:13.000
    item-3 at level 2: inline: group WebVTT cue voice span
      item-4 at level 3: text: Roger Bingham: 
      item-5 at level 3: text: We are in New York City
  item-6 at level 1: section: group WebVTT cue block
    item-7 at level 2: text: 00:13.000 --> 00:16.000
    item-8 at level 2: inline: group WebVTT cue voice span
      item-9 at level 3: text: Roger Bingham: 
      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
  item-11 at level 1: section: group WebVTT cue block
    item-12 at level 2: text: 00:16.000 --> 00:18.000
    item-13 at level 2: inline: group WebVTT cue voice span
      item-14 at level 3: text: Roger Bingham: 
      item-15 at level 3: text: from the American Museum of Natural History
  item-16 at level 1: section: group WebVTT cue block
    item-17 at level 2: text: 00:18.000 --> 00:20.000
    item-18 at level 2: inline: group WebVTT cue voice span
      item-19 at level 3: text: Roger Bingham: 
      item-20 at level 3: text: And with me is Neil deGrasse Tyson
  item-21 at level 1: section: group WebVTT cue block
    item-22 at level 2: text: 00:20.000 --> 00:22.000
    item-23 at level 2: inline: group WebVTT cue voice span
      item-24 at level 3: text: Roger Bingham: 
      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
  item-26 at level 1: section: group WebVTT cue block
    item-27 at level 2: text: 00:22.000 --> 00:24.000
    item-28 at level 2: inline: group WebVTT cue voice span
      item-29 at level 3: text: Roger Bingham: 
      item-30 at level 3: text: at the AMNH.
  item-31 at level 1: section: group WebVTT cue block
    item-32 at level 2: text: 00:24.000 --> 00:26.000
    item-33 at level 2: inline: group WebVTT cue voice span
      item-34 at level 3: text: Roger Bingham: 
      item-35 at level 3: text: Thank you for walking down here.
  item-36 at level 1: section: group WebVTT cue block
    item-37 at level 2: text: 00:27.000 --> 00:30.000
    item-38 at level 2: inline: group WebVTT cue voice span
      item-39 at level 3: text: Roger Bingham: 
      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
  item-41 at level 1: section: group WebVTT cue block
    item-42 at level 2: text: 00:30.000 --> 00:31.500
    item-43 at level 2: inline: group WebVTT cue voice span
      item-44 at level 3: text: Roger Bingham: 
      item-45 at level 3: text: When we e-mailed—
  item-46 at level 1: section: group WebVTT cue block
    item-47 at level 2: text: 00:30.500 --> 00:32.500
    item-48 at level 2: inline: group WebVTT cue voice span
      item-49 at level 3: text: Neil deGrasse Tyson: 
      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
  item-51 at level 1: section: group WebVTT cue block
    item-52 at level 2: text: 00:32.000 --> 00:35.500
    item-53 at level 2: inline: group WebVTT cue voice span
      item-54 at level 3: text: Roger Bingham: 
      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
  item-56 at level 1: section: group WebVTT cue block
    item-57 at level 2: text: 00:32.500 --> 00:33.500
    item-58 at level 2: inline: group WebVTT cue voice span
      item-59 at level 3: text: Neil deGrasse Tyson: 
      item-60 at level 3: text: Laughs
  item-61 at level 1: section: group WebVTT cue block
    item-62 at level 2: text: 00:35.500 --> 00:38.000
    item-63 at level 2: inline: group WebVTT cue voice span
      item-64 at level 3: text: Roger Bingham: 
      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -0,0 +1,51 @@
 00:11.000 --> 00:13.000
 Roger Bingham:  We are in New York City
 00:13.000 --> 00:16.000
 Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
 00:16.000 --> 00:18.000
 Roger Bingham:  from the American Museum of Natural History
 00:18.000 --> 00:20.000
 Roger Bingham:  And with me is Neil deGrasse Tyson
 00:20.000 --> 00:22.000
 Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
 00:22.000 --> 00:24.000
 Roger Bingham:  at the AMNH.
 00:24.000 --> 00:26.000
 Roger Bingham:  Thank you for walking down here.
 00:27.000 --> 00:30.000
 Roger Bingham:  And I want to do a follow-up on the last conversation we did.
 00:30.000 --> 00:31.500
 Roger Bingham:  When we e-mailed—
 00:30.500 --> 00:32.500
 Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
 00:32.000 --> 00:35.500
 Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
 00:32.500 --> 00:33.500
 Neil deGrasse Tyson:  *Laughs*
 00:35.500 --> 00:38.000
 Roger Bingham:  You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -0,0 +1,22 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 00:00.000 --> 00:02.000
    item-3 at level 2: inline: group WebVTT cue voice span
      item-4 at level 3: text: Esme (first, loud): 
      item-5 at level 3: text: It’s a blue apple tree!
  item-6 at level 1: section: group WebVTT cue block
    item-7 at level 2: text: 00:02.000 --> 00:04.000
    item-8 at level 2: inline: group WebVTT cue voice span
      item-9 at level 3: text: Mary: 
      item-10 at level 3: text: No way!
  item-11 at level 1: section: group WebVTT cue block
    item-12 at level 2: text: 00:04.000 --> 00:06.000
    item-13 at level 2: inline: group WebVTT cue voice span
      item-14 at level 3: text: Esme: 
      item-15 at level 3: text: Hee!
    item-16 at level 2: text: laughter
  item-17 at level 1: section: group WebVTT cue block
    item-18 at level 2: text: 00:06.000 --> 00:08.000
    item-19 at level 2: inline: group WebVTT cue voice span
      item-20 at level 3: text: Mary (loud): 
      item-21 at level 3: text: That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -0,0 +1,376 @@
 {
  "schema_name": "DoclingDocument",
  "version": "1.6.0",
  "name": "webvtt_example_02",
  "origin": {
    "mimetype": "text/vtt",
    "binary_hash": 12867774546881601731,
    "filename": "webvtt_example_02.vtt"
  },
  "furniture": {
    "self_ref": "#/furniture",
    "children": [],
    "content_layer": "furniture",
    "name": "_root_",
    "label": "unspecified"
  },
  "body": {
    "self_ref": "#/body",
    "children": [
      {
        "$ref": "#/groups/0"
      },
      {
        "$ref": "#/groups/2"
      },
      {
        "$ref": "#/groups/4"
      },
      {
        "$ref": "#/groups/6"
      }
    ],
    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
  "groups": [
    {
      "self_ref": "#/groups/0",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/0"
        },
        {
          "$ref": "#/groups/1"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/1",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [
        {
          "$ref": "#/texts/1"
        },
        {
          "$ref": "#/texts/2"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/2",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/3"
        },
        {
          "$ref": "#/groups/3"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/3",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [
        {
          "$ref": "#/texts/4"
        },
        {
          "$ref": "#/texts/5"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/4",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/6"
        },
        {
          "$ref": "#/groups/5"
        },
        {
          "$ref": "#/texts/9"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/5",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/texts/7"
        },
        {
          "$ref": "#/texts/8"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/6",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/texts/10"
        },
        {
          "$ref": "#/groups/7"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue block",
      "label": "section"
    },
    {
      "self_ref": "#/groups/7",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [
        {
          "$ref": "#/texts/11"
        },
        {
          "$ref": "#/texts/12"
        }
      ],
      "content_layer": "body",
      "name": "WebVTT cue voice span",
      "label": "inline"
    }
  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:00.000 --> 00:02.000",
      "text": "00:00.000 --> 00:02.000"
    },
    {
      "self_ref": "#/texts/1",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Esme (first, loud): ",
      "text": "Esme (first, loud): "
    },
    {
      "self_ref": "#/texts/2",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "It’s a blue apple tree!",
      "text": "It’s a blue apple tree!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/3",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:02.000 --> 00:04.000",
      "text": "00:02.000 --> 00:04.000"
    },
    {
      "self_ref": "#/texts/4",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Mary: ",
      "text": "Mary: "
    },
    {
      "self_ref": "#/texts/5",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "No way!",
      "text": "No way!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/6",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:04.000 --> 00:06.000",
      "text": "00:04.000 --> 00:06.000"
    },
    {
      "self_ref": "#/texts/7",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Esme: ",
      "text": "Esme: "
    },
    {
      "self_ref": "#/texts/8",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Hee!",
      "text": "Hee!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/9",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "laughter",
      "text": "laughter",
      "formatting": {
        "bold": false,
        "italic": true,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/10",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "00:06.000 --> 00:08.000",
      "text": "00:06.000 --> 00:08.000"
    },
    {
      "self_ref": "#/texts/11",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Mary (loud): ",
      "text": "Mary (loud): "
    },
    {
      "self_ref": "#/texts/12",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "That’s awesome!",
      "text": "That’s awesome!",
      "formatting": {
        "bold": false,
        "italic": false,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    }
  ],
  "pictures": [],
  "tables": [],
  "key_value_items": [],
  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -0,0 +1,17 @@
 00:00.000 --> 00:02.000
 Esme (first, loud):  It’s a blue apple tree!
 00:02.000 --> 00:04.000
 Mary:  No way!
 00:04.000 --> 00:06.000
 Esme:  Hee!
 *laughter*
 00:06.000 --> 00:08.000
 Mary (loud):  That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -0,0 +1,77 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: section: group WebVTT cue block
    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
    item-4 at level 2: inline: group WebVTT cue voice span
      item-5 at level 3: text: Speaker A: 
      item-6 at level 3: text: OK, I think now we should be recording
  item-7 at level 1: section: group WebVTT cue block
    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
    item-10 at level 2: inline: group WebVTT cue voice span
      item-11 at level 3: text: Speaker A: 
      item-12 at level 3: text: properly.
  item-13 at level 1: section: group WebVTT cue block
    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
    item-16 at level 2: text: Good.
  item-17 at level 1: section: group WebVTT cue block
    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
    item-20 at level 2: inline: group WebVTT cue voice span
      item-21 at level 3: text: Speaker A: 
      item-22 at level 3: text: Yeah.
  item-23 at level 1: section: group WebVTT cue block
    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
    item-26 at level 2: inline: group WebVTT cue voice span
      item-27 at level 3: text: Speaker B: 
      item-28 at level 3: text: I was also thinking.
  item-29 at level 1: section: group WebVTT cue block
    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
    item-32 at level 2: inline: group WebVTT cue voice span
      item-33 at level 3: text: Speaker B: 
      item-34 at level 3: text: Would be maybe good to create items,
  item-35 at level 1: section: group WebVTT cue block
    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
    item-38 at level 2: inline: group WebVTT cue voice span
      item-39 at level 3: text: Speaker B: 
      item-40 at level 3: text: some metadata, some options that can be specific.
  item-41 at level 1: section: group WebVTT cue block
    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
    item-44 at level 2: inline: group WebVTT cue voice span
      item-45 at level 3: text: Speaker A: 
      item-46 at level 3: text: Yeah, I mean I think you went even more than
  item-47 at level 1: section: group WebVTT cue block
    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
    item-50 at level 2: inline: group WebVTT cue voice span
      item-51 at level 3: text: Speaker B: 
      item-52 at level 3: text: But we preserved the atoms.
  item-53 at level 1: section: group WebVTT cue block
    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
    item-56 at level 2: inline: group WebVTT cue voice span
      item-57 at level 3: text: Speaker A: 
      item-58 at level 3: text: than me. I just opened the format.
  item-59 at level 1: section: group WebVTT cue block
    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
    item-62 at level 2: inline: group WebVTT cue voice span
      item-63 at level 3: text: Speaker A: 
      item-64 at level 3: text: give it a try, yeah.
  item-65 at level 1: section: group WebVTT cue block
    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
    item-68 at level 2: inline: group WebVTT cue voice span
      item-69 at level 3: text: Speaker B: 
      item-70 at level 3: text: Okay, talk to you later.
  item-71 at level 1: section: group WebVTT cue block
    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
    item-74 at level 2: inline: group WebVTT cue voice span
      item-75 at level 3: text: Speaker A: 
      item-76 at level 3: text: See you.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -0,0 +1,77 @@
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
 00:00:04.963 --> 00:00:08.571
 Speaker A:  OK, I think now we should be recording
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
 00:00:08.571 --> 00:00:09.403
 Speaker A:  properly.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
 00:00:10.683 --> 00:00:11.563
 Good.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
 00:00:13.363 --> 00:00:13.803
 Speaker A:  Yeah.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
 00:00:49.603 --> 00:00:53.363
 Speaker B:  I was also thinking.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
 00:00:54.963 --> 00:01:02.072
 Speaker B:  Would be maybe good to create items,
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
 00:01:02.072 --> 00:01:06.811
 Speaker B:  some metadata, some options that can be specific.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
 00:01:10.243 --> 00:01:13.014
 Speaker A:  Yeah, I mean I think you went even more than
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
 00:01:10.563 --> 00:01:12.643
 Speaker B:  But we preserved the atoms.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
 00:01:13.014 --> 00:01:15.907
 Speaker A:  than me. I just opened the format.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
 00:01:50.222 --> 00:01:51.643
 Speaker A:  give it a try, yeah.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
 00:01:52.043 --> 00:01:55.043
 Speaker B:  Okay, talk to you later.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
 00:01:54.603 --> 00:01:55.283
 Speaker A:  See you.
--- a/tests/data/webvtt/webvtt_example_01.vtt
+++ b/tests/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
 WEBVTT
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 00:11.000 --> 00:13.000
 <v Roger Bingham>We are in New York City
 00:13.000 --> 00:16.000
 <v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
 00:16.000 --> 00:18.000
 <v Roger Bingham>from the American Museum of Natural History
 00:18.000 --> 00:20.000
 <v Roger Bingham>And with me is Neil deGrasse Tyson
 00:20.000 --> 00:22.000
 <v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
 00:22.000 --> 00:24.000
 <v Roger Bingham>at the AMNH.
 00:24.000 --> 00:26.000
 <v Roger Bingham>Thank you for walking down here.
 00:27.000 --> 00:30.000
 <v Roger Bingham>And I want to do a follow-up on the last conversation we did.
 00:30.000 --> 00:31.500 align:right size:50%
 <v Roger Bingham>When we e-mailed—
 00:30.500 --> 00:32.500 align:left size:50%
 <v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
 00:32.000 --> 00:35.500 align:right size:50%
 <v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
 00:32.500 --> 00:33.500 align:left size:50%
 <v Neil deGrasse Tyson><i>Laughs</i>
 00:35.500 --> 00:38.000
 <v Roger Bingham>You know I’m so excited my glasses are falling off here.
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
 WEBVTT
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 00:00.000 --> 00:02.000
 <v.first.loud Esme>It’s a blue apple tree!
 00:02.000 --> 00:04.000
 <v Mary>No way!
 00:04.000 --> 00:06.000
 <v Esme>Hee!</v> <i>laughter</i>
 00:06.000 --> 00:08.000
 <v.loud Mary>That’s awesome!
--- a/tests/data/webvtt/webvtt_example_03.vtt
+++ b/tests/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
 WEBVTT
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
 00:00:04.963 --> 00:00:08.571
 <v Speaker A>OK,
 I think now we should be recording</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
 00:00:08.571 --> 00:00:09.403
 <v Speaker A>properly.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
 00:00:10.683 --> 00:00:11.563
 Good.
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
 00:00:13.363 --> 00:00:13.803
 <v Speaker A>Yeah.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
 00:00:49.603 --> 00:00:53.363
 <v Speaker B>I was also thinking.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
 00:00:54.963 --> 00:01:02.072
 <v Speaker B>Would be maybe good to create items,</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
 00:01:02.072 --> 00:01:06.811
 <v Speaker B>some metadata,
 some options that can be specific.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
 00:01:10.243 --> 00:01:13.014
 <v Speaker A>Yeah,
 I mean I think you went even more than</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
 00:01:10.563 --> 00:01:12.643
 <v Speaker B>But we preserved the atoms.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
 00:01:13.014 --> 00:01:15.907
 <v Speaker A>than me.
 I just opened the format.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
 00:01:50.222 --> 00:01:51.643
 <v Speaker A>give it a try, yeah.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
 00:01:52.043 --> 00:01:55.043
 <v Speaker B>Okay, talk to you later.</v>
 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
 00:01:54.603 --> 00:01:55.283
 <v Speaker A>See you.</v>
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,232 @@
 # Assisted by watsonx Code Assistant
 from pathlib import Path
 import pytest
 from docling_core.types.doc import DoclingDocument
 from pydantic import ValidationError
 from docling.backend.webvtt_backend import (
    _WebVTTCueItalicSpan,
    _WebVTTCueTextSpan,
    _WebVTTCueTimings,
    _WebVTTCueVoiceSpan,
    _WebVTTFile,
    _WebVTTTimestamp,
 )
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter
 from .test_data_gen_flag import GEN_TEST_DATA
 from .verify_utils import verify_document, verify_export
 GENERATE = GEN_TEST_DATA
 def test_vtt_cue_commponents():
     """Exercise the WebVTT building blocks: timestamps, cue timings and cue spans."""
     # Well-formed timestamps paired with the total number of seconds they encode.
     timestamp_cases = [
         ("00:01:02.345", 1 * 60 + 2.345),
         ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
         ("02:34.567", 2 * 60 + 34.567),
         ("00:00:00.000", 0.0),
     ]
     for raw_ts, total_seconds in timestamp_cases:
         parsed_ts = _WebVTTTimestamp(raw=raw_ts)
         assert parsed_ts.seconds == total_seconds
     # Malformed timestamps must be rejected at validation time.
     for bad_ts in (
         "00:60:02.345",  # minutes > 59
         "00:01:60.345",  # seconds > 59
         "00:01:02.1000",  # milliseconds > 999
         "01:02:03",  # missing milliseconds
         "01:02",  # missing milliseconds
         ":01:02.345",  # extra : for missing hours
         "abc:01:02.345",  # invalid format
     ):
         with pytest.raises(ValidationError):
             _WebVTTTimestamp(raw=bad_ts)
     # The string form of a timestamp is the raw value it was built from.
     assert str(_WebVTTTimestamp(raw="00:01:02.345")) == "00:01:02.345"
     # Valid cue timings: both ends are kept and rendered with the arrow separator.
     begin = _WebVTTTimestamp(raw="00:10.005")
     finish = _WebVTTTimestamp(raw="00:14.007")
     timings = _WebVTTCueTimings(start=begin, end=finish)
     assert timings.start == begin
     assert timings.end == finish
     assert str(timings) == "00:10.005 --> 00:14.007"
     # Invalid cue timings: the end timestamp lies before the start timestamp.
     begin = _WebVTTTimestamp(raw="00:10.700")
     finish = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as failure:
         _WebVTTCueTimings(start=begin, end=finish)
     assert "End timestamp must be greater than start timestamp" in str(failure.value)
     # Invalid cue timings: the end timestamp is missing.
     begin = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as failure:
         _WebVTTCueTimings(start=begin)
     assert "Field required" in str(failure.value)
     # Invalid cue timings: the start timestamp is missing.
     finish = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as failure:
         _WebVTTCueTimings(end=finish)
     assert "Field required" in str(failure.value)
     # A plain text span keeps its text and renders as that same text.
     plain_text = "This is a valid cue text span."
     text_span = _WebVTTCueTextSpan(text=plain_text)
     assert text_span.text == plain_text
     assert str(text_span) == plain_text
     # Text with a newline, an ampersand or a less-than sign, as well as empty
     # text, is not accepted in a cue text span.
     for rejected_text in (
         "This cue text span\ncontains a newline.",
         "This cue text span contains &.",
         "This cue text span contains <.",
         "",
     ):
         with pytest.raises(ValidationError):
             _WebVTTCueTextSpan(text=rejected_text)
     # Voice span annotation: a newline is rejected, a simple token is accepted.
     with pytest.raises(ValidationError):
         _WebVTTCueVoiceSpan(annotation="invalid\nannotation")
     assert _WebVTTCueVoiceSpan(annotation="valid-annotation")
     # Voice span classes: names with newlines or empty names are rejected.
     speaker = "speaker name"
     with pytest.raises(ValidationError):
         _WebVTTCueVoiceSpan(
             annotation=speaker, classes=["class\nwith\nnewlines", ""]
         )
     assert _WebVTTCueVoiceSpan(annotation=speaker, classes=["class1", "class2"])
     # Voice span components: only cue component objects are accepted.
     speaker = "speaker name"
     good_components = [_WebVTTCueTextSpan(text="random text")]
     with pytest.raises(ValidationError):
         _WebVTTCueVoiceSpan(annotation=speaker, components=[123, "not a component"])
     assert _WebVTTCueVoiceSpan(annotation=speaker, components=good_components)
     # Serialization of a voice span with classes ...
     voice_span = _WebVTTCueVoiceSpan(
         annotation="speaker",
         classes=["loud", "clear"],
         components=[_WebVTTCueTextSpan(text="random text")],
     )
     assert str(voice_span) == "<v.loud.clear speaker>random text</v>"
     # ... and without classes.
     voice_span = _WebVTTCueVoiceSpan(
         annotation="speaker",
         components=[_WebVTTCueTextSpan(text="random text")],
     )
     assert str(voice_span) == "<v speaker>random text</v>"
 def test_webvtt_file():
     """Parse the three sample WebVTT files and check the resulting cue blocks.

     The sample files are opened with paths relative to the repository root, so
     the test has to be run from there.
     """
     # Example 01: voice spans only; the cue at index 11 wraps an italic span.
     with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
         content = f.read()
         vtt = _WebVTTFile.parse(content)
     assert len(vtt) == 13
     block = vtt.cue_blocks[11]
     assert str(block.timings) == "00:32.500 --> 00:33.500"
     assert len(block.payload) == 1
     cue_span = block.payload[0]
     assert isinstance(cue_span, _WebVTTCueVoiceSpan)
     assert cue_span.annotation == "Neil deGrasse Tyson"
     assert not cue_span.classes
     assert len(cue_span.components) == 1
     comp = cue_span.components[0]
     assert isinstance(comp, _WebVTTCueItalicSpan)
     assert len(comp.components) == 1
     comp2 = comp.components[0]
     assert isinstance(comp2, _WebVTTCueTextSpan)
     assert comp2.text == "Laughs"
     # Example 02: round trip. The header and the NOTE block are rebuilt by hand
     # here; the cue blocks are re-serialized with str() and must reproduce the
     # original file content exactly.
     with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
         content = f.read()
         vtt = _WebVTTFile.parse(content)
     assert len(vtt) == 4
     reverse = (
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
         "https://www.w3.org/TR/webvtt1/\n\n"
     )
     reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
     assert content == reverse
     # Example 03: every cue block carries an identifier.
     with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
         content = f.read()
         vtt = _WebVTTFile.parse(content)
     assert len(vtt) == 13
     for block in vtt:
         assert block.identifier
     block = vtt.cue_blocks[0]
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
     assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
     assert len(block.payload) == 1
     assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
     # The third cue has no voice tag, so its payload is a bare text span. The
     # former isinstance check on `cue_span` was dropped from this section: that
     # variable still referred to a cue of example 01 and said nothing about
     # this block.
     block = vtt.cue_blocks[2]
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
     assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
     assert len(block.payload) == 1
     assert isinstance(block.payload[0], _WebVTTCueTextSpan)
     assert block.payload[0].text == "Good."
 def test_e2e_vtt_conversions():
     """Convert every sample .vtt file and compare its exports with the ground truth.

     The Markdown export, the indented-text export and the document itself are
     each checked against the matching file in tests/data/groundtruth/docling_v2.
     GENERATE is handed to the verify helpers (presumably to regenerate the
     ground-truth files instead of comparing; confirm in verify_utils).
     """
     source_dir = Path("./tests/data/webvtt/")
     converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
     # Sorted so that the files are processed in a deterministic order.
     for vtt in sorted(source_dir.rglob("*.vtt")):
         # Ground-truth files share the sample's name plus an export suffix.
         gt_prefix = str(vtt.parent.parent / "groundtruth" / "docling_v2" / vtt.name)
         result: ConversionResult = converter.convert(vtt)
         doc: DoclingDocument = result.document
         markdown = doc.export_to_markdown(escape_html=False)
         md_matches = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
         assert md_matches, "export to md"
         indented = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
         itxt_matches = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
         assert itxt_matches, "export to indented-text"
         assert verify_document(doc, gt_prefix + ".json", GENERATE)
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None
    # Valid WebVTT
    buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
    stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
    assert dci._guess_format(stream) == InputFormat.VTT
    # Valid Docling JSON
    test_str = '{"name": ""}'
    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
--- a/uv.lock
+++ b/uv.lock
@@ -1154,7 +1154,7 @@ requires-dist = [
    { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
    { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
    { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
    { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
    { name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
    { name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
 [[package]]
 name = "docling-core"
-version = "2.48.1"
+version = "2.48.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
    { name = "typer" },
    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
+    { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
 ]
 [package.optional-dependencies]
@@ -4936,6 +4936,9 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
    { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
    { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
    { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
    { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
    { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
    { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
    { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
    { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -4957,6 +4960,8 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
    { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
    { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
    { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
    { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
    { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
    { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
    { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },