diff --git a/README.md b/README.md
index d3cd4935..a65803b3 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Installation
diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
new file mode 100644
index 00000000..2a7d02ce
--- /dev/null
+++ b/docling/backend/webvtt_backend.py
@@ -0,0 +1,572 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+
+from docling_core.types.doc import (
+ ContentLayer,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ Formatting,
+ GroupLabel,
+ NodeItem,
+)
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic.types import StringConstraints
+from typing_extensions import Self, override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class _WebVTTTimestamp(BaseModel):
+ """Model representing a WebVTT timestamp.
+
+ A WebVTT timestamp is always interpreted relative to the current playback position
+ of the media data that the WebVTT file is to be synchronized with.
+ """
+
+ model_config = ConfigDict(regex_engine="python-re")
+
+ raw: Annotated[
+ str,
+ Field(
+ description="A representation of the WebVTT Timestamp as a single string"
+ ),
+ ]
+
+ _pattern: ClassVar[re.Pattern] = re.compile(
+ r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
+ )
+ _hours: int
+ _minutes: int
+ _seconds: int
+ _millis: int
+
+ @model_validator(mode="after")
+ def validate_raw(self) -> Self:
+ m = self._pattern.match(self.raw)
+ if not m:
+ raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
+ self._hours = int(m.group(1)) if m.group(1) else 0
+ self._minutes = int(m.group(2))
+ self._seconds = int(m.group(3))
+ self._millis = int(m.group(4))
+
+ if self._minutes < 0 or self._minutes > 59:
+ raise ValueError("Minutes must be between 0 and 59")
+ if self._seconds < 0 or self._seconds > 59:
+ raise ValueError("Seconds must be between 0 and 59")
+
+ return self
+
+ @property
+ def seconds(self) -> float:
+ """A representation of the WebVTT Timestamp in seconds"""
+ return (
+ self._hours * 3600
+ + self._minutes * 60
+ + self._seconds
+ + self._millis / 1000.0
+ )
+
+ @override
+ def __str__(self) -> str:
+ return self.raw
+
+
+# A cue identifier is a non-empty, single-line string (no LF or CR) that does not
+# contain the "-->" arrow used by the cue timings line.
+_WebVTTCueIdentifier = Annotated[
+ str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
+class _WebVTTCueTimings(BaseModel):
+ """Model representating WebVTT cue timings."""
+
+ start: Annotated[
+ _WebVTTTimestamp, Field(description="Start time offset of the cue")
+ ]
+ end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+
+ @model_validator(mode="after")
+ def check_order(self) -> Self:
+ if self.start and self.end:
+ if self.end.seconds <= self.start.seconds:
+ raise ValueError("End timestamp must be greater than start timestamp")
+ return self
+
+ @override
+ def __str__(self):
+ return f"{self.start} --> {self.end}"
+
+
+class _WebVTTCueTextSpan(BaseModel):
+ """Model representing a WebVTT cue text span."""
+
+ text: str
+ span_type: Literal["text"] = "text"
+
+ @field_validator("text", mode="after")
+ @classmethod
+ def validate_text(cls, value: str) -> str:
+ if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+ raise ValueError("Cue text span contains invalid characters")
+ if len(value) == 0:
+ raise ValueError("Cue text span cannot be empty")
+ return value
+
+ @override
+ def __str__(self):
+ return self.text
+
+
+class _WebVTTCueVoiceSpan(BaseModel):
+ """Model representing a WebVTT cue voice span."""
+
+ annotation: Annotated[
+ str,
+ Field(
+ description=(
+ "Cue span start tag annotation text representing the name of thevoice"
+ )
+ ),
+ ]
+ classes: Annotated[
+ list[str],
+ Field(description="List of classes representing the cue span's significance"),
+ ] = []
+ components: Annotated[
+ list["_WebVTTCueComponent"],
+ Field(description="The components representing the cue internal text"),
+ ] = []
+ span_type: Literal["v"] = "v"
+
+ @field_validator("annotation", mode="after")
+ @classmethod
+ def validate_annotation(cls, value: str) -> str:
+ if any(ch in value for ch in {"\n", "\r", "&", ">"}):
+ raise ValueError(
+ "Cue span start tag annotation contains invalid characters"
+ )
+ if not value:
+ raise ValueError("Cue text span cannot be empty")
+ return value
+
+ @field_validator("classes", mode="after")
+ @classmethod
+ def validate_classes(cls, value: list[str]) -> list[str]:
+ for item in value:
+ if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
+ raise ValueError(
+ "A cue span start tag class contains invalid characters"
+ )
+ if not item:
+ raise ValueError("Cue span start tag classes cannot be empty")
+ return value
+
+ @override
+ def __str__(self):
+ tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
+ inner = "".join(str(span) for span in self.components)
+ return f"<{tag} {self.annotation}>{inner}"
+
+
+class _WebVTTCueClassSpan(BaseModel):
+ span_type: Literal["c"] = "c"
+ components: list["_WebVTTCueComponent"]
+
+ @override
+ def __str__(self):
+ inner = "".join(str(span) for span in self.components)
+ return f"{inner}"
+
+
+class _WebVTTCueItalicSpan(BaseModel):
+ span_type: Literal["i"] = "i"
+ components: list["_WebVTTCueComponent"]
+
+ @override
+ def __str__(self):
+ inner = "".join(str(span) for span in self.components)
+ return f"{inner}"
+
+
+class _WebVTTCueBoldSpan(BaseModel):
+ span_type: Literal["b"] = "b"
+ components: list["_WebVTTCueComponent"]
+
+ @override
+ def __str__(self):
+ inner = "".join(str(span) for span in self.components)
+ return f"{inner}"
+
+
+class _WebVTTCueUnderlineSpan(BaseModel):
+ span_type: Literal["u"] = "u"
+ components: list["_WebVTTCueComponent"]
+
+ @override
+ def __str__(self):
+ inner = "".join(str(span) for span in self.components)
+ return f"{inner}"
+
+
+# Tagged union of every supported cue payload component. Each member declares a
+# distinct ``span_type`` literal, which is used as the discriminator to select
+# the concrete model.
+_WebVTTCueComponent = Annotated[
+ Union[
+ _WebVTTCueTextSpan,
+ _WebVTTCueClassSpan,
+ _WebVTTCueItalicSpan,
+ _WebVTTCueBoldSpan,
+ _WebVTTCueUnderlineSpan,
+ _WebVTTCueVoiceSpan,
+ ],
+ Field(discriminator="span_type", description="The WebVTT cue component"),
+]
+
+
+class _WebVTTCueBlock(BaseModel):
+ """Model representing a WebVTT cue block.
+
+ The optional WebVTT cue settings list is not supported.
+ The cue payload is limited to the following spans: text, class, italic, bold,
+ underline, and voice.
+ """
+
+ model_config = ConfigDict(regex_engine="python-re")
+
+ identifier: Optional[_WebVTTCueIdentifier] = Field(
+ None, description="The WebVTT cue identifier"
+ )
+ timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+ payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+
+ _pattern_block: ClassVar[re.Pattern] = re.compile(
+ r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
+ )
+ _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
+ r"^\.[^\t\n\r &<>]+)?" # zero or more classes
+ r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation
+ )
+
+ @field_validator("payload", mode="after")
+ @classmethod
+ def validate_payload(cls, payload):
+ for voice in payload:
+ if "-->" in str(voice):
+ raise ValueError("Cue payload must not contain '-->'")
+ return payload
+
+ @classmethod
+ def parse(cls, raw: str) -> "_WebVTTCueBlock":
+ lines = raw.strip().splitlines()
+ if not lines:
+ raise ValueError("Cue block must have at least one line")
+ identifier: Optional[_WebVTTCueIdentifier] = None
+ timing_line = lines[0]
+ if "-->" not in timing_line and len(lines) > 1:
+ identifier = timing_line
+ timing_line = lines[1]
+ cue_lines = lines[2:]
+ else:
+ cue_lines = lines[1:]
+
+ if "-->" not in timing_line:
+ raise ValueError("Cue block must contain WebVTT cue timings")
+
+ start, end = [t.strip() for t in timing_line.split("-->")]
+ end = re.split(" |\t", end)[0] # ignore the cue settings list
+ timings: _WebVTTCueTimings = _WebVTTCueTimings(
+ start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+ )
+ cue_text = " ".join(cue_lines).strip()
+ if cue_text.startswith("" not in cue_text:
+ # adding close tag for cue voice spans without end tag
+ cue_text += ""
+
+ stack: list[list[_WebVTTCueComponent]] = [[]]
+ tag_stack: list[Union[str, tuple]] = []
+
+ pos = 0
+ matches = list(cls._pattern_block.finditer(cue_text))
+ i = 0
+ while i < len(matches):
+ match = matches[i]
+ if match.start() > pos:
+ stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+ tag = match.group(0)
+
+ if tag.startswith(("", "", "", "")):
+ tag_type = tag[1:2]
+ tag_stack.append(tag_type)
+ stack.append([])
+ elif tag == "":
+ children = stack.pop()
+ stack[-1].append(_WebVTTCueItalicSpan(components=children))
+ tag_stack.pop()
+ elif tag == "":
+ children = stack.pop()
+ stack[-1].append(_WebVTTCueBoldSpan(components=children))
+ tag_stack.pop()
+ elif tag == "":
+ children = stack.pop()
+ stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
+ tag_stack.pop()
+ elif tag == "":
+ children = stack.pop()
+ stack[-1].append(_WebVTTCueClassSpan(components=children))
+ tag_stack.pop()
+ elif tag.startswith(""))
+ else:
+ parts.append(str(span))
+
+ return "".join(parts)
+
+
+class _WebVTTFile(BaseModel):
+ """A model representing a WebVTT file."""
+
+ cue_blocks: list[_WebVTTCueBlock]
+
+ @staticmethod
+ def verify_signature(content: str) -> bool:
+ if not content:
+ return False
+ elif len(content) == 6:
+ return content == "WEBVTT"
+ elif len(content) > 6 and content.startswith("WEBVTT"):
+ return content[6] in (" ", "\t", "\n")
+ else:
+ return False
+
+ @classmethod
+ def parse(cls, raw: str) -> "_WebVTTFile":
+ # Normalize newlines to LF
+ raw = raw.replace("\r\n", "\n").replace("\r", "\n")
+
+ # Check WebVTT signature
+ if not cls.verify_signature(raw):
+ raise ValueError("Invalid WebVTT file signature")
+
+ # Strip "WEBVTT" header line
+ lines = raw.split("\n", 1)
+ body = lines[1] if len(lines) > 1 else ""
+
+ # Remove NOTE/STYLE/REGION blocks
+ body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
+ body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
+
+ # Split into cue blocks
+ raw_blocks = re.split(r"\n\s*\n", body.strip())
+ cues: list[_WebVTTCueBlock] = []
+ for block in raw_blocks:
+ try:
+ cues.append(_WebVTTCueBlock.parse(block))
+ except ValueError as e:
+ _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
+
+ return cls(cue_blocks=cues)
+
+ def __iter__(self):
+ return iter(self.cue_blocks)
+
+ def __getitem__(self, idx):
+ return self.cue_blocks[idx]
+
+ def __len__(self):
+ return len(self.cue_blocks)
+
+
+class WebVTTDocumentBackend(DeclarativeDocumentBackend):
+ """Declarative backend for WebVTT (.vtt) files.
+
+ This parser reads the content of a WebVTT file and converts
+ it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1
+
+ Each cue becomes a TextItem and the items are appended to the
+ document body by the cue's start time.
+ """
+
+ @override
+ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+ super().__init__(in_doc, path_or_stream)
+
+ self.content: str = ""
+ try:
+ if isinstance(self.path_or_stream, BytesIO):
+ self.content = self.path_or_stream.getvalue().decode("utf-8")
+ if isinstance(self.path_or_stream, Path):
+ with open(self.path_or_stream, encoding="utf-8") as f:
+ self.content = f.read()
+ except Exception as e:
+ raise RuntimeError(
+ "Could not initialize the WebVTT backend for file with hash "
+ f"{self.document_hash}."
+ ) from e
+
+ @override
+ def is_valid(self) -> bool:
+ return _WebVTTFile.verify_signature(self.content)
+
+ @classmethod
+ @override
+ def supports_pagination(cls) -> bool:
+ return False
+
+ @override
+ def unload(self):
+ if isinstance(self.path_or_stream, BytesIO):
+ self.path_or_stream.close()
+ self.path_or_stream = None
+
+ @classmethod
+ @override
+ def supported_formats(cls) -> set[InputFormat]:
+ return {InputFormat.VTT}
+
+ @staticmethod
+ def _add_text_from_component(
+ doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
+ ) -> None:
+ """Adds a TextItem to a document by extracting text from a cue span component.
+
+ TODO: address nesting
+ """
+ formatting = Formatting()
+ text = ""
+ if isinstance(item, _WebVTTCueItalicSpan):
+ formatting.italic = True
+ elif isinstance(item, _WebVTTCueBoldSpan):
+ formatting.bold = True
+ elif isinstance(item, _WebVTTCueUnderlineSpan):
+ formatting.underline = True
+ if isinstance(item, _WebVTTCueTextSpan):
+ text = item.text
+ else:
+ # TODO: address nesting
+ text = "".join(
+ [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
+ )
+ if text := text.strip():
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ text=text,
+ parent=parent,
+ content_layer=ContentLayer.BODY,
+ formatting=formatting,
+ )
+
+ @override
+ def convert(self) -> DoclingDocument:
+ _log.debug("Starting WebVTT conversion...")
+ if not self.is_valid():
+ raise RuntimeError("Invalid WebVTT document.")
+
+ origin = DocumentOrigin(
+ filename=self.file.name or "file",
+ mimetype="text/vtt",
+ binary_hash=self.document_hash,
+ )
+ doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+ vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
+ for block in vtt.cue_blocks:
+ block_group = doc.add_group(
+ label=GroupLabel.SECTION,
+ name="WebVTT cue block",
+ parent=None,
+ content_layer=ContentLayer.BODY,
+ )
+ if block.identifier:
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ text=str(block.identifier),
+ parent=block_group,
+ content_layer=ContentLayer.BODY,
+ )
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ text=str(block.timings),
+ parent=block_group,
+ content_layer=ContentLayer.BODY,
+ )
+ for cue_span in block.payload:
+ if isinstance(cue_span, _WebVTTCueVoiceSpan):
+ voice_group = doc.add_group(
+ label=GroupLabel.INLINE,
+ name="WebVTT cue voice span",
+ parent=block_group,
+ content_layer=ContentLayer.BODY,
+ )
+ voice = cue_span.annotation
+ if classes := cue_span.classes:
+ voice += f" ({', '.join(classes)})"
+ voice += ": "
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ text=voice,
+ parent=voice_group,
+ content_layer=ContentLayer.BODY,
+ )
+ for item in cue_span.components:
+ WebVTTDocumentBackend._add_text_from_component(
+ doc, item, voice_group
+ )
+ else:
+ WebVTTDocumentBackend._add_text_from_component(
+ doc, cue_span, block_group
+ )
+
+ return doc
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 25a4386e..627ecf5f 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,7 +1,6 @@
-import math
from collections import defaultdict
from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union
import numpy as np
from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
)
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (
- DocumentStream,
-)
+from docling_core.types.io import DocumentStream
# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
METS_GBS = "mets_gbs"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
+ VTT = "vtt"
class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
DOCTAGS = "doctags"
-FormatToExtensions: Dict[InputFormat, List[str]] = {
+FormatToExtensions: dict[InputFormat, list[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["tar.gz"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
+ InputFormat.VTT: ["vtt"],
}
-FormatToMimeType: Dict[InputFormat, List[str]] = {
+FormatToMimeType: dict[InputFormat, list[str]] = {
InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["application/mets+xml"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
+ InputFormat.VTT: ["text/vtt"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
- cells: List[TextCell] = []
- children: List["Cluster"] = [] # Add child cluster support
+ cells: list[TextCell] = []
+ children: list["Cluster"] = [] # Add child cluster support
@field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
class LayoutPrediction(BaseModel):
- clusters: List[Cluster] = []
+ clusters: list[Cluster] = []
class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
class Table(BasePageElement):
- otsl_seq: List[str]
+ otsl_seq: list[str]
num_rows: int = 0
num_cols: int = 0
- table_cells: List[TableCell]
+ table_cells: list[TableCell]
class TableStructurePrediction(BaseModel):
- table_map: Dict[int, Table] = {}
+ table_map: dict[int, Table] = {}
class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
class FigureElement(BasePageElement):
- annotations: List[PictureDataType] = []
+ annotations: list[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
class FigureClassificationPrediction(BaseModel):
figure_count: int = 0
- figure_map: Dict[int, FigureElement] = {}
+ figure_map: dict[int, FigureElement] = {}
class EquationPrediction(BaseModel):
equation_count: int = 0
- equation_map: Dict[int, TextElement] = {}
+ equation_map: dict[int, TextElement] = {}
class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
class AssembledUnit(BaseModel):
- elements: List[PageElement] = []
- body: List[PageElement] = []
- headers: List[PageElement] = []
+ elements: list[PageElement] = []
+ body: list[PageElement] = []
+ headers: list[PageElement] = []
class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
- _image_cache: Dict[
+ _image_cache: dict[
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
- def cells(self) -> List[TextCell]:
+ def cells(self) -> list[TextCell]:
"""Return text cells as a read-only view of parsed_page.textline_cells."""
if self.parsed_page is not None:
return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
id: str
model: Optional[str] = None # returned by openai
- choices: List[OpenAiResponseChoice]
+ choices: list[OpenAiResponseChoice]
created: int
usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
class ConfidenceReport(PageConfidenceScores):
- pages: Dict[int, PageConfidenceScores] = Field(
+ pages: dict[int, PageConfidenceScores] = Field(
default_factory=lambda: defaultdict(PageConfidenceScores)
)
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 7955ff9d..8ea45482 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.PPTX][0]
elif ext in FormatToExtensions[InputFormat.XLSX]:
mime = FormatToMimeType[InputFormat.XLSX][0]
+ elif ext in FormatToExtensions[InputFormat.VTT]:
+ mime = FormatToMimeType[InputFormat.VTT][0]
return mime
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 1c314903..5d64d633 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.noop_backend import NoOpBackend
+from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
+ InputFormat.VTT: FormatOption(
+ pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
+ ),
}
if (options := format_to_default_options.get(format)) is not None:
return options
diff --git a/docs/index.md b/docs/index.md
index a41b1303..d18b6d21 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Get started
diff --git a/docs/usage/supported_formats.md b/docs/usage/supported_formats.md
index c38e7ffa..09f25ed5 100644
--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
| PDF | |
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
| Markdown | |
-| AsciiDoc | |
+| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
| HTML, XHTML | |
| CSV | |
| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
+| WebVTT | Web Video Text Tracks format for displaying timed text |
Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
| Markdown | |
| JSON | Lossless serialization of Docling Document |
| Text | Plain text, i.e. without Markdown markers |
-| Doctags | |
+| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |
diff --git a/pyproject.toml b/pyproject.toml
index 116d61f9..e22a1d15 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
- 'docling-core[chunking] (>=2.48.0,<3.0.0)',
+ 'docling-core[chunking] (>=2.48.2,<3.0.0)',
'docling-parse (>=4.4.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
new file mode 100644
index 00000000..d7840e99
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -0,0 +1,66 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: section: group WebVTT cue block
+ item-2 at level 2: text: 00:11.000 --> 00:13.000
+ item-3 at level 2: inline: group WebVTT cue voice span
+ item-4 at level 3: text: Roger Bingham:
+ item-5 at level 3: text: We are in New York City
+ item-6 at level 1: section: group WebVTT cue block
+ item-7 at level 2: text: 00:13.000 --> 00:16.000
+ item-8 at level 2: inline: group WebVTT cue voice span
+ item-9 at level 3: text: Roger Bingham:
+ item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
+ item-11 at level 1: section: group WebVTT cue block
+ item-12 at level 2: text: 00:16.000 --> 00:18.000
+ item-13 at level 2: inline: group WebVTT cue voice span
+ item-14 at level 3: text: Roger Bingham:
+ item-15 at level 3: text: from the American Museum of Natural History
+ item-16 at level 1: section: group WebVTT cue block
+ item-17 at level 2: text: 00:18.000 --> 00:20.000
+ item-18 at level 2: inline: group WebVTT cue voice span
+ item-19 at level 3: text: Roger Bingham:
+ item-20 at level 3: text: And with me is Neil deGrasse Tyson
+ item-21 at level 1: section: group WebVTT cue block
+ item-22 at level 2: text: 00:20.000 --> 00:22.000
+ item-23 at level 2: inline: group WebVTT cue voice span
+ item-24 at level 3: text: Roger Bingham:
+ item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
+ item-26 at level 1: section: group WebVTT cue block
+ item-27 at level 2: text: 00:22.000 --> 00:24.000
+ item-28 at level 2: inline: group WebVTT cue voice span
+ item-29 at level 3: text: Roger Bingham:
+ item-30 at level 3: text: at the AMNH.
+ item-31 at level 1: section: group WebVTT cue block
+ item-32 at level 2: text: 00:24.000 --> 00:26.000
+ item-33 at level 2: inline: group WebVTT cue voice span
+ item-34 at level 3: text: Roger Bingham:
+ item-35 at level 3: text: Thank you for walking down here.
+ item-36 at level 1: section: group WebVTT cue block
+ item-37 at level 2: text: 00:27.000 --> 00:30.000
+ item-38 at level 2: inline: group WebVTT cue voice span
+ item-39 at level 3: text: Roger Bingham:
+ item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
+ item-41 at level 1: section: group WebVTT cue block
+ item-42 at level 2: text: 00:30.000 --> 00:31.500
+ item-43 at level 2: inline: group WebVTT cue voice span
+ item-44 at level 3: text: Roger Bingham:
+ item-45 at level 3: text: When we e-mailed—
+ item-46 at level 1: section: group WebVTT cue block
+ item-47 at level 2: text: 00:30.500 --> 00:32.500
+ item-48 at level 2: inline: group WebVTT cue voice span
+ item-49 at level 3: text: Neil deGrasse Tyson:
+ item-50 at level 3: text: Didn’t we talk about enough in that conversation?
+ item-51 at level 1: section: group WebVTT cue block
+ item-52 at level 2: text: 00:32.000 --> 00:35.500
+ item-53 at level 2: inline: group WebVTT cue voice span
+ item-54 at level 3: text: Roger Bingham:
+ item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
+ item-56 at level 1: section: group WebVTT cue block
+ item-57 at level 2: text: 00:32.500 --> 00:33.500
+ item-58 at level 2: inline: group WebVTT cue voice span
+ item-59 at level 3: text: Neil deGrasse Tyson:
+ item-60 at level 3: text: Laughs
+ item-61 at level 1: section: group WebVTT cue block
+ item-62 at level 2: text: 00:35.500 --> 00:38.000
+ item-63 at level 2: inline: group WebVTT cue voice span
+ item-64 at level 3: text: Roger Bingham:
+ item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
new file mode 100644
index 00000000..0d34890e
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
@@ -0,0 +1,1074 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.6.0",
+ "name": "webvtt_example_01",
+ "origin": {
+ "mimetype": "text/vtt",
+ "binary_hash": 16887312431371817791,
+ "filename": "webvtt_example_01.vtt"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/2"
+ },
+ {
+ "$ref": "#/groups/4"
+ },
+ {
+ "$ref": "#/groups/6"
+ },
+ {
+ "$ref": "#/groups/8"
+ },
+ {
+ "$ref": "#/groups/10"
+ },
+ {
+ "$ref": "#/groups/12"
+ },
+ {
+ "$ref": "#/groups/14"
+ },
+ {
+ "$ref": "#/groups/16"
+ },
+ {
+ "$ref": "#/groups/18"
+ },
+ {
+ "$ref": "#/groups/20"
+ },
+ {
+ "$ref": "#/groups/22"
+ },
+ {
+ "$ref": "#/groups/24"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/groups/1"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/texts/2"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/2",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/3"
+ },
+ {
+ "$ref": "#/groups/3"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/3",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/4"
+ },
+ {
+ "$ref": "#/texts/5"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/4",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/groups/5"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/5",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/7"
+ },
+ {
+ "$ref": "#/texts/8"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/6",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/9"
+ },
+ {
+ "$ref": "#/groups/7"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/7",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/10"
+ },
+ {
+ "$ref": "#/texts/11"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/8",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/12"
+ },
+ {
+ "$ref": "#/groups/9"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/9",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/13"
+ },
+ {
+ "$ref": "#/texts/14"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/10",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/15"
+ },
+ {
+ "$ref": "#/groups/11"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/11",
+ "parent": {
+ "$ref": "#/groups/10"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/16"
+ },
+ {
+ "$ref": "#/texts/17"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/12",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/18"
+ },
+ {
+ "$ref": "#/groups/13"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/13",
+ "parent": {
+ "$ref": "#/groups/12"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/19"
+ },
+ {
+ "$ref": "#/texts/20"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/14",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/21"
+ },
+ {
+ "$ref": "#/groups/15"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/15",
+ "parent": {
+ "$ref": "#/groups/14"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/22"
+ },
+ {
+ "$ref": "#/texts/23"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/16",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/24"
+ },
+ {
+ "$ref": "#/groups/17"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/17",
+ "parent": {
+ "$ref": "#/groups/16"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/25"
+ },
+ {
+ "$ref": "#/texts/26"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/18",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/27"
+ },
+ {
+ "$ref": "#/groups/19"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/19",
+ "parent": {
+ "$ref": "#/groups/18"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/28"
+ },
+ {
+ "$ref": "#/texts/29"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/20",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/30"
+ },
+ {
+ "$ref": "#/groups/21"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/21",
+ "parent": {
+ "$ref": "#/groups/20"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/31"
+ },
+ {
+ "$ref": "#/texts/32"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/22",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/33"
+ },
+ {
+ "$ref": "#/groups/23"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/23",
+ "parent": {
+ "$ref": "#/groups/22"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/34"
+ },
+ {
+ "$ref": "#/texts/35"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/24",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/36"
+ },
+ {
+ "$ref": "#/groups/25"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/25",
+ "parent": {
+ "$ref": "#/groups/24"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/37"
+ },
+ {
+ "$ref": "#/texts/38"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:11.000 --> 00:13.000",
+ "text": "00:11.000 --> 00:13.000"
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "We are in New York City",
+ "text": "We are in New York City",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:13.000 --> 00:16.000",
+ "text": "00:13.000 --> 00:16.000"
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "We’re actually at the Lucern Hotel, just down the street",
+ "text": "We’re actually at the Lucern Hotel, just down the street",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:16.000 --> 00:18.000",
+ "text": "00:16.000 --> 00:18.000"
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "from the American Museum of Natural History",
+ "text": "from the American Museum of Natural History",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:18.000 --> 00:20.000",
+ "text": "00:18.000 --> 00:20.000"
+ },
+ {
+ "self_ref": "#/texts/10",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/11",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "And with me is Neil deGrasse Tyson",
+ "text": "And with me is Neil deGrasse Tyson",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/12",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:20.000 --> 00:22.000",
+ "text": "00:20.000 --> 00:22.000"
+ },
+ {
+ "self_ref": "#/texts/13",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/14",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Astrophysicist, Director of the Hayden Planetarium",
+ "text": "Astrophysicist, Director of the Hayden Planetarium",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/15",
+ "parent": {
+ "$ref": "#/groups/10"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:22.000 --> 00:24.000",
+ "text": "00:22.000 --> 00:24.000"
+ },
+ {
+ "self_ref": "#/texts/16",
+ "parent": {
+ "$ref": "#/groups/11"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/17",
+ "parent": {
+ "$ref": "#/groups/11"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "at the AMNH.",
+ "text": "at the AMNH.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/18",
+ "parent": {
+ "$ref": "#/groups/12"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:24.000 --> 00:26.000",
+ "text": "00:24.000 --> 00:26.000"
+ },
+ {
+ "self_ref": "#/texts/19",
+ "parent": {
+ "$ref": "#/groups/13"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/20",
+ "parent": {
+ "$ref": "#/groups/13"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Thank you for walking down here.",
+ "text": "Thank you for walking down here.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/21",
+ "parent": {
+ "$ref": "#/groups/14"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:27.000 --> 00:30.000",
+ "text": "00:27.000 --> 00:30.000"
+ },
+ {
+ "self_ref": "#/texts/22",
+ "parent": {
+ "$ref": "#/groups/15"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/23",
+ "parent": {
+ "$ref": "#/groups/15"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "And I want to do a follow-up on the last conversation we did.",
+ "text": "And I want to do a follow-up on the last conversation we did.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/24",
+ "parent": {
+ "$ref": "#/groups/16"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:30.000 --> 00:31.500",
+ "text": "00:30.000 --> 00:31.500"
+ },
+ {
+ "self_ref": "#/texts/25",
+ "parent": {
+ "$ref": "#/groups/17"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/26",
+ "parent": {
+ "$ref": "#/groups/17"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "When we e-mailed—",
+ "text": "When we e-mailed—",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/27",
+ "parent": {
+ "$ref": "#/groups/18"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:30.500 --> 00:32.500",
+ "text": "00:30.500 --> 00:32.500"
+ },
+ {
+ "self_ref": "#/texts/28",
+ "parent": {
+ "$ref": "#/groups/19"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Neil deGrasse Tyson: ",
+ "text": "Neil deGrasse Tyson: "
+ },
+ {
+ "self_ref": "#/texts/29",
+ "parent": {
+ "$ref": "#/groups/19"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Didn’t we talk about enough in that conversation?",
+ "text": "Didn’t we talk about enough in that conversation?",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/30",
+ "parent": {
+ "$ref": "#/groups/20"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:32.000 --> 00:35.500",
+ "text": "00:32.000 --> 00:35.500"
+ },
+ {
+ "self_ref": "#/texts/31",
+ "parent": {
+ "$ref": "#/groups/21"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/32",
+ "parent": {
+ "$ref": "#/groups/21"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+ "text": "No! No no no no; 'cos 'cos obviously 'cos",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/33",
+ "parent": {
+ "$ref": "#/groups/22"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:32.500 --> 00:33.500",
+ "text": "00:32.500 --> 00:33.500"
+ },
+ {
+ "self_ref": "#/texts/34",
+ "parent": {
+ "$ref": "#/groups/23"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Neil deGrasse Tyson: ",
+ "text": "Neil deGrasse Tyson: "
+ },
+ {
+ "self_ref": "#/texts/35",
+ "parent": {
+ "$ref": "#/groups/23"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Laughs",
+ "text": "Laughs",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/36",
+ "parent": {
+ "$ref": "#/groups/24"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:35.500 --> 00:38.000",
+ "text": "00:35.500 --> 00:38.000"
+ },
+ {
+ "self_ref": "#/texts/37",
+ "parent": {
+ "$ref": "#/groups/25"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Roger Bingham: ",
+ "text": "Roger Bingham: "
+ },
+ {
+ "self_ref": "#/texts/38",
+ "parent": {
+ "$ref": "#/groups/25"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "You know I’m so excited my glasses are falling off here.",
+ "text": "You know I’m so excited my glasses are falling off here.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ }
+ ],
+ "pictures": [],
+ "tables": [],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
new file mode 100644
index 00000000..c5767028
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -0,0 +1,51 @@
+00:11.000 --> 00:13.000
+
+Roger Bingham: We are in New York City
+
+00:13.000 --> 00:16.000
+
+Roger Bingham: We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+
+Roger Bingham: from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+
+Roger Bingham: And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+
+Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+
+Roger Bingham: at the AMNH.
+
+00:24.000 --> 00:26.000
+
+Roger Bingham: Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+
+Roger Bingham: And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500
+
+Roger Bingham: When we e-mailed—
+
+00:30.500 --> 00:32.500
+
+Neil deGrasse Tyson: Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500
+
+Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500
+
+Neil deGrasse Tyson: *Laughs*
+
+00:35.500 --> 00:38.000
+
+Roger Bingham: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
new file mode 100644
index 00000000..6d90404f
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -0,0 +1,22 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: section: group WebVTT cue block
+ item-2 at level 2: text: 00:00.000 --> 00:02.000
+ item-3 at level 2: inline: group WebVTT cue voice span
+ item-4 at level 3: text: Esme (first, loud):
+ item-5 at level 3: text: It’s a blue apple tree!
+ item-6 at level 1: section: group WebVTT cue block
+ item-7 at level 2: text: 00:02.000 --> 00:04.000
+ item-8 at level 2: inline: group WebVTT cue voice span
+ item-9 at level 3: text: Mary:
+ item-10 at level 3: text: No way!
+ item-11 at level 1: section: group WebVTT cue block
+ item-12 at level 2: text: 00:04.000 --> 00:06.000
+ item-13 at level 2: inline: group WebVTT cue voice span
+ item-14 at level 3: text: Esme:
+ item-15 at level 3: text: Hee!
+ item-16 at level 2: text: laughter
+ item-17 at level 1: section: group WebVTT cue block
+ item-18 at level 2: text: 00:06.000 --> 00:08.000
+ item-19 at level 2: inline: group WebVTT cue voice span
+ item-20 at level 3: text: Mary (loud):
+ item-21 at level 3: text: That’s awesome!
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
new file mode 100644
index 00000000..c7700ae2
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -0,0 +1,376 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.6.0",
+ "name": "webvtt_example_02",
+ "origin": {
+ "mimetype": "text/vtt",
+ "binary_hash": 12867774546881601731,
+ "filename": "webvtt_example_02.vtt"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/2"
+ },
+ {
+ "$ref": "#/groups/4"
+ },
+ {
+ "$ref": "#/groups/6"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/groups/1"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/texts/2"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/2",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/3"
+ },
+ {
+ "$ref": "#/groups/3"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/3",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/4"
+ },
+ {
+ "$ref": "#/texts/5"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/4",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/groups/5"
+ },
+ {
+ "$ref": "#/texts/9"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/5",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/7"
+ },
+ {
+ "$ref": "#/texts/8"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/6",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/10"
+ },
+ {
+ "$ref": "#/groups/7"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/7",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/11"
+ },
+ {
+ "$ref": "#/texts/12"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00.000 --> 00:02.000",
+ "text": "00:00.000 --> 00:02.000"
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Esme (first, loud): ",
+ "text": "Esme (first, loud): "
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "It’s a blue apple tree!",
+ "text": "It’s a blue apple tree!",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:02.000 --> 00:04.000",
+ "text": "00:02.000 --> 00:04.000"
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Mary: ",
+ "text": "Mary: "
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "No way!",
+ "text": "No way!",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:04.000 --> 00:06.000",
+ "text": "00:04.000 --> 00:06.000"
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Esme: ",
+ "text": "Esme: "
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Hee!",
+ "text": "Hee!",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "laughter",
+ "text": "laughter",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/10",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:06.000 --> 00:08.000",
+ "text": "00:06.000 --> 00:08.000"
+ },
+ {
+ "self_ref": "#/texts/11",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Mary (loud): ",
+ "text": "Mary (loud): "
+ },
+ {
+ "self_ref": "#/texts/12",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "That’s awesome!",
+ "text": "That’s awesome!",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ }
+ ],
+ "pictures": [],
+ "tables": [],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
new file mode 100644
index 00000000..db84cf11
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -0,0 +1,17 @@
+00:00.000 --> 00:02.000
+
+Esme (first, loud): It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+
+Mary: No way!
+
+00:04.000 --> 00:06.000
+
+Esme: Hee!
+
+*laughter*
+
+00:06.000 --> 00:08.000
+
+Mary (loud): That’s awesome!
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
new file mode 100644
index 00000000..ca344e59
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -0,0 +1,77 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: section: group WebVTT cue block
+ item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+ item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
+ item-4 at level 2: inline: group WebVTT cue voice span
+ item-5 at level 3: text: Speaker A:
+ item-6 at level 3: text: OK, I think now we should be recording
+ item-7 at level 1: section: group WebVTT cue block
+ item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+ item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
+ item-10 at level 2: inline: group WebVTT cue voice span
+ item-11 at level 3: text: Speaker A:
+ item-12 at level 3: text: properly.
+ item-13 at level 1: section: group WebVTT cue block
+ item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+ item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
+ item-16 at level 2: text: Good.
+ item-17 at level 1: section: group WebVTT cue block
+ item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+ item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
+ item-20 at level 2: inline: group WebVTT cue voice span
+ item-21 at level 3: text: Speaker A:
+ item-22 at level 3: text: Yeah.
+ item-23 at level 1: section: group WebVTT cue block
+ item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+ item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
+ item-26 at level 2: inline: group WebVTT cue voice span
+ item-27 at level 3: text: Speaker B:
+ item-28 at level 3: text: I was also thinking.
+ item-29 at level 1: section: group WebVTT cue block
+ item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+ item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
+ item-32 at level 2: inline: group WebVTT cue voice span
+ item-33 at level 3: text: Speaker B:
+ item-34 at level 3: text: Would be maybe good to create items,
+ item-35 at level 1: section: group WebVTT cue block
+ item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+ item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
+ item-38 at level 2: inline: group WebVTT cue voice span
+ item-39 at level 3: text: Speaker B:
+ item-40 at level 3: text: some metadata, some options that can be specific.
+ item-41 at level 1: section: group WebVTT cue block
+ item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+ item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
+ item-44 at level 2: inline: group WebVTT cue voice span
+ item-45 at level 3: text: Speaker A:
+ item-46 at level 3: text: Yeah, I mean I think you went even more than
+ item-47 at level 1: section: group WebVTT cue block
+ item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+ item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
+ item-50 at level 2: inline: group WebVTT cue voice span
+ item-51 at level 3: text: Speaker B:
+ item-52 at level 3: text: But we preserved the atoms.
+ item-53 at level 1: section: group WebVTT cue block
+ item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+ item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
+ item-56 at level 2: inline: group WebVTT cue voice span
+ item-57 at level 3: text: Speaker A:
+ item-58 at level 3: text: than me. I just opened the format.
+ item-59 at level 1: section: group WebVTT cue block
+ item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+ item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
+ item-62 at level 2: inline: group WebVTT cue voice span
+ item-63 at level 3: text: Speaker A:
+ item-64 at level 3: text: give it a try, yeah.
+ item-65 at level 1: section: group WebVTT cue block
+ item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+ item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
+ item-68 at level 2: inline: group WebVTT cue voice span
+ item-69 at level 3: text: Speaker B:
+ item-70 at level 3: text: Okay, talk to you later.
+ item-71 at level 1: section: group WebVTT cue block
+ item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+ item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
+ item-74 at level 2: inline: group WebVTT cue voice span
+ item-75 at level 3: text: Speaker A:
+ item-76 at level 3: text: See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
new file mode 100644
index 00000000..5b833971
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
@@ -0,0 +1,1240 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.6.0",
+ "name": "webvtt_example_03",
+ "origin": {
+ "mimetype": "text/vtt",
+ "binary_hash": 11620880316586573676,
+ "filename": "webvtt_example_03.vtt"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/2"
+ },
+ {
+ "$ref": "#/groups/4"
+ },
+ {
+ "$ref": "#/groups/5"
+ },
+ {
+ "$ref": "#/groups/7"
+ },
+ {
+ "$ref": "#/groups/9"
+ },
+ {
+ "$ref": "#/groups/11"
+ },
+ {
+ "$ref": "#/groups/13"
+ },
+ {
+ "$ref": "#/groups/15"
+ },
+ {
+ "$ref": "#/groups/17"
+ },
+ {
+ "$ref": "#/groups/19"
+ },
+ {
+ "$ref": "#/groups/21"
+ },
+ {
+ "$ref": "#/groups/23"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/groups/1"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/2"
+ },
+ {
+ "$ref": "#/texts/3"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/2",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/4"
+ },
+ {
+ "$ref": "#/texts/5"
+ },
+ {
+ "$ref": "#/groups/3"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/3",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/texts/7"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/4",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/8"
+ },
+ {
+ "$ref": "#/texts/9"
+ },
+ {
+ "$ref": "#/texts/10"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/5",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/11"
+ },
+ {
+ "$ref": "#/texts/12"
+ },
+ {
+ "$ref": "#/groups/6"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/6",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/13"
+ },
+ {
+ "$ref": "#/texts/14"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/7",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/15"
+ },
+ {
+ "$ref": "#/texts/16"
+ },
+ {
+ "$ref": "#/groups/8"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/8",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/17"
+ },
+ {
+ "$ref": "#/texts/18"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/9",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/19"
+ },
+ {
+ "$ref": "#/texts/20"
+ },
+ {
+ "$ref": "#/groups/10"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/10",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/21"
+ },
+ {
+ "$ref": "#/texts/22"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/11",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/23"
+ },
+ {
+ "$ref": "#/texts/24"
+ },
+ {
+ "$ref": "#/groups/12"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/12",
+ "parent": {
+ "$ref": "#/groups/11"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/25"
+ },
+ {
+ "$ref": "#/texts/26"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/13",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/27"
+ },
+ {
+ "$ref": "#/texts/28"
+ },
+ {
+ "$ref": "#/groups/14"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/14",
+ "parent": {
+ "$ref": "#/groups/13"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/29"
+ },
+ {
+ "$ref": "#/texts/30"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/15",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/31"
+ },
+ {
+ "$ref": "#/texts/32"
+ },
+ {
+ "$ref": "#/groups/16"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/16",
+ "parent": {
+ "$ref": "#/groups/15"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/33"
+ },
+ {
+ "$ref": "#/texts/34"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/17",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/35"
+ },
+ {
+ "$ref": "#/texts/36"
+ },
+ {
+ "$ref": "#/groups/18"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/18",
+ "parent": {
+ "$ref": "#/groups/17"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/37"
+ },
+ {
+ "$ref": "#/texts/38"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/19",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/39"
+ },
+ {
+ "$ref": "#/texts/40"
+ },
+ {
+ "$ref": "#/groups/20"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/20",
+ "parent": {
+ "$ref": "#/groups/19"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/41"
+ },
+ {
+ "$ref": "#/texts/42"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/21",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/43"
+ },
+ {
+ "$ref": "#/texts/44"
+ },
+ {
+ "$ref": "#/groups/22"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/22",
+ "parent": {
+ "$ref": "#/groups/21"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/45"
+ },
+ {
+ "$ref": "#/texts/46"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/23",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/47"
+ },
+ {
+ "$ref": "#/texts/48"
+ },
+ {
+ "$ref": "#/groups/24"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue block",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/24",
+ "parent": {
+ "$ref": "#/groups/23"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/49"
+ },
+ {
+ "$ref": "#/texts/50"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue voice span",
+ "label": "inline"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:04.963 --> 00:00:08.571",
+ "text": "00:00:04.963 --> 00:00:08.571"
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "OK, I think now we should be recording",
+ "text": "OK, I think now we should be recording",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1"
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:08.571 --> 00:00:09.403",
+ "text": "00:00:08.571 --> 00:00:09.403"
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "properly.",
+ "text": "properly.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:10.683 --> 00:00:11.563",
+ "text": "00:00:10.683 --> 00:00:11.563"
+ },
+ {
+ "self_ref": "#/texts/10",
+ "parent": {
+ "$ref": "#/groups/4"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Good.",
+ "text": "Good.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/11",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0"
+ },
+ {
+ "self_ref": "#/texts/12",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:13.363 --> 00:00:13.803",
+ "text": "00:00:13.363 --> 00:00:13.803"
+ },
+ {
+ "self_ref": "#/texts/13",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/14",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Yeah.",
+ "text": "Yeah.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/15",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0"
+ },
+ {
+ "self_ref": "#/texts/16",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:49.603 --> 00:00:53.363",
+ "text": "00:00:49.603 --> 00:00:53.363"
+ },
+ {
+ "self_ref": "#/texts/17",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker B: ",
+ "text": "Speaker B: "
+ },
+ {
+ "self_ref": "#/texts/18",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "I was also thinking.",
+ "text": "I was also thinking.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/19",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0"
+ },
+ {
+ "self_ref": "#/texts/20",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:00:54.963 --> 00:01:02.072",
+ "text": "00:00:54.963 --> 00:01:02.072"
+ },
+ {
+ "self_ref": "#/texts/21",
+ "parent": {
+ "$ref": "#/groups/10"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker B: ",
+ "text": "Speaker B: "
+ },
+ {
+ "self_ref": "#/texts/22",
+ "parent": {
+ "$ref": "#/groups/10"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Would be maybe good to create items,",
+ "text": "Would be maybe good to create items,",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/23",
+ "parent": {
+ "$ref": "#/groups/11"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1"
+ },
+ {
+ "self_ref": "#/texts/24",
+ "parent": {
+ "$ref": "#/groups/11"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:02.072 --> 00:01:06.811",
+ "text": "00:01:02.072 --> 00:01:06.811"
+ },
+ {
+ "self_ref": "#/texts/25",
+ "parent": {
+ "$ref": "#/groups/12"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker B: ",
+ "text": "Speaker B: "
+ },
+ {
+ "self_ref": "#/texts/26",
+ "parent": {
+ "$ref": "#/groups/12"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "some metadata, some options that can be specific.",
+ "text": "some metadata, some options that can be specific.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/27",
+ "parent": {
+ "$ref": "#/groups/13"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0"
+ },
+ {
+ "self_ref": "#/texts/28",
+ "parent": {
+ "$ref": "#/groups/13"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:10.243 --> 00:01:13.014",
+ "text": "00:01:10.243 --> 00:01:13.014"
+ },
+ {
+ "self_ref": "#/texts/29",
+ "parent": {
+ "$ref": "#/groups/14"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/30",
+ "parent": {
+ "$ref": "#/groups/14"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Yeah, I mean I think you went even more than",
+ "text": "Yeah, I mean I think you went even more than",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/31",
+ "parent": {
+ "$ref": "#/groups/15"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0"
+ },
+ {
+ "self_ref": "#/texts/32",
+ "parent": {
+ "$ref": "#/groups/15"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:10.563 --> 00:01:12.643",
+ "text": "00:01:10.563 --> 00:01:12.643"
+ },
+ {
+ "self_ref": "#/texts/33",
+ "parent": {
+ "$ref": "#/groups/16"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker B: ",
+ "text": "Speaker B: "
+ },
+ {
+ "self_ref": "#/texts/34",
+ "parent": {
+ "$ref": "#/groups/16"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "But we preserved the atoms.",
+ "text": "But we preserved the atoms.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/35",
+ "parent": {
+ "$ref": "#/groups/17"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1"
+ },
+ {
+ "self_ref": "#/texts/36",
+ "parent": {
+ "$ref": "#/groups/17"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:13.014 --> 00:01:15.907",
+ "text": "00:01:13.014 --> 00:01:15.907"
+ },
+ {
+ "self_ref": "#/texts/37",
+ "parent": {
+ "$ref": "#/groups/18"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/38",
+ "parent": {
+ "$ref": "#/groups/18"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "than me. I just opened the format.",
+ "text": "than me. I just opened the format.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/39",
+ "parent": {
+ "$ref": "#/groups/19"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1"
+ },
+ {
+ "self_ref": "#/texts/40",
+ "parent": {
+ "$ref": "#/groups/19"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:50.222 --> 00:01:51.643",
+ "text": "00:01:50.222 --> 00:01:51.643"
+ },
+ {
+ "self_ref": "#/texts/41",
+ "parent": {
+ "$ref": "#/groups/20"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/42",
+ "parent": {
+ "$ref": "#/groups/20"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "give it a try, yeah.",
+ "text": "give it a try, yeah.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/43",
+ "parent": {
+ "$ref": "#/groups/21"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0"
+ },
+ {
+ "self_ref": "#/texts/44",
+ "parent": {
+ "$ref": "#/groups/21"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:52.043 --> 00:01:55.043",
+ "text": "00:01:52.043 --> 00:01:55.043"
+ },
+ {
+ "self_ref": "#/texts/45",
+ "parent": {
+ "$ref": "#/groups/22"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker B: ",
+ "text": "Speaker B: "
+ },
+ {
+ "self_ref": "#/texts/46",
+ "parent": {
+ "$ref": "#/groups/22"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Okay, talk to you later.",
+ "text": "Okay, talk to you later.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/47",
+ "parent": {
+ "$ref": "#/groups/23"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+ "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0"
+ },
+ {
+ "self_ref": "#/texts/48",
+ "parent": {
+ "$ref": "#/groups/23"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "00:01:54.603 --> 00:01:55.283",
+ "text": "00:01:54.603 --> 00:01:55.283"
+ },
+ {
+ "self_ref": "#/texts/49",
+ "parent": {
+ "$ref": "#/groups/24"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "Speaker A: ",
+ "text": "Speaker A: "
+ },
+ {
+ "self_ref": "#/texts/50",
+ "parent": {
+ "$ref": "#/groups/24"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "See you.",
+ "text": "See you.",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ }
+ ],
+ "pictures": [],
+ "tables": [],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
new file mode 100644
index 00000000..859a6dde
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -0,0 +1,77 @@
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+
+00:00:04.963 --> 00:00:08.571
+
+Speaker A: OK, I think now we should be recording
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+
+00:00:08.571 --> 00:00:09.403
+
+Speaker A: properly.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+
+00:00:10.683 --> 00:00:11.563
+
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+
+00:00:13.363 --> 00:00:13.803
+
+Speaker A: Yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+
+00:00:49.603 --> 00:00:53.363
+
+Speaker B: I was also thinking.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+
+00:00:54.963 --> 00:01:02.072
+
+Speaker B: Would be maybe good to create items,
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+
+00:01:02.072 --> 00:01:06.811
+
+Speaker B: some metadata, some options that can be specific.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+
+00:01:10.243 --> 00:01:13.014
+
+Speaker A: Yeah, I mean I think you went even more than
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+
+00:01:10.563 --> 00:01:12.643
+
+Speaker B: But we preserved the atoms.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+
+00:01:13.014 --> 00:01:15.907
+
+Speaker A: than me. I just opened the format.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+
+00:01:50.222 --> 00:01:51.643
+
+Speaker A: give it a try, yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+
+00:01:52.043 --> 00:01:55.043
+
+Speaker B: Okay, talk to you later.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+
+00:01:54.603 --> 00:01:55.283
+
+Speaker A: See you.
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_01.vtt b/tests/data/webvtt/webvtt_example_01.vtt
new file mode 100644
index 00000000..333ca4a8
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+We are in New York City
+
+00:13.000 --> 00:16.000
+We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+at the AMNH.
+
+00:24.000 --> 00:26.000
+Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+Laughs
+
+00:35.500 --> 00:38.000
+You know I’m so excited my glasses are falling off here.
diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt
new file mode 100644
index 00000000..1152a1e8
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+No way!
+
+00:04.000 --> 00:06.000
+Hee! laughter
+
+00:06.000 --> 00:08.000
+That’s awesome!
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_03.vtt b/tests/data/webvtt/webvtt_example_03.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+OK,
+I think now we should be recording
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+properly.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+Yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+I was also thinking.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+Would be maybe good to create items,
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+some metadata,
+some options that can be specific.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+Yeah,
+I mean I think you went even more than
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+But we preserved the atoms.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+than me.
+I just opened the format.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+give it a try, yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+Okay, talk to you later.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+See you.
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
new file mode 100644
index 00000000..a910671b
--- /dev/null
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,232 @@
+# Assisted by watsonx Code Assistant
+
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import DoclingDocument
+from pydantic import ValidationError
+
+from docling.backend.webvtt_backend import (
+ _WebVTTCueItalicSpan,
+ _WebVTTCueTextSpan,
+ _WebVTTCueTimings,
+ _WebVTTCueVoiceSpan,
+ _WebVTTFile,
+ _WebVTTTimestamp,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+from .test_data_gen_flag import GEN_TEST_DATA
+from .verify_utils import verify_document, verify_export
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Exercise the WebVTT timestamp, cue timing and cue span models."""
+    timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    expected_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for raw, seconds in zip(timestamps, expected_seconds):
+        parsed = _WebVTTTimestamp(raw=raw)
+        assert parsed.seconds == seconds
+
+    # Invalid WebVTT timestamps must fail validation.
+    bad_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for raw in bad_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=raw)
+
+    # The timestamp __str__ method.
+    parsed = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(parsed) == "00:01:02.345"
+
+    # Valid cue timings.
+    begin = _WebVTTTimestamp(raw="00:10.005")
+    finish = _WebVTTTimestamp(raw="00:14.007")
+    timings = _WebVTTCueTimings(start=begin, end=finish)
+    assert timings.start == begin
+    assert timings.end == finish
+    assert str(timings) == "00:10.005 --> 00:14.007"
+
+    # Invalid cue timings: the end timestamp lies before the start.
+    begin = _WebVTTTimestamp(raw="00:10.700")
+    finish = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as caught:
+        _WebVTTCueTimings(start=begin, end=finish)
+    assert "End timestamp must be greater than start timestamp" in str(caught.value)
+
+    # Invalid cue timings: the end timestamp is missing.
+    begin = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as caught:
+        _WebVTTCueTimings(start=begin)
+    assert "Field required" in str(caught.value)
+
+    # Invalid cue timings: the start timestamp is missing.
+    finish = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as caught:
+        _WebVTTCueTimings(end=finish)
+    assert "Field required" in str(caught.value)
+
+    # Cue text span with valid text.
+    plain = "This is a valid cue text span."
+    text_span = _WebVTTCueTextSpan(text=plain)
+    assert text_span.text == plain
+    assert str(text_span) == plain
+
+    # Cue text span containing a newline character.
+    rejected = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=rejected)
+
+    # Cue text span containing an ampersand.
+    rejected = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=rejected)
+
+    # Cue text span containing a less-than sign.
+    rejected = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=rejected)
+
+    # Cue text span with empty text.
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text="")
+
+    # Voice span annotation validation.
+    good_annotation = "valid-annotation"
+    bad_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=bad_annotation)
+    assert _WebVTTCueVoiceSpan(annotation=good_annotation)
+
+    # Voice span classes validation.
+    speaker = "speaker name"
+    good_classes = ["class1", "class2"]
+    bad_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=speaker, classes=bad_classes)
+    assert _WebVTTCueVoiceSpan(annotation=speaker, classes=good_classes)
+
+    # Voice span components validation.
+    speaker = "speaker name"
+    good_components = [_WebVTTCueTextSpan(text="random text")]
+    bad_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=speaker, components=bad_components)
+    assert _WebVTTCueVoiceSpan(annotation=speaker, components=good_components)
+
+    # Valid cue voice spans and their string form.
+    voice_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+
+    rendered = "random text"
+    assert str(voice_span) == rendered
+
+    voice_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    rendered = "random text"
+    assert str(voice_span) == rendered
+
+
+def test_webvtt_file():
+    """Parse the three WebVTT fixtures and verify their cue blocks and payloads."""
+    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    # The header and NOTE block are written by hand; only cue blocks are rendered.
+    header = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    assert content == header + "\n\n".join(str(block) for block in vtt.cue_blocks)
+
+    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    # Only this block is inspected here; its payload is checked as a text span.
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."
+
+
+def test_e2e_vtt_conversions():
+    """Convert every WebVTT fixture and compare the exports with the ground truth."""
+    data_dir = Path("./tests/data/webvtt/")
+    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
+
+    for vtt_file in sorted(data_dir.rglob("*.vtt")):
+        # Shared ground-truth prefix; each export appends its own extension.
+        gt_dir = vtt_file.parent.parent / "groundtruth" / "docling_v2"
+        gt_prefix = str(gt_dir / vtt_file.name)
+
+        result: ConversionResult = converter.convert(vtt_file)
+        document: DoclingDocument = result.document
+
+        markdown: str = document.export_to_markdown(escape_html=False)
+        md_ok = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
+        assert md_ok, "export to md"
+
+        indented: str = document._export_to_indented_text(
+            max_text_len=70, explicit_tables=False
+        )
+        itxt_ok = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
+        assert itxt_ok, "export to indented-text"
+
+        # Last, compare the whole document with its JSON ground truth.
+        assert verify_document(document, gt_prefix + ".json", GENERATE)
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
index 29f1dafe..4b7ce469 100644
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) is None
+ # Valid WebVTT
+ buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
+ stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
+ assert dci._guess_format(stream) == InputFormat.VTT
+
# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
diff --git a/uv.lock b/uv.lock
index d265f426..c7f3721a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1154,7 +1154,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" },
- { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
+ { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
{ name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
[[package]]
name = "docling-core"
-version = "2.48.1"
+version = "2.48.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
+ { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
]
[package.optional-dependencies]
@@ -4936,6 +4936,9 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
{ url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
{ url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
+ { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
{ url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
{ url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
{ url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -4957,6 +4960,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
{ url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
{ url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
+ { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
{ url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
{ url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
{ url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },