feat: add a backend parser for WebVTT files (#2288)

* feat: add a backend parser for WebVTT files

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: update README with VTT support

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: add description to supported formats

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade docling-core to unescape WebVTT in markdown

Pin the new release of docling-core 2.48.2.
Do not escape HTML reserved characters when exporting WebVTT documents to markdown.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test: add missing copyright notice

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-09-22 15:24:34 +02:00
committed by GitHub
parent b5628f1227
commit 46efaaefee
23 changed files with 3969 additions and 34 deletions

View File

@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Installation

View File

@@ -0,0 +1,572 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Annotated, ClassVar, Literal, Optional, Union, cast
from docling_core.types.doc import (
ContentLayer,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
Formatting,
GroupLabel,
NodeItem,
)
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic.types import StringConstraints
from typing_extensions import Self, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
# Module-level logger named after this module; used to report cue blocks that
# fail to parse and to trace conversion.
_log = logging.getLogger(__name__)
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The accepted textual form is ``[hh:]mm:ss.ttt``: optional hours (two or more
    digits), two-digit minutes and seconds in the range 00-59, and exactly three
    digits of milliseconds.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )
    # Components extracted from ``raw`` by ``validate_raw``.
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` into its numeric components.

        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # fullmatch (rather than match) is required because ``$`` alone also
        # matches just before a trailing newline, which would let a raw value
        # such as "00:11.000\n" through.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        self._hours = int(m.group(1)) if m.group(1) else 0
        # The pattern restricts minutes and seconds to [0-5]\d, so no further
        # range validation is needed once the match succeeded.
        self._minutes = int(m.group(2))
        self._seconds = int(m.group(3))
        self._millis = int(m.group(4))

        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )

    @override
    def __str__(self) -> str:
        return self.raw
# Constrained string type for cue identifiers: the pattern requires one or more
# characters other than LF/CR and rejects any value containing the "-->"
# substring (which is reserved for the cue timings line).
_WebVTTCueIdentifier = Annotated[
str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
]
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings.

    Holds the start and end timestamps of a cue and enforces that the end lies
    strictly after the start.
    """

    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Reject timings whose end is not strictly greater than their start."""
        both_set = bool(self.start) and bool(self.end)
        if both_set and self.end.seconds <= self.start.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        # Rendered exactly as it appears on a cue timings line.
        return " --> ".join((str(self.start), str(self.end)))
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span.

    A text span is a leaf component of the cue payload holding plain text.
    """

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Ensure the text is non-empty and free of line breaks, '&' and '<'."""
        for forbidden in ("\n", "\r", "&", "<"):
            if forbidden in value:
                raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value

    @override
    def __str__(self):
        return self.text
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    A voice span is written ``<v.class1.class2 Annotation>...</v>``: the
    annotation names the voice, the optional classes qualify it, and the
    components are the nested cue text.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the "
                "voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Ensure the annotation is non-empty and has no line breaks, '&' or '>'.

        Raises:
            ValueError: If the annotation is empty or contains a forbidden
                character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # The message names the annotation, not a text span, so that the
            # failing field can be identified from the error.
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Ensure every class is non-empty and free of separators and markup.

        Raises:
            ValueError: If a class is empty or contains whitespace, '&', '<',
                '>' or '.'.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        # Classes are appended to the tag name, dot-separated: <v.first.loud Name>
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span (``<c>...</c>``)."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Serialize the nested components and wrap them in the class tag.
        body = "".join(map(str, self.components))
        return "<c>" + body + "</c>"
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italics span (``<i>...</i>``)."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Serialize the nested components and wrap them in the italics tag.
        body = "".join(map(str, self.components))
        return "<i>" + body + "</i>"
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span (``<b>...</b>``)."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Serialize the nested components and wrap them in the bold tag.
        body = "".join(map(str, self.components))
        return "<b>" + body + "</b>"
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span (``<u>...</u>``)."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Serialize the nested components and wrap them in the underline tag.
        body = "".join(map(str, self.components))
        return "<u>" + body + "</u>"
# Tagged union of every supported cue payload component. The "span_type" field
# of each member ("text", "c", "i", "b", "u", "v") is the discriminator used to
# select the concrete model during validation.
_WebVTTCueComponent = Annotated[
Union[
_WebVTTCueTextSpan,
_WebVTTCueClassSpan,
_WebVTTCueItalicSpan,
_WebVTTCueBoldSpan,
_WebVTTCueUnderlineSpan,
_WebVTTCueVoiceSpan,
],
Field(discriminator="span_type", description="The WebVTT cue component"),
]
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Matches any supported start or end tag inside the cue payload.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    # Extracts the class string and the annotation from a voice start tag.
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized form contains the '-->' substring."""
        for voice in payload:
            if "-->" in str(voice):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Build a cue block from its raw text.

        The first line is taken as the cue identifier when it does not contain
        '-->' and more lines follow; the next line holds the timings and the
        remaining lines, joined with spaces, form the payload.

        Args:
            raw: The text of a single cue block.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, lacks a timings line, has
                invalid timings or payload, or contains a closing tag with no
                matching opening tag.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )
        cue_text = " ".join(cue_lines).strip()
        if cue_text.startswith("<v") and "</v>" not in cue_text:
            # adding close tag for cue voice spans without end tag
            cue_text += "</v>"

        # Span model to build for each simple closing tag.
        closing_spans = {
            "</i>": _WebVTTCueItalicSpan,
            "</b>": _WebVTTCueBoldSpan,
            "</u>": _WebVTTCueUnderlineSpan,
            "</c>": _WebVTTCueClassSpan,
        }
        # stack[0] collects the top-level payload; every open tag pushes a new
        # list that collects the children of that tag until it is closed.
        stack: list[list[_WebVTTCueComponent]] = [[]]
        tag_stack: list[Union[str, tuple]] = []

        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            tag = match.group(0)

            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                tag_stack.append(tag[1:2])
                stack.append([])
            elif tag in closing_spans:
                if len(stack) < 2:
                    # Without this guard the root list would be popped and the
                    # next access would raise IndexError, which callers that
                    # only handle ValueError would not recover from.
                    raise ValueError(
                        f"Cue payload has a closing tag without opening tag: {tag}"
                    )
                children = stack.pop()
                stack[-1].append(closing_spans[tag](components=children))
                tag_stack.pop()
            elif tag.startswith("<v"):
                # Keep the raw start tag; it is parsed when the span is closed.
                tag_stack.append(("v", tag))
                stack.append([])
            elif tag.startswith("</v"):
                if len(stack) < 2:
                    raise ValueError(
                        f"Cue payload has a closing tag without opening tag: {tag}"
                    )
                children = stack.pop()
                if (
                    tag_stack
                    and isinstance(tag_stack[-1], tuple)
                    and tag_stack[-1][0] == "v"
                ):
                    _, voice = cast(tuple, tag_stack.pop())
                    voice_match = cls._pattern_voice_tag.match(voice)
                    if voice_match:
                        class_string = voice_match.group("class")
                        annotation = voice_match.group("annotation")
                        if annotation:
                            classes: list[str] = []
                            if class_string:
                                classes = [c for c in class_string.split(".") if c]
                            stack[-1].append(
                                _WebVTTCueVoiceSpan(
                                    annotation=annotation.strip(),
                                    classes=classes,
                                    components=children,
                                )
                            )

            pos = match.end()

        # Text after the last tag (or the whole payload when it has no tags).
        if pos < len(cue_text):
            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        return cls(
            identifier=identifier,
            timings=timings,
            payload=stack[0],
        )

    @override
    def __str__(self):
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""

    cue_blocks: list[_WebVTTCueBlock]

    # Recognizes the first line of a block that is not a cue: a comment block
    # ("NOTE" followed by a space, a tab or a line break) or a STYLE/REGION
    # block (keyword alone on its line, optionally followed by spaces or tabs).
    _pattern_non_cue: ClassVar[re.Pattern] = re.compile(
        r"NOTE(?:[ \t\n]|$)|(?:STYLE|REGION)[ \t]*(?:\n|$)"
    )

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check whether the content starts with a WebVTT signature.

        The signature is the string "WEBVTT", optionally preceded by a byte
        order mark, and followed by either the end of the content, a space, a
        tab or a line terminator (LF or CR).

        Args:
            content: The text content of the file.

        Returns:
            True if the content starts with a valid signature.
        """
        if not content:
            return False
        # A leading BOM survives a plain UTF-8 decode; ignore it.
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        # "\r" is accepted so that content with CRLF or CR line endings that
        # has not been newline-normalized is still recognized.
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into cue blocks.

        Comment (NOTE), STYLE and REGION blocks are skipped. Blocks that fail
        to parse as a cue are logged as warnings and skipped.

        Args:
            raw: The text content of the file.

        Returns:
            The parsed file with its cue blocks in order of appearance.

        Raises:
            ValueError: If the content does not start with a WebVTT signature.
        """
        raw = raw.removeprefix("\ufeff")

        # Normalize newlines to LF
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        lines = raw.split("\n", 1)
        body = lines[1] if len(lines) > 1 else ""

        # Split into blocks (separated by one or more blank lines)
        raw_blocks = re.split(r"\n\s*\n", body.strip())

        cues: list[_WebVTTCueBlock] = []
        for block in raw_blocks:
            if not block.strip():
                # e.g. a file that only contains the signature line
                continue
            # Decide per block, on its first line only, so that cue text or
            # identifiers that merely start with these words are preserved and
            # whole STYLE/REGION blocks (not just their header) are dropped.
            if cls._pattern_non_cue.match(block):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")

        return cls(cue_blocks=cues)

    def __iter__(self):
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        return self.cue_blocks[idx]

    def __len__(self):
        return len(self.cue_blocks)
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1

    Each cue block becomes a section group holding text items for the optional
    identifier, the timings and the payload. Groups are appended to the document
    body in the order the cue blocks appear in the file.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read the whole input into ``self.content`` as text.

        Raises:
            RuntimeError: If the input cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # "utf-8-sig" drops an optional leading byte order mark.
                text = self.path_or_stream.getvalue().decode("utf-8-sig")
                # Text-mode open() below translates CRLF/CR to LF; do the same
                # for streams so that both input kinds yield identical content
                # and the signature check accepts CRLF-terminated streams.
                self.content = text.replace("\r\n", "\n").replace("\r", "\n")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8-sig") as f:
                    self.content = f.read()
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return whether the content starts with a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False

    @override
    def unload(self):
        """Close the underlying stream, if any, and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.

        Italic, bold and underline spans set the matching formatting flag. For
        any span other than a plain text span, only the text of its direct text
        children is used; nothing is added when the resulting text is blank.

        TODO: address nesting
        """
        formatting = Formatting()
        text = ""
        if isinstance(item, _WebVTTCueItalicSpan):
            formatting.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            formatting.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            formatting.underline = True

        if isinstance(item, _WebVTTCueTextSpan):
            text = item.text
        else:
            # TODO: address nesting
            text = "".join(
                [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
            )
        if text := text.strip():
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=text,
                parent=parent,
                content_layer=ContentLayer.BODY,
                formatting=formatting,
            )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content does not start with a WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            # One section group per cue block.
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if isinstance(cue_span, _WebVTTCueVoiceSpan):
                    # A voice span gets its own inline group: a leading text
                    # item "Name (class1, class2): " followed by its content.
                    voice_group = doc.add_group(
                        label=GroupLabel.INLINE,
                        name="WebVTT cue voice span",
                        parent=block_group,
                        content_layer=ContentLayer.BODY,
                    )
                    voice = cue_span.annotation
                    if classes := cue_span.classes:
                        voice += f" ({', '.join(classes)})"
                    voice += ": "
                    doc.add_text(
                        label=DocItemLabel.TEXT,
                        text=voice,
                        parent=voice_group,
                        content_layer=ContentLayer.BODY,
                    )
                    for item in cue_span.components:
                        WebVTTDocumentBackend._add_text_from_component(
                            doc, item, voice_group
                        )
                else:
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, cue_span, block_group
                    )

        return doc

View File

@@ -1,7 +1,6 @@
import math
from collections import defaultdict
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
from typing import TYPE_CHECKING, Optional, Type, Union
import numpy as np
from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
)
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import (
DocumentStream,
)
from docling_core.types.io import DocumentStream
# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
METS_GBS = "mets_gbs"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
VTT = "vtt"
class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
DOCTAGS = "doctags"
FormatToExtensions: Dict[InputFormat, List[str]] = {
FormatToExtensions: dict[InputFormat, list[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["tar.gz"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
InputFormat.VTT: ["vtt"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
FormatToMimeType: dict[InputFormat, list[str]] = {
InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["application/mets+xml"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
InputFormat.VTT: ["text/vtt"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[TextCell] = []
children: List["Cluster"] = [] # Add child cluster support
cells: list[TextCell] = []
children: list["Cluster"] = [] # Add child cluster support
@field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
clusters: list[Cluster] = []
class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
class Table(BasePageElement):
otsl_seq: List[str]
otsl_seq: list[str]
num_rows: int = 0
num_cols: int = 0
table_cells: List[TableCell]
table_cells: list[TableCell]
class TableStructurePrediction(BaseModel):
table_map: Dict[int, Table] = {}
table_map: dict[int, Table] = {}
class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
class FigureElement(BasePageElement):
annotations: List[PictureDataType] = []
annotations: list[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
class FigureClassificationPrediction(BaseModel):
figure_count: int = 0
figure_map: Dict[int, FigureElement] = {}
figure_map: dict[int, FigureElement] = {}
class EquationPrediction(BaseModel):
equation_count: int = 0
equation_map: Dict[int, TextElement] = {}
equation_map: dict[int, TextElement] = {}
class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
class AssembledUnit(BaseModel):
elements: List[PageElement] = []
body: List[PageElement] = []
headers: List[PageElement] = []
elements: list[PageElement] = []
body: list[PageElement] = []
headers: list[PageElement] = []
class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[
_image_cache: dict[
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
def cells(self) -> list[TextCell]:
"""Return text cells as a read-only view of parsed_page.textline_cells."""
if self.parsed_page is not None:
return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
id: str
model: Optional[str] = None # returned by openai
choices: List[OpenAiResponseChoice]
choices: list[OpenAiResponseChoice]
created: int
usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
class ConfidenceReport(PageConfidenceScores):
pages: Dict[int, PageConfidenceScores] = Field(
pages: dict[int, PageConfidenceScores] = Field(
default_factory=lambda: defaultdict(PageConfidenceScores)
)

View File

@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.PPTX][0]
elif ext in FormatToExtensions[InputFormat.XLSX]:
mime = FormatToMimeType[InputFormat.XLSX][0]
elif ext in FormatToExtensions[InputFormat.VTT]:
mime = FormatToMimeType[InputFormat.VTT][0]
return mime

View File

@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.noop_backend import NoOpBackend
from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
InputFormat.VTT: FormatOption(
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options

4
docs/index.md vendored
View File

@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Get started

View File

@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
| PDF | |
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
| Markdown | |
| AsciiDoc | |
| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
| HTML, XHTML | |
| CSV | |
| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
| WebVTT | Web Video Text Tracks format for displaying timed text |
Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
| Markdown | |
| JSON | Lossless serialization of Docling Document |
| Text | Plain text, i.e. without Markdown markers |
| Doctags | |
| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |

View File

@@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.48.0,<3.0.0)',
'docling-core[chunking] (>=2.48.2,<3.0.0)',
'docling-parse (>=4.4.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',

View File

@@ -0,0 +1,66 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:11.000 --> 00:13.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Roger Bingham:
item-5 at level 3: text: We are in New York City
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:13.000 --> 00:16.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Roger Bingham:
item-10 at level 3: text: Were actually at the Lucern Hotel, just down the street
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:16.000 --> 00:18.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Roger Bingham:
item-15 at level 3: text: from the American Museum of Natural History
item-16 at level 1: section: group WebVTT cue block
item-17 at level 2: text: 00:18.000 --> 00:20.000
item-18 at level 2: inline: group WebVTT cue voice span
item-19 at level 3: text: Roger Bingham:
item-20 at level 3: text: And with me is Neil deGrasse Tyson
item-21 at level 1: section: group WebVTT cue block
item-22 at level 2: text: 00:20.000 --> 00:22.000
item-23 at level 2: inline: group WebVTT cue voice span
item-24 at level 3: text: Roger Bingham:
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
item-26 at level 1: section: group WebVTT cue block
item-27 at level 2: text: 00:22.000 --> 00:24.000
item-28 at level 2: inline: group WebVTT cue voice span
item-29 at level 3: text: Roger Bingham:
item-30 at level 3: text: at the AMNH.
item-31 at level 1: section: group WebVTT cue block
item-32 at level 2: text: 00:24.000 --> 00:26.000
item-33 at level 2: inline: group WebVTT cue voice span
item-34 at level 3: text: Roger Bingham:
item-35 at level 3: text: Thank you for walking down here.
item-36 at level 1: section: group WebVTT cue block
item-37 at level 2: text: 00:27.000 --> 00:30.000
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Roger Bingham:
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 00:30.000 --> 00:31.500
item-43 at level 2: inline: group WebVTT cue voice span
item-44 at level 3: text: Roger Bingham:
item-45 at level 3: text: When we e-mailed—
item-46 at level 1: section: group WebVTT cue block
item-47 at level 2: text: 00:30.500 --> 00:32.500
item-48 at level 2: inline: group WebVTT cue voice span
item-49 at level 3: text: Neil deGrasse Tyson:
item-50 at level 3: text: Didnt we talk about enough in that conversation?
item-51 at level 1: section: group WebVTT cue block
item-52 at level 2: text: 00:32.000 --> 00:35.500
item-53 at level 2: inline: group WebVTT cue voice span
item-54 at level 3: text: Roger Bingham:
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
item-56 at level 1: section: group WebVTT cue block
item-57 at level 2: text: 00:32.500 --> 00:33.500
item-58 at level 2: inline: group WebVTT cue voice span
item-59 at level 3: text: Neil deGrasse Tyson:
item-60 at level 3: text: Laughs
item-61 at level 1: section: group WebVTT cue block
item-62 at level 2: text: 00:35.500 --> 00:38.000
item-63 at level 2: inline: group WebVTT cue voice span
item-64 at level 3: text: Roger Bingham:
item-65 at level 3: text: You know Im so excited my glasses are falling off here.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
00:11.000 --> 00:13.000
Roger Bingham: We are in New York City
00:13.000 --> 00:16.000
Roger Bingham: Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
Roger Bingham: from the American Museum of Natural History
00:18.000 --> 00:20.000
Roger Bingham: And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
Roger Bingham: at the AMNH.
00:24.000 --> 00:26.000
Roger Bingham: Thank you for walking down here.
00:27.000 --> 00:30.000
Roger Bingham: And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500
Roger Bingham: When we e-mailed—
00:30.500 --> 00:32.500
Neil deGrasse Tyson: Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500
Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500
Neil deGrasse Tyson: *Laughs*
00:35.500 --> 00:38.000
Roger Bingham: You know Im so excited my glasses are falling off here.

View File

@@ -0,0 +1,22 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:00.000 --> 00:02.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Esme (first, loud):
item-5 at level 3: text: Its a blue apple tree!
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:02.000 --> 00:04.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Mary:
item-10 at level 3: text: No way!
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:04.000 --> 00:06.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Esme:
item-15 at level 3: text: Hee!
item-16 at level 2: text: laughter
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 00:06.000 --> 00:08.000
item-19 at level 2: inline: group WebVTT cue voice span
item-20 at level 3: text: Mary (loud):
item-21 at level 3: text: Thats awesome!

View File

@@ -0,0 +1,376 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"name": "webvtt_example_02",
"origin": {
"mimetype": "text/vtt",
"binary_hash": 12867774546881601731,
"filename": "webvtt_example_02.vtt"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/4"
},
{
"$ref": "#/groups/6"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
},
{
"$ref": "#/groups/7"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:00.000 --> 00:02.000",
"text": "00:00.000 --> 00:02.000"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme (first, loud): ",
"text": "Esme (first, loud): "
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Its a blue apple tree!",
"text": "Its a blue apple tree!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:02.000 --> 00:04.000",
"text": "00:02.000 --> 00:04.000"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary: ",
"text": "Mary: "
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "No way!",
"text": "No way!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:04.000 --> 00:06.000",
"text": "00:04.000 --> 00:06.000"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme: ",
"text": "Esme: "
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Hee!",
"text": "Hee!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "laughter",
"text": "laughter",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:06.000 --> 00:08.000",
"text": "00:06.000 --> 00:08.000"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary (loud): ",
"text": "Mary (loud): "
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Thats awesome!",
"text": "Thats awesome!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,17 @@
00:00.000 --> 00:02.000
Esme (first, loud): Its a blue apple tree!
00:02.000 --> 00:04.000
Mary: No way!
00:04.000 --> 00:06.000
Esme: Hee!
*laughter*
00:06.000 --> 00:08.000
Mary (loud): Thats awesome!

View File

@@ -0,0 +1,77 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
item-4 at level 2: inline: group WebVTT cue voice span
item-5 at level 3: text: Speaker A:
item-6 at level 3: text: OK, I think now we should be recording
item-7 at level 1: section: group WebVTT cue block
item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
item-10 at level 2: inline: group WebVTT cue voice span
item-11 at level 3: text: Speaker A:
item-12 at level 3: text: properly.
item-13 at level 1: section: group WebVTT cue block
item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
item-16 at level 2: text: Good.
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
item-20 at level 2: inline: group WebVTT cue voice span
item-21 at level 3: text: Speaker A:
item-22 at level 3: text: Yeah.
item-23 at level 1: section: group WebVTT cue block
item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
item-26 at level 2: inline: group WebVTT cue voice span
item-27 at level 3: text: Speaker B:
item-28 at level 3: text: I was also thinking.
item-29 at level 1: section: group WebVTT cue block
item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
item-32 at level 2: inline: group WebVTT cue voice span
item-33 at level 3: text: Speaker B:
item-34 at level 3: text: Would be maybe good to create items,
item-35 at level 1: section: group WebVTT cue block
item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Speaker B:
item-40 at level 3: text: some metadata, some options that can be specific.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
item-44 at level 2: inline: group WebVTT cue voice span
item-45 at level 3: text: Speaker A:
item-46 at level 3: text: Yeah, I mean I think you went even more than
item-47 at level 1: section: group WebVTT cue block
item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
item-50 at level 2: inline: group WebVTT cue voice span
item-51 at level 3: text: Speaker B:
item-52 at level 3: text: But we preserved the atoms.
item-53 at level 1: section: group WebVTT cue block
item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
item-56 at level 2: inline: group WebVTT cue voice span
item-57 at level 3: text: Speaker A:
item-58 at level 3: text: than me. I just opened the format.
item-59 at level 1: section: group WebVTT cue block
item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
item-62 at level 2: inline: group WebVTT cue voice span
item-63 at level 3: text: Speaker A:
item-64 at level 3: text: give it a try, yeah.
item-65 at level 1: section: group WebVTT cue block
item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
item-68 at level 2: inline: group WebVTT cue voice span
item-69 at level 3: text: Speaker B:
item-70 at level 3: text: Okay, talk to you later.
item-71 at level 1: section: group WebVTT cue block
item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
item-74 at level 2: inline: group WebVTT cue voice span
item-75 at level 3: text: Speaker A:
item-76 at level 3: text: See you.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,77 @@
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
Speaker A: OK, I think now we should be recording
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
Speaker A: properly.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
Speaker A: Yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
Speaker B: I was also thinking.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
Speaker B: Would be maybe good to create items,
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
Speaker B: some metadata, some options that can be specific.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
Speaker A: Yeah, I mean I think you went even more than
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
Speaker B: But we preserved the atoms.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
Speaker A: than me. I just opened the format.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
Speaker A: give it a try, yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
Speaker B: Okay, talk to you later.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
Speaker A: See you.

42
tests/data/webvtt/webvtt_example_01.vtt vendored Normal file
View File

@@ -0,0 +1,42 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:11.000 --> 00:13.000
<v Roger Bingham>We are in New York City
00:13.000 --> 00:16.000
<v Roger Bingham>Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
<v Roger Bingham>from the American Museum of Natural History
00:18.000 --> 00:20.000
<v Roger Bingham>And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
<v Roger Bingham>at the AMNH.
00:24.000 --> 00:26.000
<v Roger Bingham>Thank you for walking down here.
00:27.000 --> 00:30.000
<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500 align:right size:50%
<v Roger Bingham>When we e-mailed—
00:30.500 --> 00:32.500 align:left size:50%
<v Neil deGrasse Tyson>Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500 align:right size:50%
<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500 align:left size:50%
<v Neil deGrasse Tyson><i>Laughs</i>
00:35.500 --> 00:38.000
<v Roger Bingham>You know Im so excited my glasses are falling off here.

15
tests/data/webvtt/webvtt_example_02.vtt vendored Normal file
View File

@@ -0,0 +1,15 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:00.000 --> 00:02.000
<v.first.loud Esme>Its a blue apple tree!
00:02.000 --> 00:04.000
<v Mary>No way!
00:04.000 --> 00:06.000
<v Esme>Hee!</v> <i>laughter</i>
00:06.000 --> 00:08.000
<v.loud Mary>Thats awesome!

57
tests/data/webvtt/webvtt_example_03.vtt vendored Normal file
View File

@@ -0,0 +1,57 @@
WEBVTT
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
<v Speaker A>OK,
I think now we should be recording</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
<v Speaker A>properly.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
<v Speaker A>Yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
<v Speaker B>I was also thinking.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
<v Speaker B>Would be maybe good to create items,</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
<v Speaker B>some metadata,
some options that can be specific.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
<v Speaker A>Yeah,
I mean I think you went even more than</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
<v Speaker B>But we preserved the atoms.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
<v Speaker A>than me.
I just opened the format.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
<v Speaker A>give it a try, yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
<v Speaker B>Okay, talk to you later.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
<v Speaker A>See you.</v>

232
tests/test_backend_vtt.py Normal file
View File

@@ -0,0 +1,232 @@
# Assisted by watsonx Code Assistant
from pathlib import Path
import pytest
from docling_core.types.doc import DoclingDocument
from pydantic import ValidationError
from docling.backend.webvtt_backend import (
_WebVTTCueItalicSpan,
_WebVTTCueTextSpan,
_WebVTTCueTimings,
_WebVTTCueVoiceSpan,
_WebVTTFile,
_WebVTTTimestamp,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
def test_vtt_cue_commponents():
    """Test the building blocks of a WebVTT cue.

    Exercises timestamps, cue timings, cue text spans and cue voice spans,
    checking both inputs that must be accepted and inputs that must raise
    ``ValidationError``.
    """
    # Valid timestamps, each paired with the total number of seconds it encodes.
    valid_timestamps = [
        ("00:01:02.345", 1 * 60 + 2.345),
        ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
        ("02:34.567", 2 * 60 + 34.567),
        ("00:00:00.000", 0.0),
    ]
    for ts, expected_seconds in valid_timestamps:
        model = _WebVTTTimestamp(raw=ts)
        # Compare with an absolute tolerance well below one millisecond so the
        # check does not hinge on the exact order of floating-point additions.
        assert model.seconds == pytest.approx(expected_seconds, abs=1e-6)

    # Invalid WebVTT timestamps.
    invalid_timestamps = [
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    ]
    for ts in invalid_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=ts)

    # The timestamp __str__ method returns the raw representation.
    model = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(model) == "00:01:02.345"

    # Valid cue timings.
    start = _WebVTTTimestamp(raw="00:10.005")
    end = _WebVTTTimestamp(raw="00:14.007")
    cue_timings = _WebVTTCueTimings(start=start, end=end)
    assert cue_timings.start == start
    assert cue_timings.end == end
    assert str(cue_timings) == "00:10.005 --> 00:14.007"

    # Invalid cue timings: end timestamp before start.
    start = _WebVTTTimestamp(raw="00:10.700")
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start, end=end)
    # The message is inspected after the context manager exits; inside it the
    # statement would never run because the constructor raises first.
    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)

    # Invalid cue timings: missing end.
    start = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start)
    assert "Field required" in str(excinfo.value)

    # Invalid cue timings: missing start.
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(end=end)
    assert "Field required" in str(excinfo.value)

    # Cue text span with valid text.
    valid_text = "This is a valid cue text span."
    span = _WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text
    assert str(span) == valid_text

    # Cue text span containing newline characters.
    invalid_text = "This cue text span\ncontains a newline."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # Cue text span containing an ampersand.
    invalid_text = "This cue text span contains &."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # Cue text span containing a less-than sign.
    invalid_text = "This cue text span contains <."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # Cue text span with empty text.
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text="")

    # Voice span: annotation validation.
    valid_annotation = "valid-annotation"
    invalid_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)

    # Voice span: classes validation.
    annotation = "speaker name"
    valid_classes = ["class1", "class2"]
    invalid_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)

    # Voice span: components validation.
    annotation = "speaker name"
    valid_components = [_WebVTTCueTextSpan(text="random text")]
    invalid_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)

    # Voice span serialization, with classes.
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v.loud.clear speaker>random text</v>"
    assert str(cue_span) == expected_str

    # Voice span serialization, without classes.
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v speaker>random text</v>"
    assert str(cue_span) == expected_str
def test_webvtt_file():
    """Test parsing of complete WebVTT sample files into cue blocks.

    Three sample files are parsed: one with nested voice/italic spans, one
    that is serialized back and compared with the file content, and one
    whose cue blocks carry identifiers.
    """
    data_dir = Path("./tests/data/webvtt")

    # Example 01: nested spans (a voice span wrapping an italic span).
    content = (data_dir / "webvtt_example_01.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"

    # Example 02: serializing the parsed cue blocks after the fixed header
    # must reproduce the file content.
    content = (data_dir / "webvtt_example_02.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse

    # Example 03: every cue block has an identifier.
    content = (data_dir / "webvtt_example_03.vtt").read_text(encoding="utf-8")
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
    # A cue without a voice tag. Only this block is inspected here; the
    # previous assertion on `cue_span` re-checked a leftover object from
    # example 01 and verified nothing about this file.
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
def test_e2e_vtt_conversions():
    """Convert every sample ``.vtt`` file and compare against ground truth.

    For each file the Markdown export, the indented-text export and the
    document itself are verified (or regenerated when ``GENERATE`` is set).
    """
    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
    samples = sorted(Path("./tests/data/webvtt/").rglob("*.vtt"))

    for sample in samples:
        # Ground-truth files share one prefix and differ only by suffix.
        gt_prefix = str(sample.parent.parent / "groundtruth" / "docling_v2" / sample.name)

        conv_result: ConversionResult = converter.convert(sample)
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown(escape_html=False)
        md_ok = verify_export(pred_md, gt_prefix + ".md", generate=GENERATE)
        assert md_ok, "export to md"

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_ok = verify_export(pred_itxt, gt_prefix + ".itxt", generate=GENERATE)
        assert itxt_ok, "export to indented-text"

        assert verify_document(doc, gt_prefix + ".json", GENERATE)

View File

@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) is None
# Valid WebVTT
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
assert dci._guess_format(stream) == InputFormat.VTT
# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))

13
uv.lock generated
View File

@@ -1154,7 +1154,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" },
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
{ name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
[[package]]
name = "docling-core"
version = "2.48.1"
version = "2.48.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
{ url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
]
[package.optional-dependencies]
@@ -4936,6 +4936,9 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
{ url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
{ url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
{ url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
{ url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
{ url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
{ url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
{ url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
{ url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -4957,6 +4960,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
{ url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
{ url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
{ url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
{ url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
{ url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
{ url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
{ url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },