diff --git a/README.md b/README.md index d3cd4935..a65803b3 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad ## Features -* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON @@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad * 📤 Structured [information extraction][extraction] \[🧪 beta\] * 📑 New layout model (**Heron**) by default, for faster PDF parsing * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications +* 💬 Parsing of Web Video Text Tracks (WebVTT) files ### Coming soon * 📝 Metadata extraction, including title, authors, references & language * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Complex chemistry understanding (Molecular structures) -* 📝 Parsing of Web Video Text Tracks (WebVTT) files ## Installation diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py new file mode 100644 index 00000000..2a7d02ce --- /dev/null +++ b/docling/backend/webvtt_backend.py @@ -0,0 +1,572 @@ +import logging +import re +from io import BytesIO +from pathlib import Path +from typing import Annotated, ClassVar, Literal, Optional, Union, cast + +from docling_core.types.doc import ( + ContentLayer, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + 
Formatting, + GroupLabel, + NodeItem, +) +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic.types import StringConstraints +from typing_extensions import Self, override + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +class _WebVTTTimestamp(BaseModel): + """Model representing a WebVTT timestamp. + + A WebVTT timestamp is always interpreted relative to the current playback position + of the media data that the WebVTT file is to be synchronized with. + """ + + model_config = ConfigDict(regex_engine="python-re") + + raw: Annotated[ + str, + Field( + description="A representation of the WebVTT Timestamp as a single string" + ), + ] + + _pattern: ClassVar[re.Pattern] = re.compile( + r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$" + ) + _hours: int + _minutes: int + _seconds: int + _millis: int + + @model_validator(mode="after") + def validate_raw(self) -> Self: + m = self._pattern.match(self.raw) + if not m: + raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}") + self._hours = int(m.group(1)) if m.group(1) else 0 + self._minutes = int(m.group(2)) + self._seconds = int(m.group(3)) + self._millis = int(m.group(4)) + + if self._minutes < 0 or self._minutes > 59: + raise ValueError("Minutes must be between 0 and 59") + if self._seconds < 0 or self._seconds > 59: + raise ValueError("Seconds must be between 0 and 59") + + return self + + @property + def seconds(self) -> float: + """A representation of the WebVTT Timestamp in seconds""" + return ( + self._hours * 3600 + + self._minutes * 60 + + self._seconds + + self._millis / 1000.0 + ) + + @override + def __str__(self) -> str: + return self.raw + + +_WebVTTCueIdentifier = Annotated[ + str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") +] + + +class 
_WebVTTCueTimings(BaseModel): + """Model representing WebVTT cue timings.""" + + start: Annotated[ + _WebVTTTimestamp, Field(description="Start time offset of the cue") + ] + end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")] + + @model_validator(mode="after") + def check_order(self) -> Self: + if self.start and self.end: + if self.end.seconds <= self.start.seconds: + raise ValueError("End timestamp must be greater than start timestamp") + return self + + @override + def __str__(self): + return f"{self.start} --> {self.end}" + + +class _WebVTTCueTextSpan(BaseModel): + """Model representing a WebVTT cue text span.""" + + text: str + span_type: Literal["text"] = "text" + + @field_validator("text", mode="after") + @classmethod + def validate_text(cls, value: str) -> str: + if any(ch in value for ch in {"\n", "\r", "&", "<"}): + raise ValueError("Cue text span contains invalid characters") + if len(value) == 0: + raise ValueError("Cue text span cannot be empty") + return value + + @override + def __str__(self): + return self.text + + +class _WebVTTCueVoiceSpan(BaseModel): + """Model representing a WebVTT cue voice span.""" + + annotation: Annotated[ + str, + Field( + description=( + "Cue span start tag annotation text representing the name of thevoice" + ) + ), + ] + classes: Annotated[ + list[str], + Field(description="List of classes representing the cue span's significance"), + ] = [] + components: Annotated[ + list["_WebVTTCueComponent"], + Field(description="The components representing the cue internal text"), + ] = [] + span_type: Literal["v"] = "v" + + @field_validator("annotation", mode="after") + @classmethod + def validate_annotation(cls, value: str) -> str: + if any(ch in value for ch in {"\n", "\r", "&", ">"}): + raise ValueError( + "Cue span start tag annotation contains invalid characters" + ) + if not value: + raise ValueError("Cue text span cannot be empty") + return value + + @field_validator("classes", mode="after") + 
@classmethod + def validate_classes(cls, value: list[str]) -> list[str]: + for item in value: + if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): + raise ValueError( + "A cue span start tag class contains invalid characters" + ) + if not item: + raise ValueError("Cue span start tag classes cannot be empty") + return value + + @override + def __str__(self): + tag = f"v.{'.'.join(self.classes)}" if self.classes else "v" + inner = "".join(str(span) for span in self.components) + return f"<{tag} {self.annotation}>{inner}" + + +class _WebVTTCueClassSpan(BaseModel): + span_type: Literal["c"] = "c" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueItalicSpan(BaseModel): + span_type: Literal["i"] = "i" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueBoldSpan(BaseModel): + span_type: Literal["b"] = "b" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +class _WebVTTCueUnderlineSpan(BaseModel): + span_type: Literal["u"] = "u" + components: list["_WebVTTCueComponent"] + + @override + def __str__(self): + inner = "".join(str(span) for span in self.components) + return f"{inner}" + + +_WebVTTCueComponent = Annotated[ + Union[ + _WebVTTCueTextSpan, + _WebVTTCueClassSpan, + _WebVTTCueItalicSpan, + _WebVTTCueBoldSpan, + _WebVTTCueUnderlineSpan, + _WebVTTCueVoiceSpan, + ], + Field(discriminator="span_type", description="The WebVTT cue component"), +] + + +class _WebVTTCueBlock(BaseModel): + """Model representing a WebVTT cue block. + + The optional WebVTT cue settings list is not supported. + The cue payload is limited to the following spans: text, class, italic, bold, + underline, and voice. 
+ """ + + model_config = ConfigDict(regex_engine="python-re") + + identifier: Optional[_WebVTTCueIdentifier] = Field( + None, description="The WebVTT cue identifier" + ) + timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] + payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")] + + _pattern_block: ClassVar[re.Pattern] = re.compile( + r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>" + ) + _pattern_voice_tag: ClassVar[re.Pattern] = re.compile( + r"^\.[^\t\n\r &<>]+)?" # zero or more classes + r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation + ) + + @field_validator("payload", mode="after") + @classmethod + def validate_payload(cls, payload): + for voice in payload: + if "-->" in str(voice): + raise ValueError("Cue payload must not contain '-->'") + return payload + + @classmethod + def parse(cls, raw: str) -> "_WebVTTCueBlock": + lines = raw.strip().splitlines() + if not lines: + raise ValueError("Cue block must have at least one line") + identifier: Optional[_WebVTTCueIdentifier] = None + timing_line = lines[0] + if "-->" not in timing_line and len(lines) > 1: + identifier = timing_line + timing_line = lines[1] + cue_lines = lines[2:] + else: + cue_lines = lines[1:] + + if "-->" not in timing_line: + raise ValueError("Cue block must contain WebVTT cue timings") + + start, end = [t.strip() for t in timing_line.split("-->")] + end = re.split(" |\t", end)[0] # ignore the cue settings list + timings: _WebVTTCueTimings = _WebVTTCueTimings( + start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end) + ) + cue_text = " ".join(cue_lines).strip() + if cue_text.startswith("" not in cue_text: + # adding close tag for cue voice spans without end tag + cue_text += "" + + stack: list[list[_WebVTTCueComponent]] = [[]] + tag_stack: list[Union[str, tuple]] = [] + + pos = 0 + matches = list(cls._pattern_block.finditer(cue_text)) + i = 0 + while i < len(matches): + match = matches[i] + if 
match.start() > pos: + stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) + tag = match.group(0) + + if tag.startswith(("", "", "", "")): + tag_type = tag[1:2] + tag_stack.append(tag_type) + stack.append([]) + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueItalicSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueBoldSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueUnderlineSpan(components=children)) + tag_stack.pop() + elif tag == "": + children = stack.pop() + stack[-1].append(_WebVTTCueClassSpan(components=children)) + tag_stack.pop() + elif tag.startswith("")) + else: + parts.append(str(span)) + + return "".join(parts) + + +class _WebVTTFile(BaseModel): + """A model representing a WebVTT file.""" + + cue_blocks: list[_WebVTTCueBlock] + + @staticmethod + def verify_signature(content: str) -> bool: + if not content: + return False + elif len(content) == 6: + return content == "WEBVTT" + elif len(content) > 6 and content.startswith("WEBVTT"): + return content[6] in (" ", "\t", "\n") + else: + return False + + @classmethod + def parse(cls, raw: str) -> "_WebVTTFile": + # Normalize newlines to LF + raw = raw.replace("\r\n", "\n").replace("\r", "\n") + + # Check WebVTT signature + if not cls.verify_signature(raw): + raise ValueError("Invalid WebVTT file signature") + + # Strip "WEBVTT" header line + lines = raw.split("\n", 1) + body = lines[1] if len(lines) > 1 else "" + + # Remove NOTE/STYLE/REGION blocks + body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE) + body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE) + + # Split into cue blocks + raw_blocks = re.split(r"\n\s*\n", body.strip()) + cues: list[_WebVTTCueBlock] = [] + for block in raw_blocks: + try: + cues.append(_WebVTTCueBlock.parse(block)) + except ValueError as e: + 
_log.warning(f"Failed to parse cue block:\n{block}\n{e}") + + return cls(cue_blocks=cues) + + def __iter__(self): + return iter(self.cue_blocks) + + def __getitem__(self, idx): + return self.cue_blocks[idx] + + def __len__(self): + return len(self.cue_blocks) + + +class WebVTTDocumentBackend(DeclarativeDocumentBackend): + """Declarative backend for WebVTT (.vtt) files. + + This parser reads the content of a WebVTT file and converts + it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1 + + Each cue becomes a TextItem and the items are appended to the + document body by the cue's start time. + """ + + @override + def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + self.content: str = "" + try: + if isinstance(self.path_or_stream, BytesIO): + self.content = self.path_or_stream.getvalue().decode("utf-8") + if isinstance(self.path_or_stream, Path): + with open(self.path_or_stream, encoding="utf-8") as f: + self.content = f.read() + except Exception as e: + raise RuntimeError( + "Could not initialize the WebVTT backend for file with hash " + f"{self.document_hash}." + ) from e + + @override + def is_valid(self) -> bool: + return _WebVTTFile.verify_signature(self.content) + + @classmethod + @override + def supports_pagination(cls) -> bool: + return False + + @override + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + self.path_or_stream = None + + @classmethod + @override + def supported_formats(cls) -> set[InputFormat]: + return {InputFormat.VTT} + + @staticmethod + def _add_text_from_component( + doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem] + ) -> None: + """Adds a TextItem to a document by extracting text from a cue span component. 
+ + TODO: address nesting + """ + formatting = Formatting() + text = "" + if isinstance(item, _WebVTTCueItalicSpan): + formatting.italic = True + elif isinstance(item, _WebVTTCueBoldSpan): + formatting.bold = True + elif isinstance(item, _WebVTTCueUnderlineSpan): + formatting.underline = True + if isinstance(item, _WebVTTCueTextSpan): + text = item.text + else: + # TODO: address nesting + text = "".join( + [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)] + ) + if text := text.strip(): + doc.add_text( + label=DocItemLabel.TEXT, + text=text, + parent=parent, + content_layer=ContentLayer.BODY, + formatting=formatting, + ) + + @override + def convert(self) -> DoclingDocument: + _log.debug("Starting WebVTT conversion...") + if not self.is_valid(): + raise RuntimeError("Invalid WebVTT document.") + + origin = DocumentOrigin( + filename=self.file.name or "file", + mimetype="text/vtt", + binary_hash=self.document_hash, + ) + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) + + vtt: _WebVTTFile = _WebVTTFile.parse(self.content) + for block in vtt.cue_blocks: + block_group = doc.add_group( + label=GroupLabel.SECTION, + name="WebVTT cue block", + parent=None, + content_layer=ContentLayer.BODY, + ) + if block.identifier: + doc.add_text( + label=DocItemLabel.TEXT, + text=str(block.identifier), + parent=block_group, + content_layer=ContentLayer.BODY, + ) + doc.add_text( + label=DocItemLabel.TEXT, + text=str(block.timings), + parent=block_group, + content_layer=ContentLayer.BODY, + ) + for cue_span in block.payload: + if isinstance(cue_span, _WebVTTCueVoiceSpan): + voice_group = doc.add_group( + label=GroupLabel.INLINE, + name="WebVTT cue voice span", + parent=block_group, + content_layer=ContentLayer.BODY, + ) + voice = cue_span.annotation + if classes := cue_span.classes: + voice += f" ({', '.join(classes)})" + voice += ": " + doc.add_text( + label=DocItemLabel.TEXT, + text=voice, + parent=voice_group, + content_layer=ContentLayer.BODY, 
+ ) + for item in cue_span.components: + WebVTTDocumentBackend._add_text_from_component( + doc, item, voice_group + ) + else: + WebVTTDocumentBackend._add_text_from_component( + doc, cue_span, block_group + ) + + return doc diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 25a4386e..627ecf5f 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,7 +1,6 @@ -import math from collections import defaultdict from enum import Enum -from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union +from typing import TYPE_CHECKING, Optional, Type, Union import numpy as np from docling_core.types.doc import ( @@ -14,9 +13,7 @@ from docling_core.types.doc import ( ) from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float from docling_core.types.doc.page import SegmentedPdfPage, TextCell -from docling_core.types.io import ( - DocumentStream, -) +from docling_core.types.io import DocumentStream # DO NOT REMOVE; explicitly exposed from this location from PIL.Image import Image @@ -71,6 +68,7 @@ class InputFormat(str, Enum): METS_GBS = "mets_gbs" JSON_DOCLING = "json_docling" AUDIO = "audio" + VTT = "vtt" class OutputFormat(str, Enum): @@ -82,7 +80,7 @@ class OutputFormat(str, Enum): DOCTAGS = "doctags" -FormatToExtensions: Dict[InputFormat, List[str]] = { +FormatToExtensions: dict[InputFormat, list[str]] = { InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"], InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"], InputFormat.PDF: ["pdf"], @@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.METS_GBS: ["tar.gz"], InputFormat.JSON_DOCLING: ["json"], InputFormat.AUDIO: ["wav", "mp3"], + InputFormat.VTT: ["vtt"], } -FormatToMimeType: Dict[InputFormat, List[str]] = { +FormatToMimeType: dict[InputFormat, list[str]] = { InputFormat.DOCX: [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 
"application/vnd.openxmlformats-officedocument.wordprocessingml.template", @@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { InputFormat.METS_GBS: ["application/mets+xml"], InputFormat.JSON_DOCLING: ["application/json"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], + InputFormat.VTT: ["text/vtt"], } MimeTypeToFormat: dict[str, list[InputFormat]] = { @@ -162,8 +162,8 @@ class Cluster(BaseModel): label: DocItemLabel bbox: BoundingBox confidence: float = 1.0 - cells: List[TextCell] = [] - children: List["Cluster"] = [] # Add child cluster support + cells: list[TextCell] = [] + children: list["Cluster"] = [] # Add child cluster support @field_serializer("confidence") def _serialize(self, value: float, info: FieldSerializationInfo) -> float: @@ -179,7 +179,7 @@ class BasePageElement(BaseModel): class LayoutPrediction(BaseModel): - clusters: List[Cluster] = [] + clusters: list[Cluster] = [] class VlmPredictionToken(BaseModel): @@ -201,14 +201,14 @@ class ContainerElement( class Table(BasePageElement): - otsl_seq: List[str] + otsl_seq: list[str] num_rows: int = 0 num_cols: int = 0 - table_cells: List[TableCell] + table_cells: list[TableCell] class TableStructurePrediction(BaseModel): - table_map: Dict[int, Table] = {} + table_map: dict[int, Table] = {} class TextElement(BasePageElement): @@ -216,7 +216,7 @@ class TextElement(BasePageElement): class FigureElement(BasePageElement): - annotations: List[PictureDataType] = [] + annotations: list[PictureDataType] = [] provenance: Optional[str] = None predicted_class: Optional[str] = None confidence: Optional[float] = None @@ -234,12 +234,12 @@ class FigureElement(BasePageElement): class FigureClassificationPrediction(BaseModel): figure_count: int = 0 - figure_map: Dict[int, FigureElement] = {} + figure_map: dict[int, FigureElement] = {} class EquationPrediction(BaseModel): equation_count: int = 0 - equation_map: Dict[int, TextElement] = {} + equation_map: dict[int, 
TextElement] = {} class PagePredictions(BaseModel): @@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement] class AssembledUnit(BaseModel): - elements: List[PageElement] = [] - body: List[PageElement] = [] - headers: List[PageElement] = [] + elements: list[PageElement] = [] + body: list[PageElement] = [] + headers: list[PageElement] = [] class ItemAndImageEnrichmentElement(BaseModel): @@ -280,12 +280,12 @@ class Page(BaseModel): None # Internal PDF backend. By default it is cleared during assembling. ) _default_image_scale: float = 1.0 # Default image scale for external usage. - _image_cache: Dict[ + _image_cache: dict[ float, Image ] = {} # Cache of images in different scales. By default it is cleared during assembling. @property - def cells(self) -> List[TextCell]: + def cells(self) -> list[TextCell]: """Return text cells as a read-only view of parsed_page.textline_cells.""" if self.parsed_page is not None: return self.parsed_page.textline_cells @@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel): id: str model: Optional[str] = None # returned by openai - choices: List[OpenAiResponseChoice] + choices: list[OpenAiResponseChoice] created: int usage: OpenAiResponseUsage @@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel): class ConfidenceReport(PageConfidenceScores): - pages: Dict[int, PageConfidenceScores] = Field( + pages: dict[int, PageConfidenceScores] = Field( default_factory=lambda: defaultdict(PageConfidenceScores) ) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 7955ff9d..8ea45482 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel): mime = FormatToMimeType[InputFormat.PPTX][0] elif ext in FormatToExtensions[InputFormat.XLSX]: mime = FormatToMimeType[InputFormat.XLSX][0] + elif ext in FormatToExtensions[InputFormat.VTT]: + mime = FormatToMimeType[InputFormat.VTT][0] return mime diff --git 
a/docling/document_converter.py b/docling/document_converter.py index 1c314903..5d64d633 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.noop_backend import NoOpBackend +from docling.backend.webvtt_backend import WebVTTDocumentBackend from docling.backend.xml.jats_backend import JatsDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.datamodel.base_models import ( @@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: pipeline_cls=SimplePipeline, backend=DoclingJSONBackend ), InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend), + InputFormat.VTT: FormatOption( + pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend + ), } if (options := format_to_default_options.get(format)) is not None: return options diff --git a/docs/index.md b/docs/index.md index a41b1303..d18b6d21 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad ## Features -* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more * 📑 Advanced PDF understanding incl. 
page layout, reading order, table structure, code, formulas, image classification, and more * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON @@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad * 📤 Structured [information extraction][extraction] \[🧪 beta\] * 📑 New layout model (**Heron**) by default, for faster PDF parsing * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications +* 💬 Parsing of Web Video Text Tracks (WebVTT) files ### Coming soon * 📝 Metadata extraction, including title, authors, references & language * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Complex chemistry understanding (Molecular structures) -* 📝 Parsing of Web Video Text Tracks (WebVTT) files ## Get started diff --git a/docs/usage/supported_formats.md b/docs/usage/supported_formats.md index c38e7ffa..09f25ed5 100644 --- a/docs/usage/supported_formats.md +++ b/docs/usage/supported_formats.md @@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats. | PDF | | | DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML | | Markdown | | -| AsciiDoc | | +| AsciiDoc | Human-readable, plain-text markup language for structured technical content | | HTML, XHTML | | | CSV | | | PNG, JPEG, TIFF, BMP, WEBP | Image formats | +| WebVTT | Web Video Text Tracks format for displaying timed text | Schema-specific support: @@ -32,4 +33,4 @@ Schema-specific support: | Markdown | | | JSON | Lossless serialization of Docling Document | | Text | Plain text, i.e. 
without Markdown markers | -| Doctags | | +| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document | diff --git a/pyproject.toml b/pyproject.toml index 116d61f9..e22a1d15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ authors = [ requires-python = '>=3.9,<4.0' dependencies = [ 'pydantic (>=2.0.0,<3.0.0)', - 'docling-core[chunking] (>=2.48.0,<3.0.0)', + 'docling-core[chunking] (>=2.48.2,<3.0.0)', 'docling-parse (>=4.4.0,<5.0.0)', "docling-ibm-models>=3.9.1,<4", 'filetype (>=1.2.0,<2.0.0)', diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt new file mode 100644 index 00000000..d7840e99 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt @@ -0,0 +1,66 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group WebVTT cue block + item-2 at level 2: text: 00:11.000 --> 00:13.000 + item-3 at level 2: inline: group WebVTT cue voice span + item-4 at level 3: text: Roger Bingham: + item-5 at level 3: text: We are in New York City + item-6 at level 1: section: group WebVTT cue block + item-7 at level 2: text: 00:13.000 --> 00:16.000 + item-8 at level 2: inline: group WebVTT cue voice span + item-9 at level 3: text: Roger Bingham: + item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street + item-11 at level 1: section: group WebVTT cue block + item-12 at level 2: text: 00:16.000 --> 00:18.000 + item-13 at level 2: inline: group WebVTT cue voice span + item-14 at level 3: text: Roger Bingham: + item-15 at level 3: text: from the American Museum of Natural History + item-16 at level 1: section: group WebVTT cue block + item-17 at level 2: text: 00:18.000 --> 00:20.000 + item-18 at level 2: inline: group WebVTT cue voice span + item-19 at level 3: text: Roger Bingham: + item-20 at level 3: text: And with 
me is Neil deGrasse Tyson + item-21 at level 1: section: group WebVTT cue block + item-22 at level 2: text: 00:20.000 --> 00:22.000 + item-23 at level 2: inline: group WebVTT cue voice span + item-24 at level 3: text: Roger Bingham: + item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium + item-26 at level 1: section: group WebVTT cue block + item-27 at level 2: text: 00:22.000 --> 00:24.000 + item-28 at level 2: inline: group WebVTT cue voice span + item-29 at level 3: text: Roger Bingham: + item-30 at level 3: text: at the AMNH. + item-31 at level 1: section: group WebVTT cue block + item-32 at level 2: text: 00:24.000 --> 00:26.000 + item-33 at level 2: inline: group WebVTT cue voice span + item-34 at level 3: text: Roger Bingham: + item-35 at level 3: text: Thank you for walking down here. + item-36 at level 1: section: group WebVTT cue block + item-37 at level 2: text: 00:27.000 --> 00:30.000 + item-38 at level 2: inline: group WebVTT cue voice span + item-39 at level 3: text: Roger Bingham: + item-40 at level 3: text: And I want to do a follow-up on the last conversation we did. + item-41 at level 1: section: group WebVTT cue block + item-42 at level 2: text: 00:30.000 --> 00:31.500 + item-43 at level 2: inline: group WebVTT cue voice span + item-44 at level 3: text: Roger Bingham: + item-45 at level 3: text: When we e-mailed— + item-46 at level 1: section: group WebVTT cue block + item-47 at level 2: text: 00:30.500 --> 00:32.500 + item-48 at level 2: inline: group WebVTT cue voice span + item-49 at level 3: text: Neil deGrasse Tyson: + item-50 at level 3: text: Didn’t we talk about enough in that conversation? + item-51 at level 1: section: group WebVTT cue block + item-52 at level 2: text: 00:32.000 --> 00:35.500 + item-53 at level 2: inline: group WebVTT cue voice span + item-54 at level 3: text: Roger Bingham: + item-55 at level 3: text: No! 
No no no no; 'cos 'cos obviously 'cos + item-56 at level 1: section: group WebVTT cue block + item-57 at level 2: text: 00:32.500 --> 00:33.500 + item-58 at level 2: inline: group WebVTT cue voice span + item-59 at level 3: text: Neil deGrasse Tyson: + item-60 at level 3: text: Laughs + item-61 at level 1: section: group WebVTT cue block + item-62 at level 2: text: 00:35.500 --> 00:38.000 + item-63 at level 2: inline: group WebVTT cue voice span + item-64 at level 3: text: Roger Bingham: + item-65 at level 3: text: You know I’m so excited my glasses are falling off here. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json new file mode 100644 index 00000000..0d34890e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json @@ -0,0 +1,1074 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.6.0", + "name": "webvtt_example_01", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 16887312431371817791, + "filename": "webvtt_example_01.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/groups/10" + }, + { + "$ref": "#/groups/12" + }, + { + "$ref": "#/groups/14" + }, + { + "$ref": "#/groups/16" + }, + { + "$ref": "#/groups/18" + }, + { + "$ref": "#/groups/20" + }, + { + "$ref": "#/groups/22" + }, + { + "$ref": "#/groups/24" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", 
+ "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/groups/7" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/groups/6" + }, + "children": [ + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/groups/9" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": 
"#/groups/9", + "parent": { + "$ref": "#/groups/8" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/groups/11" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/groups/10" + }, + "children": [ + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/groups/13" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/groups/12" + }, + "children": [ + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/14", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/groups/15" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/15", + "parent": { + "$ref": "#/groups/14" + }, + "children": [ + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/16", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/groups/17" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/17", + "parent": { + "$ref": "#/groups/16" + }, 
+ "children": [ + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/18", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/groups/19" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/19", + "parent": { + "$ref": "#/groups/18" + }, + "children": [ + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/20", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/groups/21" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/21", + "parent": { + "$ref": "#/groups/20" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/22", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/groups/23" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/23", + "parent": { + "$ref": "#/groups/22" + }, + "children": [ + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/24", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/groups/25" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/25", + "parent": { + "$ref": "#/groups/24" + }, + "children": [ + { + "$ref": "#/texts/37" + }, + { + 
"$ref": "#/texts/38" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:11.000 --> 00:13.000", + "text": "00:11.000 --> 00:13.000" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "We are in New York City", + "text": "We are in New York City", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:13.000 --> 00:16.000", + "text": "00:13.000 --> 00:16.000" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "We’re actually at the Lucern Hotel, just down the street", + "text": "We’re actually at the Lucern Hotel, just down the street", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:16.000 --> 00:18.000", + "text": 
"00:16.000 --> 00:18.000" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "from the American Museum of Natural History", + "text": "from the American Museum of Natural History", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:18.000 --> 00:20.000", + "text": "00:18.000 --> 00:20.000" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "And with me is Neil deGrasse Tyson", + "text": "And with me is Neil deGrasse Tyson", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:20.000 --> 00:22.000", + "text": "00:20.000 --> 00:22.000" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/9" + }, + "children": 
[], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Astrophysicist, Director of the Hayden Planetarium", + "text": "Astrophysicist, Director of the Hayden Planetarium", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:22.000 --> 00:24.000", + "text": "00:22.000 --> 00:24.000" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "at the AMNH.", + "text": "at the AMNH.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:24.000 --> 00:26.000", + "text": "00:24.000 --> 00:26.000" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Thank you for walking down here.", + "text": "Thank you for walking down here.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": 
"#/groups/14" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:27.000 --> 00:30.000", + "text": "00:27.000 --> 00:30.000" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "And I want to do a follow-up on the last conversation we did.", + "text": "And I want to do a follow-up on the last conversation we did.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:30.000 --> 00:31.500", + "text": "00:30.000 --> 00:31.500" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/17" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/17" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "When we e-mailed—", + "text": "When we e-mailed—", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:30.500 --> 00:32.500", + "text": "00:30.500 --> 00:32.500" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/19" + }, + "children": [], + "content_layer": "body", + "label": "text", + 
"prov": [], + "orig": "Neil deGrasse Tyson: ", + "text": "Neil deGrasse Tyson: " + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/19" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Didn’t we talk about enough in that conversation?", + "text": "Didn’t we talk about enough in that conversation?", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/20" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:32.000 --> 00:35.500", + "text": "00:32.000 --> 00:35.500" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/21" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/21" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "No! No no no no; 'cos 'cos obviously 'cos", + "text": "No! 
No no no no; 'cos 'cos obviously 'cos", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/22" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:32.500 --> 00:33.500", + "text": "00:32.500 --> 00:33.500" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Neil deGrasse Tyson: ", + "text": "Neil deGrasse Tyson: " + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Laughs", + "text": "Laughs", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:35.500 --> 00:38.000", + "text": "00:35.500 --> 00:38.000" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/25" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Roger Bingham: ", + "text": "Roger Bingham: " + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/groups/25" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "You know I’m so excited my glasses are falling off here.", + "text": "You know I’m so excited my glasses are falling off here.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git 
a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md new file mode 100644 index 00000000..c5767028 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md @@ -0,0 +1,51 @@ +00:11.000 --> 00:13.000 + +Roger Bingham: We are in New York City + +00:13.000 --> 00:16.000 + +Roger Bingham: We’re actually at the Lucern Hotel, just down the street + +00:16.000 --> 00:18.000 + +Roger Bingham: from the American Museum of Natural History + +00:18.000 --> 00:20.000 + +Roger Bingham: And with me is Neil deGrasse Tyson + +00:20.000 --> 00:22.000 + +Roger Bingham: Astrophysicist, Director of the Hayden Planetarium + +00:22.000 --> 00:24.000 + +Roger Bingham: at the AMNH. + +00:24.000 --> 00:26.000 + +Roger Bingham: Thank you for walking down here. + +00:27.000 --> 00:30.000 + +Roger Bingham: And I want to do a follow-up on the last conversation we did. + +00:30.000 --> 00:31.500 + +Roger Bingham: When we e-mailed— + +00:30.500 --> 00:32.500 + +Neil deGrasse Tyson: Didn’t we talk about enough in that conversation? + +00:32.000 --> 00:35.500 + +Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos + +00:32.500 --> 00:33.500 + +Neil deGrasse Tyson: *Laughs* + +00:35.500 --> 00:38.000 + +Roger Bingham: You know I’m so excited my glasses are falling off here. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt new file mode 100644 index 00000000..6d90404f --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt @@ -0,0 +1,22 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group WebVTT cue block + item-2 at level 2: text: 00:00.000 --> 00:02.000 + item-3 at level 2: inline: group WebVTT cue voice span + item-4 at level 3: text: Esme (first, loud): + item-5 at level 3: text: It’s a blue apple tree! 
+ item-6 at level 1: section: group WebVTT cue block + item-7 at level 2: text: 00:02.000 --> 00:04.000 + item-8 at level 2: inline: group WebVTT cue voice span + item-9 at level 3: text: Mary: + item-10 at level 3: text: No way! + item-11 at level 1: section: group WebVTT cue block + item-12 at level 2: text: 00:04.000 --> 00:06.000 + item-13 at level 2: inline: group WebVTT cue voice span + item-14 at level 3: text: Esme: + item-15 at level 3: text: Hee! + item-16 at level 2: text: laughter + item-17 at level 1: section: group WebVTT cue block + item-18 at level 2: text: 00:06.000 --> 00:08.000 + item-19 at level 2: inline: group WebVTT cue voice span + item-20 at level 3: text: Mary (loud): + item-21 at level 3: text: That’s awesome! \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json new file mode 100644 index 00000000..c7700ae2 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json @@ -0,0 +1,376 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.6.0", + "name": "webvtt_example_02", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 12867774546881601731, + "filename": "webvtt_example_02.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + 
}, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/groups/7" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/groups/6" + }, + "children": [ + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00.000 --> 00:02.000", + "text": "00:00.000 --> 00:02.000" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": 
"body", + "label": "text", + "prov": [], + "orig": "Esme (first, loud): ", + "text": "Esme (first, loud): " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "It’s a blue apple tree!", + "text": "It’s a blue apple tree!", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:02.000 --> 00:04.000", + "text": "00:02.000 --> 00:04.000" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Mary: ", + "text": "Mary: " + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "No way!", + "text": "No way!", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:04.000 --> 00:06.000", + "text": "00:04.000 --> 00:06.000" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Esme: ", + "text": "Esme: " + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Hee!", + "text": "Hee!", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", 
+ "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "laughter", + "text": "laughter", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:06.000 --> 00:08.000", + "text": "00:06.000 --> 00:08.000" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Mary (loud): ", + "text": "Mary (loud): " + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "That’s awesome!", + "text": "That’s awesome!", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md new file mode 100644 index 00000000..db84cf11 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md @@ -0,0 +1,17 @@ +00:00.000 --> 00:02.000 + +Esme (first, loud): It’s a blue apple tree! + +00:02.000 --> 00:04.000 + +Mary: No way! + +00:04.000 --> 00:06.000 + +Esme: Hee! + +*laughter* + +00:06.000 --> 00:08.000 + +Mary (loud): That’s awesome! 
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt new file mode 100644 index 00000000..ca344e59 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt @@ -0,0 +1,77 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group WebVTT cue block + item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 + item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571 + item-4 at level 2: inline: group WebVTT cue voice span + item-5 at level 3: text: Speaker A: + item-6 at level 3: text: OK, I think now we should be recording + item-7 at level 1: section: group WebVTT cue block + item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 + item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403 + item-10 at level 2: inline: group WebVTT cue voice span + item-11 at level 3: text: Speaker A: + item-12 at level 3: text: properly. + item-13 at level 1: section: group WebVTT cue block + item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 + item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563 + item-16 at level 2: text: Good. + item-17 at level 1: section: group WebVTT cue block + item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 + item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803 + item-20 at level 2: inline: group WebVTT cue voice span + item-21 at level 3: text: Speaker A: + item-22 at level 3: text: Yeah. + item-23 at level 1: section: group WebVTT cue block + item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 + item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363 + item-26 at level 2: inline: group WebVTT cue voice span + item-27 at level 3: text: Speaker B: + item-28 at level 3: text: I was also thinking. 
+ item-29 at level 1: section: group WebVTT cue block + item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 + item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072 + item-32 at level 2: inline: group WebVTT cue voice span + item-33 at level 3: text: Speaker B: + item-34 at level 3: text: Would be maybe good to create items, + item-35 at level 1: section: group WebVTT cue block + item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 + item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811 + item-38 at level 2: inline: group WebVTT cue voice span + item-39 at level 3: text: Speaker B: + item-40 at level 3: text: some metadata, some options that can be specific. + item-41 at level 1: section: group WebVTT cue block + item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 + item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014 + item-44 at level 2: inline: group WebVTT cue voice span + item-45 at level 3: text: Speaker A: + item-46 at level 3: text: Yeah, I mean I think you went even more than + item-47 at level 1: section: group WebVTT cue block + item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 + item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643 + item-50 at level 2: inline: group WebVTT cue voice span + item-51 at level 3: text: Speaker B: + item-52 at level 3: text: But we preserved the atoms. + item-53 at level 1: section: group WebVTT cue block + item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 + item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907 + item-56 at level 2: inline: group WebVTT cue voice span + item-57 at level 3: text: Speaker A: + item-58 at level 3: text: than me. I just opened the format. 
+ item-59 at level 1: section: group WebVTT cue block + item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 + item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643 + item-62 at level 2: inline: group WebVTT cue voice span + item-63 at level 3: text: Speaker A: + item-64 at level 3: text: give it a try, yeah. + item-65 at level 1: section: group WebVTT cue block + item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 + item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043 + item-68 at level 2: inline: group WebVTT cue voice span + item-69 at level 3: text: Speaker B: + item-70 at level 3: text: Okay, talk to you later. + item-71 at level 1: section: group WebVTT cue block + item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 + item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283 + item-74 at level 2: inline: group WebVTT cue voice span + item-75 at level 3: text: Speaker A: + item-76 at level 3: text: See you. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json new file mode 100644 index 00000000..5b833971 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json @@ -0,0 +1,1240 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.6.0", + "name": "webvtt_example_03", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 11620880316586573676, + "filename": "webvtt_example_03.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/groups/9" + }, + { + "$ref": "#/groups/11" + }, + { + "$ref": "#/groups/13" + }, + { + "$ref": "#/groups/15" + }, + { + 
"$ref": "#/groups/17" + }, + { + "$ref": "#/groups/19" + }, + { + "$ref": "#/groups/21" + }, + { + "$ref": "#/groups/23" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/groups/5" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + 
"label": "inline" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/groups/8" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/groups/7" + }, + "children": [ + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/groups/10" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/groups/12" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/groups/11" + }, + "children": [ + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/groups/14" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/14", + "parent": { + "$ref": "#/groups/13" + }, + "children": [ + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + } + 
], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/15", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/groups/16" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/16", + "parent": { + "$ref": "#/groups/15" + }, + "children": [ + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/texts/34" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/17", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/groups/18" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/18", + "parent": { + "$ref": "#/groups/17" + }, + "children": [ + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/19", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/groups/20" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/20", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/21", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/groups/22" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/22", + "parent": { + "$ref": "#/groups/21" + }, + 
"children": [ + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + }, + { + "self_ref": "#/groups/23", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/47" + }, + { + "$ref": "#/texts/48" + }, + { + "$ref": "#/groups/24" + } + ], + "content_layer": "body", + "name": "WebVTT cue block", + "label": "section" + }, + { + "self_ref": "#/groups/24", + "parent": { + "$ref": "#/groups/23" + }, + "children": [ + { + "$ref": "#/texts/49" + }, + { + "$ref": "#/texts/50" + } + ], + "content_layer": "body", + "name": "WebVTT cue voice span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:04.963 --> 00:00:08.571", + "text": "00:00:04.963 --> 00:00:08.571" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "OK, I think now we should be recording", + "text": "OK, I think now we should be recording", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": 
"62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:08.571 --> 00:00:09.403", + "text": "00:00:08.571 --> 00:00:09.403" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "properly.", + "text": "properly.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:10.683 --> 00:00:11.563", + "text": "00:00:10.683 --> 00:00:11.563" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Good.", + "text": "Good.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0" + }, + { + "self_ref": 
"#/texts/12", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:13.363 --> 00:00:13.803", + "text": "00:00:13.363 --> 00:00:13.803" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Yeah.", + "text": "Yeah.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:49.603 --> 00:00:53.363", + "text": "00:00:49.603 --> 00:00:53.363" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker B: ", + "text": "Speaker B: " + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "I was also thinking.", + "text": "I was also thinking.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": 
"62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:00:54.963 --> 00:01:02.072", + "text": "00:00:54.963 --> 00:01:02.072" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker B: ", + "text": "Speaker B: " + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Would be maybe good to create items,", + "text": "Would be maybe good to create items,", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:02.072 --> 00:01:06.811", + "text": "00:01:02.072 --> 00:01:06.811" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker B: ", + "text": "Speaker B: " + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "some metadata, some options that can be specific.", + "text": "some metadata, some options that can be specific.", + "formatting": { + "bold": false, + "italic": false, + 
"underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:10.243 --> 00:01:13.014", + "text": "00:01:10.243 --> 00:01:13.014" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/14" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/14" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Yeah, I mean I think you went even more than", + "text": "Yeah, I mean I think you went even more than", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:10.563 --> 00:01:12.643", + "text": "00:01:10.563 --> 00:01:12.643" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker B: ", + "text": "Speaker B: " + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/16" + }, + 
"children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "But we preserved the atoms.", + "text": "But we preserved the atoms.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/groups/17" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/17" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:13.014 --> 00:01:15.907", + "text": "00:01:13.014 --> 00:01:15.907" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "than me. I just opened the format.", + "text": "than me. 
I just opened the format.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/groups/19" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1" + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/groups/19" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:50.222 --> 00:01:51.643", + "text": "00:01:50.222 --> 00:01:51.643" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/groups/20" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/groups/20" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "give it a try, yeah.", + "text": "give it a try, yeah.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/groups/21" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/groups/21" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:52.043 --> 00:01:55.043", + "text": "00:01:52.043 --> 00:01:55.043" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/groups/22" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker B: ", + "text": "Speaker B: " + }, + { + "self_ref": "#/texts/46", + "parent": 
{ + "$ref": "#/groups/22" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Okay, talk to you later.", + "text": "Okay, talk to you later.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", + "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0" + }, + { + "self_ref": "#/texts/48", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "00:01:54.603 --> 00:01:55.283", + "text": "00:01:54.603 --> 00:01:55.283" + }, + { + "self_ref": "#/texts/49", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Speaker A: ", + "text": "Speaker A: " + }, + { + "self_ref": "#/texts/50", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "See you.", + "text": "See you.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md new file mode 100644 index 00000000..859a6dde --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md @@ -0,0 +1,77 @@ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 + +00:00:04.963 --> 00:00:08.571 + +Speaker A: OK, I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 + +00:00:08.571 --> 00:00:09.403 + +Speaker A: 
properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 + +00:00:10.683 --> 00:00:11.563 + +Good. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 + +00:00:13.363 --> 00:00:13.803 + +Speaker A: Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 + +00:00:49.603 --> 00:00:53.363 + +Speaker B: I was also thinking. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 + +00:00:54.963 --> 00:01:02.072 + +Speaker B: Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 + +00:01:02.072 --> 00:01:06.811 + +Speaker B: some metadata, some options that can be specific. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 + +00:01:10.243 --> 00:01:13.014 + +Speaker A: Yeah, I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 + +00:01:10.563 --> 00:01:12.643 + +Speaker B: But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 + +00:01:13.014 --> 00:01:15.907 + +Speaker A: than me. I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 + +00:01:50.222 --> 00:01:51.643 + +Speaker A: give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 + +00:01:52.043 --> 00:01:55.043 + +Speaker B: Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 + +00:01:54.603 --> 00:01:55.283 + +Speaker A: See you. \ No newline at end of file diff --git a/tests/data/webvtt/webvtt_example_01.vtt b/tests/data/webvtt/webvtt_example_01.vtt new file mode 100644 index 00000000..333ca4a8 --- /dev/null +++ b/tests/data/webvtt/webvtt_example_01.vtt @@ -0,0 +1,42 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. 
https://www.w3.org/TR/webvtt1/ + +00:11.000 --> 00:13.000 +We are in New York City + +00:13.000 --> 00:16.000 +We’re actually at the Lucern Hotel, just down the street + +00:16.000 --> 00:18.000 +from the American Museum of Natural History + +00:18.000 --> 00:20.000 +And with me is Neil deGrasse Tyson + +00:20.000 --> 00:22.000 +Astrophysicist, Director of the Hayden Planetarium + +00:22.000 --> 00:24.000 +at the AMNH. + +00:24.000 --> 00:26.000 +Thank you for walking down here. + +00:27.000 --> 00:30.000 +And I want to do a follow-up on the last conversation we did. + +00:30.000 --> 00:31.500 align:right size:50% +When we e-mailed— + +00:30.500 --> 00:32.500 align:left size:50% +Didn’t we talk about enough in that conversation? + +00:32.000 --> 00:35.500 align:right size:50% +No! No no no no; 'cos 'cos obviously 'cos + +00:32.500 --> 00:33.500 align:left size:50% +Laughs + +00:35.500 --> 00:38.000 +You know I’m so excited my glasses are falling off here. diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt new file mode 100644 index 00000000..1152a1e8 --- /dev/null +++ b/tests/data/webvtt/webvtt_example_02.vtt @@ -0,0 +1,15 @@ +WEBVTT + +NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ + +00:00.000 --> 00:02.000 +It’s a blue apple tree! + +00:02.000 --> 00:04.000 +No way! + +00:04.000 --> 00:06.000 +Hee! laughter + +00:06.000 --> 00:08.000 +That’s awesome! \ No newline at end of file diff --git a/tests/data/webvtt/webvtt_example_03.vtt b/tests/data/webvtt/webvtt_example_03.vtt new file mode 100644 index 00000000..a4dc1291 --- /dev/null +++ b/tests/data/webvtt/webvtt_example_03.vtt @@ -0,0 +1,57 @@ +WEBVTT + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +00:00:04.963 --> 00:00:08.571 +OK, +I think now we should be recording + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 +00:00:08.571 --> 00:00:09.403 +properly. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 +00:00:10.683 --> 00:00:11.563 +Good. 
+ +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 +00:00:13.363 --> 00:00:13.803 +Yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 +00:00:49.603 --> 00:00:53.363 +I was also thinking. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 +00:00:54.963 --> 00:01:02.072 +Would be maybe good to create items, + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 +00:01:02.072 --> 00:01:06.811 +some metadata, +some options that can be specific. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 +00:01:10.243 --> 00:01:13.014 +Yeah, +I mean I think you went even more than + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 +00:01:10.563 --> 00:01:12.643 +But we preserved the atoms. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +00:01:13.014 --> 00:01:15.907 +than me. +I just opened the format. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +00:01:50.222 --> 00:01:51.643 +give it a try, yeah. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +00:01:52.043 --> 00:01:55.043 +Okay, talk to you later. + +62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +00:01:54.603 --> 00:01:55.283 +See you. 
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
new file mode 100644
index 00000000..a910671b
--- /dev/null
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,233 @@
+# Assisted by watsonx Code Assistant
+
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import DoclingDocument
+from pydantic import ValidationError
+
+from docling.backend.webvtt_backend import (
+    _WebVTTCueItalicSpan,
+    _WebVTTCueTextSpan,
+    _WebVTTCueTimings,
+    _WebVTTCueVoiceSpan,
+    _WebVTTFile,
+    _WebVTTTimestamp,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+from .test_data_gen_flag import GEN_TEST_DATA
+from .verify_utils import verify_document, verify_export
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Validate the WebVTT building blocks: timestamps, timings and cue spans."""
+    valid_timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    valid_total_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for ts, expected in zip(valid_timestamps, valid_total_seconds):
+        model = _WebVTTTimestamp(raw=ts)
+        assert model.seconds == pytest.approx(expected)
+
+    # Invalid timestamps must be rejected.
+    invalid_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for ts in invalid_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=ts)
+
+    # __str__ returns the raw timestamp string.
+    model = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(model) == "00:01:02.345"
+
+    # Valid cue timings.
+    start = _WebVTTTimestamp(raw="00:10.005")
+    end = _WebVTTTimestamp(raw="00:14.007")
+    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    assert cue_timings.start == start
+    assert cue_timings.end == end
+    assert str(cue_timings) == "00:10.005 --> 00:14.007"
+
+    # Invalid cue timings: end timestamp before start.
+    start = _WebVTTTimestamp(raw="00:10.700")
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start, end=end)
+    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
+
+    # Invalid cue timings: missing end.
+    start = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start)
+    assert "Field required" in str(excinfo.value)
+
+    # Invalid cue timings: missing start.
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(end=end)
+    assert "Field required" in str(excinfo.value)
+
+    # Text span: valid text.
+    valid_text = "This is a valid cue text span."
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+    assert str(span) == valid_text
+
+    # Text span: newline characters are rejected.
+    invalid_text = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    # Text span: ampersand is rejected.
+    invalid_text = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    # Text span: less-than sign is rejected.
+    invalid_text = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    # Text span: empty text is rejected.
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text="")
+
+    # Voice span: annotation validation.
+    valid_annotation = "valid-annotation"
+    invalid_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
+    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+
+    # Voice span: classes validation.
+    annotation = "speaker name"
+    valid_classes = ["class1", "class2"]
+    invalid_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+
+    # Voice span: components validation.
+    annotation = "speaker name"
+    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    invalid_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+
+    # Voice span: valid instances and their string form.
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+
+    expected_str = "random text"
+    assert str(cue_span) == expected_str
+
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    expected_str = "random text"
+    assert str(cue_span) == expected_str
+
+
+def test_webvtt_file():
+    """Parse the three sample WebVTT files and check the resulting cue blocks."""
+    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse
+
+    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    # This cue is expected to hold a plain text span rather than a voice span.
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."
+
+
+def test_e2e_vtt_conversions():
+    """Convert each sample WebVTT file and compare its exports to the ground truth."""
+    directory = Path("./tests/data/webvtt/")
+    vtt_paths = sorted(directory.rglob("*.vtt"))
+    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
+
+    for vtt in vtt_paths:
+        gt_path = vtt.parent.parent / "groundtruth" / "docling_v2" / vtt.name
+
+        conv_result: ConversionResult = converter.convert(vtt)
+
+        doc: DoclingDocument = conv_result.document
+
+        pred_md: str = doc.export_to_markdown(escape_html=False)
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
+
+        pred_itxt: str = doc._export_to_indented_text(
+            max_text_len=70, explicit_tables=False
+        )
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
+
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE)
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
index 29f1dafe..4b7ce469 100644
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
     doc_path.write_text("xyz", encoding="utf-8")
     assert dci._guess_format(doc_path) is None
 
+    # Valid WebVTT
+    buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
+    stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
+    assert dci._guess_format(stream) == InputFormat.VTT
+
     # Valid Docling JSON
     test_str = '{"name": ""}'
     stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
diff --git a/uv.lock b/uv.lock
index d265f426..c7f3721a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1154,7 +1154,7 @@ requires-dist = [
     { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
     { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
     { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
     { name =
"docling-ibm-models", specifier = ">=3.9.1,<4" }, { name = "docling-parse", specifier = ">=4.4.0,<5.0.0" }, { name = "easyocr", specifier = ">=1.7,<2.0" }, @@ -1233,7 +1233,7 @@ examples = [ [[package]] name = "docling-core" -version = "2.48.1" +version = "2.48.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonref" }, @@ -1247,9 +1247,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" }, + { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" }, ] [package.optional-dependencies] @@ -4936,6 +4936,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = 
"2025-07-27T13:03:27.495Z" }, { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" }, { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" }, + { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" }, + { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" }, + { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" }, { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" }, { url = 
"https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" }, { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" }, @@ -4957,6 +4960,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" }, { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" }, { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" }, + { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" }, { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" }, { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" }, { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },