feat: add a backend parser for WebVTT files (#2288)

* feat: add a backend parser for WebVTT files

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: update README with VTT support

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: add description to supported formats

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade docling-core to unescape WebVTT in markdown

Pin the new release of docling-core 2.48.2.
Do not escape HTML reserved characters when exporting WebVTT documents to markdown.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test: add missing copyright notice

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-09-22 15:24:34 +02:00
parent b5628f1227
commit 46efaaefee
23 changed files with 3969 additions and 34 deletions
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -0,0 +1,66 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 00:11.000 --> 00:13.000
+    item-3 at level 2: inline: group WebVTT cue voice span
+      item-4 at level 3: text: Roger Bingham: 
+      item-5 at level 3: text: We are in New York City
+  item-6 at level 1: section: group WebVTT cue block
+    item-7 at level 2: text: 00:13.000 --> 00:16.000
+    item-8 at level 2: inline: group WebVTT cue voice span
+      item-9 at level 3: text: Roger Bingham: 
+      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
+  item-11 at level 1: section: group WebVTT cue block
+    item-12 at level 2: text: 00:16.000 --> 00:18.000
+    item-13 at level 2: inline: group WebVTT cue voice span
+      item-14 at level 3: text: Roger Bingham: 
+      item-15 at level 3: text: from the American Museum of Natural History
+  item-16 at level 1: section: group WebVTT cue block
+    item-17 at level 2: text: 00:18.000 --> 00:20.000
+    item-18 at level 2: inline: group WebVTT cue voice span
+      item-19 at level 3: text: Roger Bingham: 
+      item-20 at level 3: text: And with me is Neil deGrasse Tyson
+  item-21 at level 1: section: group WebVTT cue block
+    item-22 at level 2: text: 00:20.000 --> 00:22.000
+    item-23 at level 2: inline: group WebVTT cue voice span
+      item-24 at level 3: text: Roger Bingham: 
+      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
+  item-26 at level 1: section: group WebVTT cue block
+    item-27 at level 2: text: 00:22.000 --> 00:24.000
+    item-28 at level 2: inline: group WebVTT cue voice span
+      item-29 at level 3: text: Roger Bingham: 
+      item-30 at level 3: text: at the AMNH.
+  item-31 at level 1: section: group WebVTT cue block
+    item-32 at level 2: text: 00:24.000 --> 00:26.000
+    item-33 at level 2: inline: group WebVTT cue voice span
+      item-34 at level 3: text: Roger Bingham: 
+      item-35 at level 3: text: Thank you for walking down here.
+  item-36 at level 1: section: group WebVTT cue block
+    item-37 at level 2: text: 00:27.000 --> 00:30.000
+    item-38 at level 2: inline: group WebVTT cue voice span
+      item-39 at level 3: text: Roger Bingham: 
+      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
+  item-41 at level 1: section: group WebVTT cue block
+    item-42 at level 2: text: 00:30.000 --> 00:31.500
+    item-43 at level 2: inline: group WebVTT cue voice span
+      item-44 at level 3: text: Roger Bingham: 
+      item-45 at level 3: text: When we e-mailed—
+  item-46 at level 1: section: group WebVTT cue block
+    item-47 at level 2: text: 00:30.500 --> 00:32.500
+    item-48 at level 2: inline: group WebVTT cue voice span
+      item-49 at level 3: text: Neil deGrasse Tyson: 
+      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
+  item-51 at level 1: section: group WebVTT cue block
+    item-52 at level 2: text: 00:32.000 --> 00:35.500
+    item-53 at level 2: inline: group WebVTT cue voice span
+      item-54 at level 3: text: Roger Bingham: 
+      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
+  item-56 at level 1: section: group WebVTT cue block
+    item-57 at level 2: text: 00:32.500 --> 00:33.500
+    item-58 at level 2: inline: group WebVTT cue voice span
+      item-59 at level 3: text: Neil deGrasse Tyson: 
+      item-60 at level 3: text: Laughs
+  item-61 at level 1: section: group WebVTT cue block
+    item-62 at level 2: text: 00:35.500 --> 00:38.000
+    item-63 at level 2: inline: group WebVTT cue voice span
+      item-64 at level 3: text: Roger Bingham: 
+      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -0,0 +1,51 @@
+00:11.000 --> 00:13.000
+
+Roger Bingham:  We are in New York City
+
+00:13.000 --> 00:16.000
+
+Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+
+Roger Bingham:  from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+
+Roger Bingham:  And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+
+Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+
+Roger Bingham:  at the AMNH.
+
+00:24.000 --> 00:26.000
+
+Roger Bingham:  Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+
+Roger Bingham:  And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500
+
+Roger Bingham:  When we e-mailed—
+
+00:30.500 --> 00:32.500
+
+Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500
+
+Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500
+
+Neil deGrasse Tyson:  *Laughs*
+
+00:35.500 --> 00:38.000
+
+Roger Bingham:  You know I’m so excited my glasses are falling off here.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -0,0 +1,22 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 00:00.000 --> 00:02.000
+    item-3 at level 2: inline: group WebVTT cue voice span
+      item-4 at level 3: text: Esme (first, loud): 
+      item-5 at level 3: text: It’s a blue apple tree!
+  item-6 at level 1: section: group WebVTT cue block
+    item-7 at level 2: text: 00:02.000 --> 00:04.000
+    item-8 at level 2: inline: group WebVTT cue voice span
+      item-9 at level 3: text: Mary: 
+      item-10 at level 3: text: No way!
+  item-11 at level 1: section: group WebVTT cue block
+    item-12 at level 2: text: 00:04.000 --> 00:06.000
+    item-13 at level 2: inline: group WebVTT cue voice span
+      item-14 at level 3: text: Esme: 
+      item-15 at level 3: text: Hee!
+    item-16 at level 2: text: laughter
+  item-17 at level 1: section: group WebVTT cue block
+    item-18 at level 2: text: 00:06.000 --> 00:08.000
+    item-19 at level 2: inline: group WebVTT cue voice span
+      item-20 at level 3: text: Mary (loud): 
+      item-21 at level 3: text: That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -0,0 +1,376 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.6.0",
+  "name": "webvtt_example_02",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 12867774546881601731,
+    "filename": "webvtt_example_02.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/2"
+      },
+      {
+        "$ref": "#/groups/4"
+      },
+      {
+        "$ref": "#/groups/6"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/0"
+        },
+        {
+          "$ref": "#/groups/1"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/groups/3"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/3",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/groups/5"
+        },
+        {
+          "$ref": "#/texts/9"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/5",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/10"
+        },
+        {
+          "$ref": "#/groups/7"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue block",
+      "label": "section"
+    },
+    {
+      "self_ref": "#/groups/7",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/11"
+        },
+        {
+          "$ref": "#/texts/12"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue voice span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:00.000 --> 00:02.000",
+      "text": "00:00.000 --> 00:02.000"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Esme (first, loud): ",
+      "text": "Esme (first, loud): "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "It’s a blue apple tree!",
+      "text": "It’s a blue apple tree!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:02.000 --> 00:04.000",
+      "text": "00:02.000 --> 00:04.000"
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Mary: ",
+      "text": "Mary: "
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "No way!",
+      "text": "No way!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:04.000 --> 00:06.000",
+      "text": "00:04.000 --> 00:06.000"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Esme: ",
+      "text": "Esme: "
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Hee!",
+      "text": "Hee!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "laughter",
+      "text": "laughter",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "00:06.000 --> 00:08.000",
+      "text": "00:06.000 --> 00:08.000"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Mary (loud): ",
+      "text": "Mary (loud): "
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "That’s awesome!",
+      "text": "That’s awesome!",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -0,0 +1,17 @@
+00:00.000 --> 00:02.000
+
+Esme (first, loud):  It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+
+Mary:  No way!
+
+00:04.000 --> 00:06.000
+
+Esme:  Hee!
+
+*laughter*
+
+00:06.000 --> 00:08.000
+
+Mary (loud):  That’s awesome!
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -0,0 +1,77 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group WebVTT cue block
+    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
+    item-4 at level 2: inline: group WebVTT cue voice span
+      item-5 at level 3: text: Speaker A: 
+      item-6 at level 3: text: OK, I think now we should be recording
+  item-7 at level 1: section: group WebVTT cue block
+    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
+    item-10 at level 2: inline: group WebVTT cue voice span
+      item-11 at level 3: text: Speaker A: 
+      item-12 at level 3: text: properly.
+  item-13 at level 1: section: group WebVTT cue block
+    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
+    item-16 at level 2: text: Good.
+  item-17 at level 1: section: group WebVTT cue block
+    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
+    item-20 at level 2: inline: group WebVTT cue voice span
+      item-21 at level 3: text: Speaker A: 
+      item-22 at level 3: text: Yeah.
+  item-23 at level 1: section: group WebVTT cue block
+    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
+    item-26 at level 2: inline: group WebVTT cue voice span
+      item-27 at level 3: text: Speaker B: 
+      item-28 at level 3: text: I was also thinking.
+  item-29 at level 1: section: group WebVTT cue block
+    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
+    item-32 at level 2: inline: group WebVTT cue voice span
+      item-33 at level 3: text: Speaker B: 
+      item-34 at level 3: text: Would be maybe good to create items,
+  item-35 at level 1: section: group WebVTT cue block
+    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
+    item-38 at level 2: inline: group WebVTT cue voice span
+      item-39 at level 3: text: Speaker B: 
+      item-40 at level 3: text: some metadata, some options that can be specific.
+  item-41 at level 1: section: group WebVTT cue block
+    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
+    item-44 at level 2: inline: group WebVTT cue voice span
+      item-45 at level 3: text: Speaker A: 
+      item-46 at level 3: text: Yeah, I mean I think you went even more than
+  item-47 at level 1: section: group WebVTT cue block
+    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
+    item-50 at level 2: inline: group WebVTT cue voice span
+      item-51 at level 3: text: Speaker B: 
+      item-52 at level 3: text: But we preserved the atoms.
+  item-53 at level 1: section: group WebVTT cue block
+    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
+    item-56 at level 2: inline: group WebVTT cue voice span
+      item-57 at level 3: text: Speaker A: 
+      item-58 at level 3: text: than me. I just opened the format.
+  item-59 at level 1: section: group WebVTT cue block
+    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
+    item-62 at level 2: inline: group WebVTT cue voice span
+      item-63 at level 3: text: Speaker A: 
+      item-64 at level 3: text: give it a try, yeah.
+  item-65 at level 1: section: group WebVTT cue block
+    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
+    item-68 at level 2: inline: group WebVTT cue voice span
+      item-69 at level 3: text: Speaker B: 
+      item-70 at level 3: text: Okay, talk to you later.
+  item-71 at level 1: section: group WebVTT cue block
+    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
+    item-74 at level 2: inline: group WebVTT cue voice span
+      item-75 at level 3: text: Speaker A: 
+      item-76 at level 3: text: See you.
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -0,0 +1,77 @@
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+
+00:00:04.963 --> 00:00:08.571
+
+Speaker A:  OK, I think now we should be recording
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+
+00:00:08.571 --> 00:00:09.403
+
+Speaker A:  properly.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+
+00:00:10.683 --> 00:00:11.563
+
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+
+00:00:13.363 --> 00:00:13.803
+
+Speaker A:  Yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+
+00:00:49.603 --> 00:00:53.363
+
+Speaker B:  I was also thinking.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+
+00:00:54.963 --> 00:01:02.072
+
+Speaker B:  Would be maybe good to create items,
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+
+00:01:02.072 --> 00:01:06.811
+
+Speaker B:  some metadata, some options that can be specific.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+
+00:01:10.243 --> 00:01:13.014
+
+Speaker A:  Yeah, I mean I think you went even more than
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+
+00:01:10.563 --> 00:01:12.643
+
+Speaker B:  But we preserved the atoms.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+
+00:01:13.014 --> 00:01:15.907
+
+Speaker A:  than me. I just opened the format.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+
+00:01:50.222 --> 00:01:51.643
+
+Speaker A:  give it a try, yeah.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+
+00:01:52.043 --> 00:01:55.043
+
+Speaker B:  Okay, talk to you later.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+
+00:01:54.603 --> 00:01:55.283
+
+Speaker A:  See you.
--- a/tests/data/webvtt/webvtt_example_01.vtt
+++ b/tests/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+<v Roger Bingham>We are in New York City
+
+00:13.000 --> 00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+<v Roger Bingham>from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+<v Roger Bingham>at the AMNH.
+
+00:24.000 --> 00:26.000
+<v Roger Bingham>Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+<v Roger Bingham>When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+<v Neil deGrasse Tyson><i>Laughs</i>
+
+00:35.500 --> 00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+<v.first.loud Esme>It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+<v Mary>No way!
+
+00:04.000 --> 00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:06.000 --> 00:08.000
+<v.loud Mary>That’s awesome!
--- a/tests/data/webvtt/webvtt_example_03.vtt
+++ b/tests/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -0,0 +1,232 @@
+# Assisted by watsonx Code Assistant
+
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import DoclingDocument
+from pydantic import ValidationError
+
+from docling.backend.webvtt_backend import (
+    _WebVTTCueItalicSpan,
+    _WebVTTCueTextSpan,
+    _WebVTTCueTimings,
+    _WebVTTCueVoiceSpan,
+    _WebVTTFile,
+    _WebVTTTimestamp,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+from .test_data_gen_flag import GEN_TEST_DATA
+from .verify_utils import verify_document, verify_export
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Exercise the WebVTT cue building blocks: timestamps, timings and spans."""
+    # Valid timestamps, each paired with the total seconds it should report.
+    timestamp_cases = [
+        ("00:01:02.345", 1 * 60 + 2.345),
+        ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
+        ("02:34.567", 2 * 60 + 34.567),
+        ("00:00:00.000", 0.0),
+    ]
+    for raw, expected_seconds in timestamp_cases:
+        parsed = _WebVTTTimestamp(raw=raw)
+        assert parsed.seconds == expected_seconds
+
+    # Invalid timestamps must fail validation.
+    malformed_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for raw in malformed_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=raw)
+
+    # str() of a timestamp gives back the raw value it was built from.
+    parsed = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(parsed) == "00:01:02.345"
+
+    # Valid cue timings keep both ends and render with the "-->" separator.
+    begin = _WebVTTTimestamp(raw="00:10.005")
+    finish = _WebVTTTimestamp(raw="00:14.007")
+    timings = _WebVTTCueTimings(start=begin, end=finish)
+    assert timings.start == begin
+    assert timings.end == finish
+    assert str(timings) == "00:10.005 --> 00:14.007"
+
+    # Cue timings whose end precedes the start are rejected.
+    begin = _WebVTTTimestamp(raw="00:10.700")
+    finish = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as err:
+        _WebVTTCueTimings(start=begin, end=finish)
+    assert "End timestamp must be greater than start timestamp" in str(err.value)
+
+    # Cue timings without an end are rejected.
+    begin = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as err:
+        _WebVTTCueTimings(start=begin)
+    assert "Field required" in str(err.value)
+
+    # Cue timings without a start are rejected.
+    finish = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as err:
+        _WebVTTCueTimings(end=finish)
+    assert "Field required" in str(err.value)
+
+    # A plain text span keeps its text and renders as that text.
+    plain_text = "This is a valid cue text span."
+    text_span = _WebVTTCueTextSpan(text=plain_text)
+    assert text_span.text == plain_text
+    assert str(text_span) == plain_text
+
+    # Text spans reject newlines, ampersands, less-than signs and empty text.
+    rejected_texts = [
+        "This cue text span\ncontains a newline.",
+        "This cue text span contains &.",
+        "This cue text span contains <.",
+        "",
+    ]
+    for bad_text in rejected_texts:
+        with pytest.raises(ValidationError):
+            _WebVTTCueTextSpan(text=bad_text)
+
+    # Voice span annotations are validated.
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation="invalid\nannotation")
+    assert _WebVTTCueVoiceSpan(annotation="valid-annotation")
+
+    # Voice span classes are validated.
+    speaker = "speaker name"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=speaker, classes=["class\nwith\nnewlines", ""])
+    assert _WebVTTCueVoiceSpan(annotation=speaker, classes=["class1", "class2"])
+
+    # Voice span components are validated.
+    speaker = "speaker name"
+    good_components = [_WebVTTCueTextSpan(text="random text")]
+    bad_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=speaker, components=bad_components)
+    assert _WebVTTCueVoiceSpan(annotation=speaker, components=good_components)
+
+    # A voice span with classes renders them dot-separated after the tag name.
+    voice_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    assert str(voice_span) == "<v.loud.clear speaker>random text</v>"
+
+    # Without classes only the annotation follows the tag name.
+    voice_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    assert str(voice_span) == "<v speaker>random text</v>"
+
+
+def test_webvtt_file():
+    """Parse the sample WebVTT files and check the resulting cue blocks.
+
+    Three fixtures are covered: a voice span wrapping an italic span
+    (example 01), rebuilding the file text from the parsed cue blocks
+    (example 02), and cue identifiers plus a cue whose payload is a bare
+    text span (example 03).
+    """
+    # Example 01: the twelfth cue is a voice span containing "<i>Laughs</i>".
+    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    # Example 02: header + NOTE block + stringified cue blocks must
+    # reproduce the original file content exactly.
+    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n\n".join(str(block) for block in vtt.cue_blocks)
+    assert content == reverse
+
+    # Example 03: every cue block carries an identifier.
+    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+    vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    # The third cue ("Good.") has no voice tag, so its payload is a plain
+    # text span. The previous check here re-tested the stale ``cue_span``
+    # from example 01 and verified nothing about this block.
+    block = vtt.cue_blocks[2]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."
+
+
+def test_e2e_vtt_conversions():
+    """Convert each sample WebVTT file and compare its exports to groundtruth.
+
+    Every ``*.vtt`` file found under ``tests/data/webvtt`` is converted, and
+    its markdown, indented-text and document outputs are handed to the
+    verification helpers together with the ``GENERATE`` flag.
+    """
+    samples_dir = Path("./tests/data/webvtt/")
+    samples = sorted(samples_dir.rglob("*.vtt"))
+    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])
+
+    for sample in samples:
+        # Groundtruth files are named after the input file (including its
+        # ".vtt" suffix) followed by an export-specific extension.
+        gt_dir = sample.parent.parent / "groundtruth" / "docling_v2"
+        gt_prefix = str(gt_dir / sample.name)
+
+        result: ConversionResult = converter.convert(sample)
+        document: DoclingDocument = result.document
+
+        markdown: str = document.export_to_markdown(escape_html=False)
+        md_matches = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
+        assert md_matches, "export to md"
+
+        indented: str = document._export_to_indented_text(
+            max_text_len=70, explicit_tables=False
+        )
+        itxt_matches = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
+        assert itxt_matches, "export to indented-text"
+
+        assert verify_document(document, gt_prefix + ".json", GENERATE)
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None

+    # Valid WebVTT
+    buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
+    stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
+    assert dci._guess_format(stream) == InputFormat.VTT
+
    # Valid Docling JSON
    test_str = '{"name": ""}'
    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))