mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: add a backend parser for WebVTT files (#2288)
* feat: add a backend parser for WebVTT files Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: update README with VTT support Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: add description to supported formats Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade docling-core to unescape WebVTT in markdown Pin the new release of docling-core 2.48.2. Do not escape HTML reserved characters when exporting WebVTT documents to markdown. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test: add missing copyright notice Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
b5628f1227
commit
46efaaefee
66
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
vendored
Normal file
66
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 00:11.000 --> 00:13.000
|
||||
item-3 at level 2: inline: group WebVTT cue voice span
|
||||
item-4 at level 3: text: Roger Bingham:
|
||||
item-5 at level 3: text: We are in New York City
|
||||
item-6 at level 1: section: group WebVTT cue block
|
||||
item-7 at level 2: text: 00:13.000 --> 00:16.000
|
||||
item-8 at level 2: inline: group WebVTT cue voice span
|
||||
item-9 at level 3: text: Roger Bingham:
|
||||
item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
|
||||
item-11 at level 1: section: group WebVTT cue block
|
||||
item-12 at level 2: text: 00:16.000 --> 00:18.000
|
||||
item-13 at level 2: inline: group WebVTT cue voice span
|
||||
item-14 at level 3: text: Roger Bingham:
|
||||
item-15 at level 3: text: from the American Museum of Natural History
|
||||
item-16 at level 1: section: group WebVTT cue block
|
||||
item-17 at level 2: text: 00:18.000 --> 00:20.000
|
||||
item-18 at level 2: inline: group WebVTT cue voice span
|
||||
item-19 at level 3: text: Roger Bingham:
|
||||
item-20 at level 3: text: And with me is Neil deGrasse Tyson
|
||||
item-21 at level 1: section: group WebVTT cue block
|
||||
item-22 at level 2: text: 00:20.000 --> 00:22.000
|
||||
item-23 at level 2: inline: group WebVTT cue voice span
|
||||
item-24 at level 3: text: Roger Bingham:
|
||||
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
|
||||
item-26 at level 1: section: group WebVTT cue block
|
||||
item-27 at level 2: text: 00:22.000 --> 00:24.000
|
||||
item-28 at level 2: inline: group WebVTT cue voice span
|
||||
item-29 at level 3: text: Roger Bingham:
|
||||
item-30 at level 3: text: at the AMNH.
|
||||
item-31 at level 1: section: group WebVTT cue block
|
||||
item-32 at level 2: text: 00:24.000 --> 00:26.000
|
||||
item-33 at level 2: inline: group WebVTT cue voice span
|
||||
item-34 at level 3: text: Roger Bingham:
|
||||
item-35 at level 3: text: Thank you for walking down here.
|
||||
item-36 at level 1: section: group WebVTT cue block
|
||||
item-37 at level 2: text: 00:27.000 --> 00:30.000
|
||||
item-38 at level 2: inline: group WebVTT cue voice span
|
||||
item-39 at level 3: text: Roger Bingham:
|
||||
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
|
||||
item-41 at level 1: section: group WebVTT cue block
|
||||
item-42 at level 2: text: 00:30.000 --> 00:31.500
|
||||
item-43 at level 2: inline: group WebVTT cue voice span
|
||||
item-44 at level 3: text: Roger Bingham:
|
||||
item-45 at level 3: text: When we e-mailed—
|
||||
item-46 at level 1: section: group WebVTT cue block
|
||||
item-47 at level 2: text: 00:30.500 --> 00:32.500
|
||||
item-48 at level 2: inline: group WebVTT cue voice span
|
||||
item-49 at level 3: text: Neil deGrasse Tyson:
|
||||
item-50 at level 3: text: Didn’t we talk about enough in that conversation?
|
||||
item-51 at level 1: section: group WebVTT cue block
|
||||
item-52 at level 2: text: 00:32.000 --> 00:35.500
|
||||
item-53 at level 2: inline: group WebVTT cue voice span
|
||||
item-54 at level 3: text: Roger Bingham:
|
||||
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
|
||||
item-56 at level 1: section: group WebVTT cue block
|
||||
item-57 at level 2: text: 00:32.500 --> 00:33.500
|
||||
item-58 at level 2: inline: group WebVTT cue voice span
|
||||
item-59 at level 3: text: Neil deGrasse Tyson:
|
||||
item-60 at level 3: text: Laughs
|
||||
item-61 at level 1: section: group WebVTT cue block
|
||||
item-62 at level 2: text: 00:35.500 --> 00:38.000
|
||||
item-63 at level 2: inline: group WebVTT cue voice span
|
||||
item-64 at level 3: text: Roger Bingham:
|
||||
item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
|
||||
1074
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
vendored
Normal file
1074
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
51
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
vendored
Normal file
51
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
00:11.000 --> 00:13.000
|
||||
|
||||
Roger Bingham: We are in New York City
|
||||
|
||||
00:13.000 --> 00:16.000
|
||||
|
||||
Roger Bingham: We’re actually at the Lucern Hotel, just down the street
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
|
||||
Roger Bingham: from the American Museum of Natural History
|
||||
|
||||
00:18.000 --> 00:20.000
|
||||
|
||||
Roger Bingham: And with me is Neil deGrasse Tyson
|
||||
|
||||
00:20.000 --> 00:22.000
|
||||
|
||||
Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
|
||||
|
||||
00:22.000 --> 00:24.000
|
||||
|
||||
Roger Bingham: at the AMNH.
|
||||
|
||||
00:24.000 --> 00:26.000
|
||||
|
||||
Roger Bingham: Thank you for walking down here.
|
||||
|
||||
00:27.000 --> 00:30.000
|
||||
|
||||
Roger Bingham: And I want to do a follow-up on the last conversation we did.
|
||||
|
||||
00:30.000 --> 00:31.500
|
||||
|
||||
Roger Bingham: When we e-mailed—
|
||||
|
||||
00:30.500 --> 00:32.500
|
||||
|
||||
Neil deGrasse Tyson: Didn’t we talk about enough in that conversation?
|
||||
|
||||
00:32.000 --> 00:35.500
|
||||
|
||||
Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
|
||||
|
||||
00:32.500 --> 00:33.500
|
||||
|
||||
Neil deGrasse Tyson: *Laughs*
|
||||
|
||||
00:35.500 --> 00:38.000
|
||||
|
||||
Roger Bingham: You know I’m so excited my glasses are falling off here.
|
||||
22
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
vendored
Normal file
22
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 00:00.000 --> 00:02.000
|
||||
item-3 at level 2: inline: group WebVTT cue voice span
|
||||
item-4 at level 3: text: Esme (first, loud):
|
||||
item-5 at level 3: text: It’s a blue apple tree!
|
||||
item-6 at level 1: section: group WebVTT cue block
|
||||
item-7 at level 2: text: 00:02.000 --> 00:04.000
|
||||
item-8 at level 2: inline: group WebVTT cue voice span
|
||||
item-9 at level 3: text: Mary:
|
||||
item-10 at level 3: text: No way!
|
||||
item-11 at level 1: section: group WebVTT cue block
|
||||
item-12 at level 2: text: 00:04.000 --> 00:06.000
|
||||
item-13 at level 2: inline: group WebVTT cue voice span
|
||||
item-14 at level 3: text: Esme:
|
||||
item-15 at level 3: text: Hee!
|
||||
item-16 at level 2: text: laughter
|
||||
item-17 at level 1: section: group WebVTT cue block
|
||||
item-18 at level 2: text: 00:06.000 --> 00:08.000
|
||||
item-19 at level 2: inline: group WebVTT cue voice span
|
||||
item-20 at level 3: text: Mary (loud):
|
||||
item-21 at level 3: text: That’s awesome!
|
||||
376
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
vendored
Normal file
376
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
vendored
Normal file
@@ -0,0 +1,376 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"name": "webvtt_example_02",
|
||||
"origin": {
|
||||
"mimetype": "text/vtt",
|
||||
"binary_hash": 12867774546881601731,
|
||||
"filename": "webvtt_example_02.vtt"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/7"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:00.000 --> 00:02.000",
|
||||
"text": "00:00.000 --> 00:02.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Esme (first, loud): ",
|
||||
"text": "Esme (first, loud): "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "It’s a blue apple tree!",
|
||||
"text": "It’s a blue apple tree!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:02.000 --> 00:04.000",
|
||||
"text": "00:02.000 --> 00:04.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Mary: ",
|
||||
"text": "Mary: "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "No way!",
|
||||
"text": "No way!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:04.000 --> 00:06.000",
|
||||
"text": "00:04.000 --> 00:06.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Esme: ",
|
||||
"text": "Esme: "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Hee!",
|
||||
"text": "Hee!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "laughter",
|
||||
"text": "laughter",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:06.000 --> 00:08.000",
|
||||
"text": "00:06.000 --> 00:08.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Mary (loud): ",
|
||||
"text": "Mary (loud): "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "That’s awesome!",
|
||||
"text": "That’s awesome!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
17
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
vendored
Normal file
17
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
00:00.000 --> 00:02.000
|
||||
|
||||
Esme (first, loud): It’s a blue apple tree!
|
||||
|
||||
00:02.000 --> 00:04.000
|
||||
|
||||
Mary: No way!
|
||||
|
||||
00:04.000 --> 00:06.000
|
||||
|
||||
Esme: Hee!
|
||||
|
||||
*laughter*
|
||||
|
||||
00:06.000 --> 00:08.000
|
||||
|
||||
Mary (loud): That’s awesome!
|
||||
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
vendored
Normal file
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
|
||||
item-4 at level 2: inline: group WebVTT cue voice span
|
||||
item-5 at level 3: text: Speaker A:
|
||||
item-6 at level 3: text: OK, I think now we should be recording
|
||||
item-7 at level 1: section: group WebVTT cue block
|
||||
item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
|
||||
item-10 at level 2: inline: group WebVTT cue voice span
|
||||
item-11 at level 3: text: Speaker A:
|
||||
item-12 at level 3: text: properly.
|
||||
item-13 at level 1: section: group WebVTT cue block
|
||||
item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
|
||||
item-16 at level 2: text: Good.
|
||||
item-17 at level 1: section: group WebVTT cue block
|
||||
item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
|
||||
item-20 at level 2: inline: group WebVTT cue voice span
|
||||
item-21 at level 3: text: Speaker A:
|
||||
item-22 at level 3: text: Yeah.
|
||||
item-23 at level 1: section: group WebVTT cue block
|
||||
item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
|
||||
item-26 at level 2: inline: group WebVTT cue voice span
|
||||
item-27 at level 3: text: Speaker B:
|
||||
item-28 at level 3: text: I was also thinking.
|
||||
item-29 at level 1: section: group WebVTT cue block
|
||||
item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
|
||||
item-32 at level 2: inline: group WebVTT cue voice span
|
||||
item-33 at level 3: text: Speaker B:
|
||||
item-34 at level 3: text: Would be maybe good to create items,
|
||||
item-35 at level 1: section: group WebVTT cue block
|
||||
item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
|
||||
item-38 at level 2: inline: group WebVTT cue voice span
|
||||
item-39 at level 3: text: Speaker B:
|
||||
item-40 at level 3: text: some metadata, some options that can be specific.
|
||||
item-41 at level 1: section: group WebVTT cue block
|
||||
item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
|
||||
item-44 at level 2: inline: group WebVTT cue voice span
|
||||
item-45 at level 3: text: Speaker A:
|
||||
item-46 at level 3: text: Yeah, I mean I think you went even more than
|
||||
item-47 at level 1: section: group WebVTT cue block
|
||||
item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
|
||||
item-50 at level 2: inline: group WebVTT cue voice span
|
||||
item-51 at level 3: text: Speaker B:
|
||||
item-52 at level 3: text: But we preserved the atoms.
|
||||
item-53 at level 1: section: group WebVTT cue block
|
||||
item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
|
||||
item-56 at level 2: inline: group WebVTT cue voice span
|
||||
item-57 at level 3: text: Speaker A:
|
||||
item-58 at level 3: text: than me. I just opened the format.
|
||||
item-59 at level 1: section: group WebVTT cue block
|
||||
item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
|
||||
item-62 at level 2: inline: group WebVTT cue voice span
|
||||
item-63 at level 3: text: Speaker A:
|
||||
item-64 at level 3: text: give it a try, yeah.
|
||||
item-65 at level 1: section: group WebVTT cue block
|
||||
item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
|
||||
item-68 at level 2: inline: group WebVTT cue voice span
|
||||
item-69 at level 3: text: Speaker B:
|
||||
item-70 at level 3: text: Okay, talk to you later.
|
||||
item-71 at level 1: section: group WebVTT cue block
|
||||
item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
|
||||
item-74 at level 2: inline: group WebVTT cue voice span
|
||||
item-75 at level 3: text: Speaker A:
|
||||
item-76 at level 3: text: See you.
|
||||
1240
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
vendored
Normal file
1240
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
vendored
Normal file
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
|
||||
00:00:04.963 --> 00:00:08.571
|
||||
|
||||
Speaker A: OK, I think now we should be recording
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
|
||||
00:00:08.571 --> 00:00:09.403
|
||||
|
||||
Speaker A: properly.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
|
||||
00:00:10.683 --> 00:00:11.563
|
||||
|
||||
Good.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
|
||||
00:00:13.363 --> 00:00:13.803
|
||||
|
||||
Speaker A: Yeah.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
|
||||
00:00:49.603 --> 00:00:53.363
|
||||
|
||||
Speaker B: I was also thinking.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
|
||||
00:00:54.963 --> 00:01:02.072
|
||||
|
||||
Speaker B: Would be maybe good to create items,
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
|
||||
00:01:02.072 --> 00:01:06.811
|
||||
|
||||
Speaker B: some metadata, some options that can be specific.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
|
||||
00:01:10.243 --> 00:01:13.014
|
||||
|
||||
Speaker A: Yeah, I mean I think you went even more than
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
|
||||
00:01:10.563 --> 00:01:12.643
|
||||
|
||||
Speaker B: But we preserved the atoms.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
|
||||
00:01:13.014 --> 00:01:15.907
|
||||
|
||||
Speaker A: than me. I just opened the format.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
|
||||
00:01:50.222 --> 00:01:51.643
|
||||
|
||||
Speaker A: give it a try, yeah.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
|
||||
00:01:52.043 --> 00:01:55.043
|
||||
|
||||
Speaker B: Okay, talk to you later.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
|
||||
00:01:54.603 --> 00:01:55.283
|
||||
|
||||
Speaker A: See you.
|
||||
42
tests/data/webvtt/webvtt_example_01.vtt
vendored
Normal file
42
tests/data/webvtt/webvtt_example_01.vtt
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
WEBVTT
|
||||
|
||||
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
|
||||
|
||||
00:11.000 --> 00:13.000
|
||||
<v Roger Bingham>We are in New York City
|
||||
|
||||
00:13.000 --> 00:16.000
|
||||
<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
<v Roger Bingham>from the American Museum of Natural History
|
||||
|
||||
00:18.000 --> 00:20.000
|
||||
<v Roger Bingham>And with me is Neil deGrasse Tyson
|
||||
|
||||
00:20.000 --> 00:22.000
|
||||
<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
|
||||
|
||||
00:22.000 --> 00:24.000
|
||||
<v Roger Bingham>at the AMNH.
|
||||
|
||||
00:24.000 --> 00:26.000
|
||||
<v Roger Bingham>Thank you for walking down here.
|
||||
|
||||
00:27.000 --> 00:30.000
|
||||
<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
|
||||
|
||||
00:30.000 --> 00:31.500 align:right size:50%
|
||||
<v Roger Bingham>When we e-mailed—
|
||||
|
||||
00:30.500 --> 00:32.500 align:left size:50%
|
||||
<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
|
||||
|
||||
00:32.000 --> 00:35.500 align:right size:50%
|
||||
<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
|
||||
|
||||
00:32.500 --> 00:33.500 align:left size:50%
|
||||
<v Neil deGrasse Tyson><i>Laughs</i>
|
||||
|
||||
00:35.500 --> 00:38.000
|
||||
<v Roger Bingham>You know I’m so excited my glasses are falling off here.
|
||||
15
tests/data/webvtt/webvtt_example_02.vtt
vendored
Normal file
15
tests/data/webvtt/webvtt_example_02.vtt
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
WEBVTT
|
||||
|
||||
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
<v.first.loud Esme>It’s a blue apple tree!
|
||||
|
||||
00:02.000 --> 00:04.000
|
||||
<v Mary>No way!
|
||||
|
||||
00:04.000 --> 00:06.000
|
||||
<v Esme>Hee!</v> <i>laughter</i>
|
||||
|
||||
00:06.000 --> 00:08.000
|
||||
<v.loud Mary>That’s awesome!
|
||||
57
tests/data/webvtt/webvtt_example_03.vtt
vendored
Normal file
57
tests/data/webvtt/webvtt_example_03.vtt
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
WEBVTT
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
00:00:04.963 --> 00:00:08.571
|
||||
<v Speaker A>OK,
|
||||
I think now we should be recording</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
00:00:08.571 --> 00:00:09.403
|
||||
<v Speaker A>properly.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
00:00:10.683 --> 00:00:11.563
|
||||
Good.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
00:00:13.363 --> 00:00:13.803
|
||||
<v Speaker A>Yeah.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
00:00:49.603 --> 00:00:53.363
|
||||
<v Speaker B>I was also thinking.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
00:00:54.963 --> 00:01:02.072
|
||||
<v Speaker B>Would be maybe good to create items,</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
00:01:02.072 --> 00:01:06.811
|
||||
<v Speaker B>some metadata,
|
||||
some options that can be specific.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
00:01:10.243 --> 00:01:13.014
|
||||
<v Speaker A>Yeah,
|
||||
I mean I think you went even more than</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
00:01:10.563 --> 00:01:12.643
|
||||
<v Speaker B>But we preserved the atoms.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
00:01:13.014 --> 00:01:15.907
|
||||
<v Speaker A>than me.
|
||||
I just opened the format.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
00:01:50.222 --> 00:01:51.643
|
||||
<v Speaker A>give it a try, yeah.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
00:01:52.043 --> 00:01:55.043
|
||||
<v Speaker B>Okay, talk to you later.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
00:01:54.603 --> 00:01:55.283
|
||||
<v Speaker A>See you.</v>
|
||||
232
tests/test_backend_vtt.py
Normal file
232
tests/test_backend_vtt.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# Assisted by watsonx Code Assistant
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.webvtt_backend import (
|
||||
_WebVTTCueItalicSpan,
|
||||
_WebVTTCueTextSpan,
|
||||
_WebVTTCueTimings,
|
||||
_WebVTTCueVoiceSpan,
|
||||
_WebVTTFile,
|
||||
_WebVTTTimestamp,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
# Module-level alias for the flag imported from .test_data_gen_flag.
# NOTE(review): presumably switches the end-to-end checks from verifying
# against the stored ground-truth files to regenerating them — confirm
# where GENERATE is consumed further down in this file.
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_vtt_cue_commponents():
    """Test WebVTT components.

    Exercises the building blocks of the WebVTT backend in sequence:
    timestamps, cue timings, cue text spans and cue voice spans, checking
    both accepted inputs and inputs that must raise ``ValidationError``.
    """
    # --- Valid timestamps, paired with their expected value in seconds.
    valid_timestamps = [
        "00:01:02.345",
        "12:34:56.789",
        "02:34.567",
        "00:00:00.000",
    ]
    valid_total_seconds = [
        1 * 60 + 2.345,
        12 * 3600 + 34 * 60 + 56.789,
        2 * 60 + 34.567,
        0.0,
    ]
    for ts, expected_seconds in zip(valid_timestamps, valid_total_seconds):
        model = _WebVTTTimestamp(raw=ts)
        # The value is a computed float; compare approximately so the test
        # does not depend on the order in which the parts are summed.
        assert model.seconds == pytest.approx(expected_seconds)

    # --- Invalid WebVTT timestamps.
    invalid_timestamps = [
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    ]
    for ts in invalid_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=ts)

    # --- The timestamp __str__ method.
    model = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(model) == "00:01:02.345"

    # --- Valid cue timings.
    start = _WebVTTTimestamp(raw="00:10.005")
    end = _WebVTTTimestamp(raw="00:14.007")
    cue_timings = _WebVTTCueTimings(start=start, end=end)
    assert cue_timings.start == start
    assert cue_timings.end == end
    assert str(cue_timings) == "00:10.005 --> 00:14.007"

    # --- Invalid cue timings with end timestamp before start.
    start = _WebVTTTimestamp(raw="00:10.700")
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start, end=end)
    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)

    # --- Invalid cue timings with missing end.
    start = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start)
    assert "Field required" in str(excinfo.value)

    # --- Invalid cue timings with missing start.
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(end=end)
    assert "Field required" in str(excinfo.value)

    # --- Cue text span with valid text.
    valid_text = "This is a valid cue text span."
    span = _WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text
    assert str(span) == valid_text

    # --- Cue text span containing newline characters.
    invalid_text = "This cue text span\ncontains a newline."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Cue text span containing an ampersand.
    invalid_text = "This cue text span contains &."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Cue text span containing a less-than sign.
    invalid_text = "This cue text span contains <."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Cue text span with empty text.
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text="")

    # --- Voice span annotation validation.
    valid_annotation = "valid-annotation"
    invalid_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)

    # --- Voice span classes validation.
    annotation = "speaker name"
    valid_classes = ["class1", "class2"]
    invalid_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)

    # --- Voice span components validation.
    annotation = "speaker name"
    valid_components = [_WebVTTCueTextSpan(text="random text")]
    invalid_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)

    # --- Valid cue voice spans, serialized back to WebVTT markup.
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )

    expected_str = "<v.loud.clear speaker>random text</v>"
    assert str(cue_span) == expected_str

    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v speaker>random text</v>"
    assert str(cue_span) == expected_str
|
||||
|
||||
|
||||
def test_webvtt_file():
    """Test parsing of the sample WebVTT files.

    Three fixtures are exercised in turn:

    * ``webvtt_example_01.vtt``: number of cue blocks and the nested span
      structure (voice span > italic span > text span) of one cue.
    * ``webvtt_example_02.vtt``: round trip, i.e. the file content must be
      reproduced by the fixed header/NOTE text followed by the string form
      of each parsed cue block.
    * ``webvtt_example_03.vtt``: cue identifiers, timings and payload types.
    """
    # --- Example 01: nested cue spans -------------------------------------
    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"

    # --- Example 02: round trip back to the original text -----------------
    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    # Only the cue blocks are re-serialized below, so the file header and
    # the NOTE block are spelled out literally here.
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse

    # --- Example 03: cue identifiers ---------------------------------------
    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
    block = vtt.cue_blocks[2]
    # The former ``isinstance(cue_span, _WebVTTCueVoiceSpan)`` check was
    # dropped here: ``cue_span`` still referred to a span from example 01,
    # so it verified nothing about this block. The payload type of this
    # block is asserted directly below.
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
|
||||
|
||||
|
||||
def test_e2e_vtt_conversions():
    """Convert every ``.vtt`` fixture and compare the exports to ground truth.

    Each file found under ``./tests/data/webvtt/`` is converted with a
    ``DocumentConverter`` restricted to ``InputFormat.VTT``. The Markdown
    export, the indented-text export and the document itself are then checked
    against the ``.md``, ``.itxt`` and ``.json`` files that share the fixture's
    name under ``groundtruth/docling_v2``. ``GENERATE`` is forwarded to the
    verification helpers.
    """
    data_dir = Path("./tests/data/webvtt/")
    fixtures = sorted(data_dir.rglob("*.vtt"))
    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])

    for vtt_path in fixtures:
        # Ground-truth files sit two levels above the fixture and share its name.
        gt_base = vtt_path.parent.parent.joinpath(
            "groundtruth", "docling_v2", vtt_path.name
        )

        result: ConversionResult = converter.convert(vtt_path)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown(escape_html=False)
        md_ok = verify_export(markdown, f"{gt_base}.md", generate=GENERATE)
        assert md_ok, "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_ok = verify_export(indented, f"{gt_base}.itxt", generate=GENERATE)
        assert itxt_ok, "export to indented-text"

        assert verify_document(document, f"{gt_base}.json", GENERATE)
|
||||
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
|
||||
doc_path.write_text("xyz", encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) is None
|
||||
|
||||
# Valid WebVTT
|
||||
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
|
||||
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.VTT
|
||||
|
||||
# Valid Docling JSON
|
||||
test_str = '{"name": ""}'
|
||||
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||
|
||||
Reference in New Issue
Block a user