mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
* feat: add a backend parser for WebVTT files Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: update README with VTT support Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * docs: add description to supported formats Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: upgrade docling-core to unescape WebVTT in markdown Pin the new release of docling-core 2.48.2. Do not escape HTML reserved characters when exporting WebVTT documents to markdown. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test: add missing copyright notice Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
376 lines
7.5 KiB
JSON
Vendored
376 lines
7.5 KiB
JSON
Vendored
{
|
||
"schema_name": "DoclingDocument",
|
||
"version": "1.6.0",
|
||
"name": "webvtt_example_02",
|
||
"origin": {
|
||
"mimetype": "text/vtt",
|
||
"binary_hash": 12867774546881601731,
|
||
"filename": "webvtt_example_02.vtt"
|
||
},
|
||
"furniture": {
|
||
"self_ref": "#/furniture",
|
||
"children": [],
|
||
"content_layer": "furniture",
|
||
"name": "_root_",
|
||
"label": "unspecified"
|
||
},
|
||
"body": {
|
||
"self_ref": "#/body",
|
||
"children": [
|
||
{
|
||
"$ref": "#/groups/0"
|
||
},
|
||
{
|
||
"$ref": "#/groups/2"
|
||
},
|
||
{
|
||
"$ref": "#/groups/4"
|
||
},
|
||
{
|
||
"$ref": "#/groups/6"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "_root_",
|
||
"label": "unspecified"
|
||
},
|
||
"groups": [
|
||
{
|
||
"self_ref": "#/groups/0",
|
||
"parent": {
|
||
"$ref": "#/body"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/0"
|
||
},
|
||
{
|
||
"$ref": "#/groups/1"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue block",
|
||
"label": "section"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/1",
|
||
"parent": {
|
||
"$ref": "#/groups/0"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/1"
|
||
},
|
||
{
|
||
"$ref": "#/texts/2"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue voice span",
|
||
"label": "inline"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/2",
|
||
"parent": {
|
||
"$ref": "#/body"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/3"
|
||
},
|
||
{
|
||
"$ref": "#/groups/3"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue block",
|
||
"label": "section"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/3",
|
||
"parent": {
|
||
"$ref": "#/groups/2"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/4"
|
||
},
|
||
{
|
||
"$ref": "#/texts/5"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue voice span",
|
||
"label": "inline"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/4",
|
||
"parent": {
|
||
"$ref": "#/body"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/6"
|
||
},
|
||
{
|
||
"$ref": "#/groups/5"
|
||
},
|
||
{
|
||
"$ref": "#/texts/9"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue block",
|
||
"label": "section"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/5",
|
||
"parent": {
|
||
"$ref": "#/groups/4"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/7"
|
||
},
|
||
{
|
||
"$ref": "#/texts/8"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue voice span",
|
||
"label": "inline"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/6",
|
||
"parent": {
|
||
"$ref": "#/body"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/10"
|
||
},
|
||
{
|
||
"$ref": "#/groups/7"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue block",
|
||
"label": "section"
|
||
},
|
||
{
|
||
"self_ref": "#/groups/7",
|
||
"parent": {
|
||
"$ref": "#/groups/6"
|
||
},
|
||
"children": [
|
||
{
|
||
"$ref": "#/texts/11"
|
||
},
|
||
{
|
||
"$ref": "#/texts/12"
|
||
}
|
||
],
|
||
"content_layer": "body",
|
||
"name": "WebVTT cue voice span",
|
||
"label": "inline"
|
||
}
|
||
],
|
||
"texts": [
|
||
{
|
||
"self_ref": "#/texts/0",
|
||
"parent": {
|
||
"$ref": "#/groups/0"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "00:00.000 --> 00:02.000",
|
||
"text": "00:00.000 --> 00:02.000"
|
||
},
|
||
{
|
||
"self_ref": "#/texts/1",
|
||
"parent": {
|
||
"$ref": "#/groups/1"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "Esme (first, loud): ",
|
||
"text": "Esme (first, loud): "
|
||
},
|
||
{
|
||
"self_ref": "#/texts/2",
|
||
"parent": {
|
||
"$ref": "#/groups/1"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "It’s a blue apple tree!",
|
||
"text": "It’s a blue apple tree!",
|
||
"formatting": {
|
||
"bold": false,
|
||
"italic": false,
|
||
"underline": false,
|
||
"strikethrough": false,
|
||
"script": "baseline"
|
||
}
|
||
},
|
||
{
|
||
"self_ref": "#/texts/3",
|
||
"parent": {
|
||
"$ref": "#/groups/2"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "00:02.000 --> 00:04.000",
|
||
"text": "00:02.000 --> 00:04.000"
|
||
},
|
||
{
|
||
"self_ref": "#/texts/4",
|
||
"parent": {
|
||
"$ref": "#/groups/3"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "Mary: ",
|
||
"text": "Mary: "
|
||
},
|
||
{
|
||
"self_ref": "#/texts/5",
|
||
"parent": {
|
||
"$ref": "#/groups/3"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "No way!",
|
||
"text": "No way!",
|
||
"formatting": {
|
||
"bold": false,
|
||
"italic": false,
|
||
"underline": false,
|
||
"strikethrough": false,
|
||
"script": "baseline"
|
||
}
|
||
},
|
||
{
|
||
"self_ref": "#/texts/6",
|
||
"parent": {
|
||
"$ref": "#/groups/4"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "00:04.000 --> 00:06.000",
|
||
"text": "00:04.000 --> 00:06.000"
|
||
},
|
||
{
|
||
"self_ref": "#/texts/7",
|
||
"parent": {
|
||
"$ref": "#/groups/5"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "Esme: ",
|
||
"text": "Esme: "
|
||
},
|
||
{
|
||
"self_ref": "#/texts/8",
|
||
"parent": {
|
||
"$ref": "#/groups/5"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "Hee!",
|
||
"text": "Hee!",
|
||
"formatting": {
|
||
"bold": false,
|
||
"italic": false,
|
||
"underline": false,
|
||
"strikethrough": false,
|
||
"script": "baseline"
|
||
}
|
||
},
|
||
{
|
||
"self_ref": "#/texts/9",
|
||
"parent": {
|
||
"$ref": "#/groups/4"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "laughter",
|
||
"text": "laughter",
|
||
"formatting": {
|
||
"bold": false,
|
||
"italic": true,
|
||
"underline": false,
|
||
"strikethrough": false,
|
||
"script": "baseline"
|
||
}
|
||
},
|
||
{
|
||
"self_ref": "#/texts/10",
|
||
"parent": {
|
||
"$ref": "#/groups/6"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "00:06.000 --> 00:08.000",
|
||
"text": "00:06.000 --> 00:08.000"
|
||
},
|
||
{
|
||
"self_ref": "#/texts/11",
|
||
"parent": {
|
||
"$ref": "#/groups/7"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "Mary (loud): ",
|
||
"text": "Mary (loud): "
|
||
},
|
||
{
|
||
"self_ref": "#/texts/12",
|
||
"parent": {
|
||
"$ref": "#/groups/7"
|
||
},
|
||
"children": [],
|
||
"content_layer": "body",
|
||
"label": "text",
|
||
"prov": [],
|
||
"orig": "That’s awesome!",
|
||
"text": "That’s awesome!",
|
||
"formatting": {
|
||
"bold": false,
|
||
"italic": false,
|
||
"underline": false,
|
||
"strikethrough": false,
|
||
"script": "baseline"
|
||
}
|
||
}
|
||
],
|
||
"pictures": [],
|
||
"tables": [],
|
||
"key_value_items": [],
|
||
"form_items": [],
|
||
"pages": {}
|
||
} |