Files
docling/tests/data/groundtruth/docling_v2/formatting.html.json
Maxim Lysak c803abed9a feat: Rich tables support for HTML backend (#2324)
* Rich tables support for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Decoupling JATS backend from HTML backend, ways of creating tables changed significantly

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* updated and added tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Refactored parse_table_data in html_backend into a few smaller functions

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Changing scope of a few functions in html_backend.py, making them static when possible

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fix for HTML tables that have tbody and/or thead; these tables are now also properly supported

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-09-29 18:12:16 +02:00

1219 lines
25 KiB
JSON
Vendored

{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "formatting",
"origin": {
"mimetype": "text/html",
"binary_hash": 5449390083302584711,
"filename": "formatting.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
},
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/17"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/25"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/26"
},
{
"$ref": "#/texts/27"
},
{
"$ref": "#/texts/28"
},
{
"$ref": "#/texts/29"
},
{
"$ref": "#/texts/30"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/32"
},
{
"$ref": "#/texts/33"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/36"
},
{
"$ref": "#/texts/37"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/39"
},
{
"$ref": "#/texts/40"
},
{
"$ref": "#/texts/41"
},
{
"$ref": "#/texts/42"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/10",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/44"
},
{
"$ref": "#/texts/45"
},
{
"$ref": "#/texts/46"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/11",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/47"
},
{
"$ref": "#/texts/48"
},
{
"$ref": "#/texts/49"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/12",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/51"
},
{
"$ref": "#/texts/52"
},
{
"$ref": "#/texts/53"
},
{
"$ref": "#/texts/54"
},
{
"$ref": "#/texts/55"
},
{
"$ref": "#/texts/56"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "HTML Formatting Tags Demo",
"text": "HTML Formatting Tags Demo"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/3"
},
{
"$ref": "#/groups/4"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/groups/6"
},
{
"$ref": "#/texts/31"
},
{
"$ref": "#/groups/7"
},
{
"$ref": "#/texts/34"
},
{
"$ref": "#/texts/35"
},
{
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/38"
},
{
"$ref": "#/groups/9"
},
{
"$ref": "#/texts/43"
},
{
"$ref": "#/groups/10"
},
{
"$ref": "#/groups/11"
},
{
"$ref": "#/texts/50"
},
{
"$ref": "#/groups/12"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "HTML Text Formatting Examples",
"text": "HTML Text Formatting Examples"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is a",
"text": "This is a"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold (b)",
"text": "bold (b)",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "example and right next to it we have a",
"text": "example and right next to it we have a"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "strong emphasis (strong)",
"text": "strong emphasis (strong)",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Notice that",
"text": "Notice that"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "strong + bold mixed",
"text": "strong + bold mixed",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "looks similar but carries additional semantic meaning.",
"text": "looks similar but carries additional semantic meaning."
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Here is an",
"text": "Here is an"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic (i)",
"text": "italic (i)",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "word and an",
"text": "word and an"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "emphasis (em)",
"text": "emphasis (em)",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "example.",
"text": "example."
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Sometimes we combine them like",
"text": "Sometimes we combine them like"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic + emphasis together",
"text": "italic + emphasis together",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Now let's look at text that appears crossed out:",
"text": "Now let's look at text that appears crossed out:"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "strikethrough with s",
"text": "strikethrough with s",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": true,
"script": "baseline"
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "and",
"text": "and"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "deleted with del",
"text": "deleted with del",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": true,
"script": "baseline"
}
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "You can also mix them:",
"text": "You can also mix them:"
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "double strikethrough (s + del)",
"text": "double strikethrough (s + del)",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": true,
"script": "baseline"
}
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "To highlight insertions or underlines:",
"text": "To highlight insertions or underlines:"
},
{
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underlined with u",
"text": "underlined with u",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ",",
"text": ","
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "inserted with ins",
"text": "inserted with ins",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "A combination could be:",
"text": "A combination could be:"
},
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline + insertion together",
"text": "underline + insertion together",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Subscript and superscript examples:",
"text": "Subscript and superscript examples:"
},
{
"self_ref": "#/texts/35",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Water is written as H",
"text": "Water is written as H"
},
{
"self_ref": "#/texts/36",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2",
"text": "2",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "sub"
}
},
{
"self_ref": "#/texts/37",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "O using sub.",
"text": "O using sub."
},
{
"self_ref": "#/texts/38",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The mathematical expression x",
"text": "The mathematical expression x"
},
{
"self_ref": "#/texts/39",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2",
"text": "2",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "super"
}
},
{
"self_ref": "#/texts/40",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "+ y",
"text": "+ y"
},
{
"self_ref": "#/texts/41",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "3",
"text": "3",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "super"
}
},
{
"self_ref": "#/texts/42",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "uses sup.",
"text": "uses sup."
},
{
"self_ref": "#/texts/43",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "They can also be combined: CO",
"text": "They can also be combined: CO"
},
{
"self_ref": "#/texts/44",
"parent": {
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2",
"text": "2",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "sub"
}
},
{
"self_ref": "#/texts/45",
"parent": {
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "*",
"text": "*",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "super"
}
},
{
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mixing several: This sentence has",
"text": "Mixing several: This sentence has"
},
{
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "strong + emphasis",
"text": "strong + emphasis",
"formatting": {
"bold": true,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/49",
"parent": {
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ",",
"text": ","
},
{
"self_ref": "#/texts/50",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "some",
"text": "some"
},
{
"self_ref": "#/texts/51",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold + underline",
"text": "bold + underline",
"formatting": {
"bold": true,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ", and a formula like a",
"text": ", and a formula like a"
},
{
"self_ref": "#/texts/53",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2",
"text": "2",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "super"
}
},
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "+ b",
"text": "+ b"
},
{
"self_ref": "#/texts/55",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "3",
"text": "3",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "sub"
}
},
{
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}