fix(markdown): fix single-formatted headings & list items (#1820)

* fix(markdown): fix formatting & inline edge cases (show behavior before change)

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* add change and updated test data

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update lock

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* improve test case

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-06-25 13:05:06 +02:00
committed by GitHub
parent 41e8cae26b
commit 7c5614a37a
67 changed files with 2648 additions and 2351 deletions

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"version": "1.4.0",
"name": "textbox",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -36,10 +36,10 @@
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/2"
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
@@ -50,17 +50,14 @@
{
"$ref": "#/texts/12"
},
{
"$ref": "#/texts/13"
},
{
"$ref": "#/groups/3"
},
{
"$ref": "#/texts/15"
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/16"
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/4"
@@ -68,6 +65,9 @@
{
"$ref": "#/groups/6"
},
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
@@ -80,9 +80,6 @@
{
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/25"
},
{
"$ref": "#/groups/7"
},
@@ -90,11 +87,14 @@
"$ref": "#/groups/9"
},
{
"$ref": "#/texts/32"
"$ref": "#/texts/31"
},
{
"$ref": "#/groups/10"
},
{
"$ref": "#/texts/33"
},
{
"$ref": "#/texts/34"
},
@@ -114,10 +114,10 @@
"$ref": "#/texts/39"
},
{
"$ref": "#/texts/40"
"$ref": "#/groups/11"
},
{
"$ref": "#/groups/11"
"$ref": "#/texts/44"
},
{
"$ref": "#/texts/45"
@@ -125,56 +125,53 @@
{
"$ref": "#/texts/46"
},
{
"$ref": "#/texts/47"
},
{
"$ref": "#/groups/13"
},
{
"$ref": "#/texts/50"
"$ref": "#/texts/49"
},
{
"$ref": "#/groups/14"
},
{
"$ref": "#/texts/53"
"$ref": "#/texts/52"
},
{
"$ref": "#/texts/54"
"$ref": "#/texts/53"
},
{
"$ref": "#/groups/15"
},
{
"$ref": "#/texts/56"
"$ref": "#/texts/55"
},
{
"$ref": "#/groups/16"
},
{
"$ref": "#/texts/58"
"$ref": "#/texts/57"
},
{
"$ref": "#/texts/59"
"$ref": "#/texts/58"
},
{
"$ref": "#/groups/17"
},
{
"$ref": "#/texts/63"
"$ref": "#/texts/62"
},
{
"$ref": "#/groups/18"
},
{
"$ref": "#/texts/64"
},
{
"$ref": "#/texts/65"
},
{
"$ref": "#/texts/66"
},
{
"$ref": "#/texts/67"
}
],
"content_layer": "body",
@@ -223,7 +220,7 @@
},
"children": [
{
"$ref": "#/texts/9"
"$ref": "#/texts/8"
}
],
"content_layer": "body",
@@ -237,7 +234,7 @@
},
"children": [
{
"$ref": "#/texts/14"
"$ref": "#/texts/13"
}
],
"content_layer": "body",
@@ -254,7 +251,7 @@
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/19"
"$ref": "#/texts/18"
}
],
"content_layer": "body",
@@ -268,10 +265,10 @@
},
"children": [
{
"$ref": "#/texts/17"
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/18"
"$ref": "#/texts/17"
}
],
"content_layer": "body",
@@ -285,7 +282,7 @@
},
"children": [
{
"$ref": "#/texts/20"
"$ref": "#/texts/19"
}
],
"content_layer": "body",
@@ -299,16 +296,16 @@
},
"children": [
{
"$ref": "#/texts/26"
"$ref": "#/texts/25"
},
{
"$ref": "#/texts/27"
"$ref": "#/texts/26"
},
{
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/30"
"$ref": "#/texts/29"
}
],
"content_layer": "body",
@@ -322,10 +319,10 @@
},
"children": [
{
"$ref": "#/texts/28"
"$ref": "#/texts/27"
},
{
"$ref": "#/texts/29"
"$ref": "#/texts/28"
}
],
"content_layer": "body",
@@ -339,7 +336,7 @@
},
"children": [
{
"$ref": "#/texts/31"
"$ref": "#/texts/30"
}
],
"content_layer": "body",
@@ -353,7 +350,7 @@
},
"children": [
{
"$ref": "#/texts/33"
"$ref": "#/texts/32"
}
],
"content_layer": "body",
@@ -370,7 +367,7 @@
"$ref": "#/groups/12"
},
{
"$ref": "#/texts/44"
"$ref": "#/texts/43"
}
],
"content_layer": "body",
@@ -383,14 +380,14 @@
"$ref": "#/groups/11"
},
"children": [
{
"$ref": "#/texts/40"
},
{
"$ref": "#/texts/41"
},
{
"$ref": "#/texts/42"
},
{
"$ref": "#/texts/43"
}
],
"content_layer": "body",
@@ -404,10 +401,10 @@
},
"children": [
{
"$ref": "#/texts/48"
"$ref": "#/texts/47"
},
{
"$ref": "#/texts/49"
"$ref": "#/texts/48"
}
],
"content_layer": "body",
@@ -421,10 +418,10 @@
},
"children": [
{
"$ref": "#/texts/51"
"$ref": "#/texts/50"
},
{
"$ref": "#/texts/52"
"$ref": "#/texts/51"
}
],
"content_layer": "body",
@@ -438,7 +435,7 @@
},
"children": [
{
"$ref": "#/texts/55"
"$ref": "#/texts/54"
}
],
"content_layer": "body",
@@ -452,7 +449,7 @@
},
"children": [
{
"$ref": "#/texts/57"
"$ref": "#/texts/56"
}
],
"content_layer": "body",
@@ -465,14 +462,14 @@
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/59"
},
{
"$ref": "#/texts/60"
},
{
"$ref": "#/texts/61"
},
{
"$ref": "#/texts/62"
}
],
"content_layer": "body",
@@ -486,7 +483,7 @@
},
"children": [
{
"$ref": "#/texts/64"
"$ref": "#/texts/63"
}
],
"content_layer": "body",
@@ -510,7 +507,8 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
@@ -528,7 +526,8 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
@@ -558,7 +557,8 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
@@ -588,7 +588,8 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
@@ -600,12 +601,10 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "paragraph",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
"text": ""
},
{
"self_ref": "#/texts/7",
@@ -621,18 +620,6 @@
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/2"
},
@@ -646,9 +633,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/10",
"parent": {
@@ -687,18 +687,6 @@
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/3"
},
@@ -712,9 +700,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/15",
"parent": {
@@ -729,18 +730,6 @@
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/5"
},
@@ -754,13 +743,14 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/18",
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/5"
},
@@ -774,13 +764,14 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/4"
},
@@ -792,7 +783,7 @@
"text": ""
},
{
"self_ref": "#/texts/20",
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/6"
},
@@ -805,6 +796,18 @@
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/21",
"parent": {
@@ -855,18 +858,6 @@
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/7"
},
@@ -880,11 +871,12 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/27",
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/7"
},
@@ -898,11 +890,12 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/28",
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/groups/8"
},
@@ -916,13 +909,14 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/29",
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/8"
},
@@ -936,13 +930,14 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/30",
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/7"
},
@@ -954,7 +949,7 @@
"text": ""
},
{
"self_ref": "#/texts/31",
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/9"
},
@@ -968,7 +963,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/32",
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/body"
},
@@ -980,7 +975,7 @@
"text": ""
},
{
"self_ref": "#/texts/33",
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/groups/10"
},
@@ -994,9 +989,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/34",
"parent": {
@@ -1071,18 +1079,6 @@
},
{
"self_ref": "#/texts/40",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/41",
"parent": {
"$ref": "#/groups/12"
},
@@ -1096,11 +1092,12 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/42",
"self_ref": "#/texts/41",
"parent": {
"$ref": "#/groups/12"
},
@@ -1114,11 +1111,12 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/43",
"self_ref": "#/texts/42",
"parent": {
"$ref": "#/groups/12"
},
@@ -1132,13 +1130,26 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/43",
"parent": {
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/44",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
@@ -1173,18 +1184,6 @@
},
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/13"
},
@@ -1198,11 +1197,12 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/49",
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/13"
},
@@ -1214,7 +1214,7 @@
"text": ""
},
{
"self_ref": "#/texts/50",
"self_ref": "#/texts/49",
"parent": {
"$ref": "#/body"
},
@@ -1226,7 +1226,7 @@
"text": ""
},
{
"self_ref": "#/texts/51",
"self_ref": "#/texts/50",
"parent": {
"$ref": "#/groups/14"
},
@@ -1240,11 +1240,12 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/52",
"self_ref": "#/texts/51",
"parent": {
"$ref": "#/groups/14"
},
@@ -1258,9 +1259,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/53",
"parent": {
@@ -1275,18 +1289,6 @@
},
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/55",
"parent": {
"$ref": "#/groups/15"
},
@@ -1300,11 +1302,12 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/56",
"self_ref": "#/texts/55",
"parent": {
"$ref": "#/body"
},
@@ -1316,7 +1319,7 @@
"text": ""
},
{
"self_ref": "#/texts/57",
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/16"
},
@@ -1330,9 +1333,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/57",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/58",
"parent": {
@@ -1347,18 +1363,6 @@
},
{
"self_ref": "#/texts/59",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/60",
"parent": {
"$ref": "#/groups/17"
},
@@ -1372,11 +1376,12 @@
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/61",
"self_ref": "#/texts/60",
"parent": {
"$ref": "#/groups/17"
},
@@ -1388,7 +1393,7 @@
"text": ""
},
{
"self_ref": "#/texts/62",
"self_ref": "#/texts/61",
"parent": {
"$ref": "#/groups/17"
},
@@ -1402,11 +1407,12 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/63",
"self_ref": "#/texts/62",
"parent": {
"$ref": "#/body"
},
@@ -1418,7 +1424,7 @@
"text": ""
},
{
"self_ref": "#/texts/64",
"self_ref": "#/texts/63",
"parent": {
"$ref": "#/groups/18"
},
@@ -1432,9 +1438,22 @@
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/64",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/65",
"parent": {
@@ -1458,18 +1477,6 @@
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/67",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],