feat: Rich tables for MSWord backend (#2291)

* Adding support of rich table cells to MSWord backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixes for properly accounting lists, pictures and headers in rich table cells

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Cleaned up msword backend, re-generated docx tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added detection of simple table cells in word backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Cleaned up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2025-09-22 16:41:59 +02:00
committed by GitHub
parent 46efaaefee
commit e2482a2ada
27 changed files with 1103 additions and 787 deletions

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "word_sample",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -98,7 +98,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Summer activities",
"text": "Summer activities",
@@ -142,7 +142,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Duck",
"text": "Duck",
@@ -161,7 +161,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Figure 1: This is a cute duckling",
"text": "Figure 1: This is a cute duckling",
@@ -212,7 +212,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
"text": "To get started with swimming, first lay down in a water and try not to drown:",
@@ -294,7 +294,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Also, dont forget:",
"text": "Also, dont forget:",
@@ -376,7 +376,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Hmm, what else…",
"text": "Hmm, what else…",
@@ -430,7 +430,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "After we had a good day of swimming in the lake, its important to eat something nice",
"text": "After we had a good day of swimming in the lake, its important to eat something nice",
@@ -449,7 +449,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "I like to eat leaves",
"text": "I like to eat leaves",
@@ -468,7 +468,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Here are some interesting things a respectful duck could eat:",
"text": "Here are some interesting things a respectful duck could eat:",
@@ -487,7 +487,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -499,7 +499,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "And lets add another list in the end:",
"text": "And lets add another list in the end:",
@@ -625,7 +625,8 @@
"text": "",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -637,7 +638,8 @@
"text": "Food",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -649,7 +651,8 @@
"text": "Calories per portion",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -661,7 +664,8 @@
"text": "Leaves",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -673,7 +677,8 @@
"text": "Ash, Elm, Maple",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -685,7 +690,8 @@
"text": "50",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -697,7 +703,8 @@
"text": "Berries",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -709,7 +716,8 @@
"text": "Blueberry, Strawberry, Cranberry",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -721,7 +729,8 @@
"text": "150",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -733,7 +742,8 @@
"text": "Grain",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -745,7 +755,8 @@
"text": "Corn, Buckwheat, Barley",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -757,7 +768,8 @@
"text": "200",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -774,7 +786,8 @@
"text": "",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -786,7 +799,8 @@
"text": "Food",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -798,7 +812,8 @@
"text": "Calories per portion",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -812,7 +827,8 @@
"text": "Leaves",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -824,7 +840,8 @@
"text": "Ash, Elm, Maple",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -836,7 +853,8 @@
"text": "50",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -850,7 +868,8 @@
"text": "Berries",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -862,7 +881,8 @@
"text": "Blueberry, Strawberry, Cranberry",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -874,7 +894,8 @@
"text": "150",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -888,7 +909,8 @@
"text": "Grain",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -900,7 +922,8 @@
"text": "Corn, Buckwheat, Barley",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -912,7 +935,8 @@
"text": "200",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]