feat: Rich tables support for HTML backend (#2324)

* Rich tables support for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Decoupling JATS backend from HTML backend, ways of creating tables changed significantly

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* updated and added tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Refactored parse_table_data in html_backend into few smaller functions

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Changing scope of few functions in html_backend.py, making them static, when possible

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fix for HTML tables that have tbody and/or thead, now these tables are also properly supported

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2025-09-29 18:12:16 +02:00
committed by GitHub
parent 325877aee9
commit c803abed9a
46 changed files with 9233 additions and 5815 deletions

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_03",
"origin": {
"mimetype": "text/html",
@@ -346,7 +346,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -358,7 +359,8 @@
"text": "Header 2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -370,7 +372,8 @@
"text": "Header 3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -382,7 +385,8 @@
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -394,7 +398,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -406,7 +411,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -418,7 +424,8 @@
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -430,7 +437,8 @@
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -442,7 +450,8 @@
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -454,7 +463,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -466,7 +476,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -478,7 +489,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -495,7 +507,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -507,7 +520,8 @@
"text": "Header 2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -519,7 +533,8 @@
"text": "Header 3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -533,7 +548,8 @@
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -545,7 +561,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -557,7 +574,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -571,7 +589,8 @@
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -583,7 +602,8 @@
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -595,7 +615,8 @@
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -609,7 +630,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -621,7 +643,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -633,7 +656,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]