fix(html): prevent hierarchy reset in rich table cells (#2716)

* fix(html): restore parents after rich cell walking

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

* fix(html): add table cell context manager, update tests

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

* fix(html): table with heading test data

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

---------

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>
This commit is contained in:
Matvei Smirnov
2025-12-03 20:52:23 +03:00
committed by GitHub
parent c97715f5fd
commit aebe25cf00
33 changed files with 373 additions and 32 deletions

View File

@@ -459,10 +459,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
rich_table_cell = self._is_rich_table_cell(html_cell) rich_table_cell = self._is_rich_table_cell(html_cell)
if rich_table_cell: if rich_table_cell:
# Parse table cell sub-tree for Rich Cells content: # Parse table cell sub-tree for Rich Cells content:
table_level = self.level with self._use_table_cell_context():
provs_in_cell = self._walk(html_cell, doc) provs_in_cell = self._walk(html_cell, doc)
# After walking sub-tree in cell, restore previously set level
self.level = table_level
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}" group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
rich_table_cell, ref_for_rich_cell = ( rich_table_cell, ref_for_rich_cell = (
@@ -829,6 +827,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level -= 1 self.level -= 1
self.content_layer = current_layer self.content_layer = current_layer
@contextmanager
def _use_table_cell_context(self):
    """Shield the surrounding hierarchy state while a table cell is walked.

    On entry, the current ``self.level`` and a shallow copy of
    ``self.parents`` are captured. The body of the ``with`` statement is
    free to change both attributes. On exit, including when the body
    raises, ``self.level`` is reset to the captured value and
    ``self.parents`` is rebound to the captured copy.
    """
    # Snapshot first: level, then a shallow copy of the parents mapping.
    saved_level, saved_parents = self.level, self.parents.copy()
    try:
        yield
    finally:
        # Roll back in the same order the state was captured.
        self.level, self.parents = saved_level, saved_parents
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
added_ref = [] added_ref = []
tag_name = tag.name.lower() tag_name = tag.name.lower()

View File

@@ -1,10 +1,10 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_01", "name": "example_01",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",
"binary_hash": 13726679883013609282, "binary_hash": 3245959421868226348,
"filename": "example_01.html" "filename": "example_01.html"
}, },
"furniture": { "furniture": {

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_02", "name": "example_02",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_03", "name": "example_03",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_04", "name": "example_04",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_05", "name": "example_05",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_06", "name": "example_06",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_07", "name": "example_07",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "example_08", "name": "example_08",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "formatting", "name": "formatting",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "html_code_snippets", "name": "html_code_snippets",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "html_rich_table_cells", "name": "html_rich_table_cells",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "hyperlink_01", "name": "hyperlink_01",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "hyperlink_02", "name": "hyperlink_02",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "hyperlink_03", "name": "hyperlink_03",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "hyperlink_04", "name": "hyperlink_04",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "hyperlink_05", "name": "hyperlink_05",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_01", "name": "table_01",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_02", "name": "table_02",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_03", "name": "table_03",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_04", "name": "table_04",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_05", "name": "table_05",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_06", "name": "table_06",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,11 +1,11 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "table_with_heading", "name": "table_with_heading_01",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",
"binary_hash": 5578561753677933781, "binary_hash": 5578561753677933781,
"filename": "table_with_heading.html" "filename": "table_with_heading_01.html"
}, },
"furniture": { "furniture": {
"self_ref": "#/furniture", "self_ref": "#/furniture",

View File

@@ -0,0 +1,9 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Main Title
item-2 at level 2: text: Before the table
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_1_0_0
item-5 at level 4: section_header: A
item-6 at level 4: text: text
item-7 at level 2: section_header: Section After
item-8 at level 3: text: After the table

View File

@@ -0,0 +1,280 @@
{
"schema_name": "DoclingDocument",
"version": "1.8.0",
"name": "table_with_heading_02",
"origin": {
"mimetype": "text/html",
"binary_hash": 5824324295334010827,
"filename": "table_with_heading_02.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_0",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Main Title",
"text": "Main Title"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Before the table",
"text": "Before the table"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "A",
"text": "A",
"level": 1
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "text",
"text": "text"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section After",
"text": "Section After",
"level": 1
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/texts/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After the table",
"text": "After the table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A\ntext",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/0"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A\ntext",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,11 @@
# Main Title
Before the table
| ## A text | B |
|--------------|------|
| 1... | 2... |
## Section After
After the table

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "unit_test_01", "name": "unit_test_01",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.7.0", "version": "1.8.0",
"name": "wiki_duck", "name": "wiki_duck",
"origin": { "origin": {
"mimetype": "text/html", "mimetype": "text/html",

View File

@@ -0,0 +1,28 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Main Title</h1>
<p>Before the table</p>
<table>
<tr>
<td>
<h2>A</h2>
<p>text</p>
</td>
<td>B</td>
</tr>
<tr>
<td>1...</td>
<td>2...</td>
</tr>
</table>
<h2>Section After</h2>
After the table
</body>
</html>