fix(html): prevent hierarchy reset in rich table cells (#2716)

* fix(html): restore parents after rich cell walking

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

* fix(html): add table cell context manager, update tests

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

* fix(html): table with heading test data

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>

---------

Signed-off-by: Matvei Smirnov <vdalekesmirnov@gmail.com>
This commit is contained in:
Matvei Smirnov
2025-12-03 20:52:23 +03:00
committed by GitHub
parent c97715f5fd
commit aebe25cf00
33 changed files with 373 additions and 32 deletions

View File

@@ -459,10 +459,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
rich_table_cell = self._is_rich_table_cell(html_cell)
if rich_table_cell:
# Parse table cell sub-tree for Rich Cells content:
table_level = self.level
provs_in_cell = self._walk(html_cell, doc)
# After walking sub-tree in cell, restore previously set level
self.level = table_level
with self._use_table_cell_context():
provs_in_cell = self._walk(html_cell, doc)
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
rich_table_cell, ref_for_rich_cell = (
@@ -829,6 +827,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level -= 1
self.content_layer = current_layer
@contextmanager
def _use_table_cell_context(self):
    """Run a block of table cell processing without leaking hierarchy changes.

    On entry, the current ``self.level`` and a copy of ``self.parents``
    (taken with ``.copy()``) are recorded. The wrapped block is free to
    change both. On exit the recorded values are written back, whether the
    block finished normally or raised.
    """
    # Snapshot before yielding: the copy keeps the saved parents independent
    # of in-place edits made to ``self.parents`` inside the block.
    saved_level, saved_parents = self.level, self.parents.copy()
    try:
        yield
    finally:
        # ``finally`` guarantees the restore also runs when the block raises.
        self.level, self.parents = saved_level, saved_parents
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
added_ref = []
tag_name = tag.name.lower()