fix: Proper heading support in rich tables for HTML backend (#2394)

* Fix for the proper headers support in rich tables in HTML Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaning up Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Compatibility with older Python versions Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixing Furniture before the first heading rule Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added minimalistic test case Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * added html for the test Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-10-07 15:57:32 +02:00
parent 8a4b946a1a
commit 9705f4020c
5 changed files with 267 additions and 15 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        # set default content layer
-        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+
+        # Furniture before the first heading rule, except for headers in tables
+        header = None
+        # Find all headers first
+        all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+        # Keep only those that do NOT have a <table> in a parent chain
+        clean_headers = [h for h in all_headers if not h.find_parent("table")]
+        # Pick the first header from the remaining
+        if len(clean_headers):
+            header = clean_headers[0]
+        # Set starting content layer
        self.content_layer = (
-            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            ContentLayer.BODY if header is None else ContentLayer.FURNITURE
        )
        # reset context
        self.ctx = _Context()
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        group_name: str,
        doc: DoclingDocument,
        docling_table: TableItem,
-    ) -> tuple[bool, RefItem]:
+    ) -> tuple[bool, Union[RefItem, None]]:
        rich_table_cell = False
-        ref_for_rich_cell = provs_in_cell[0]
+        ref_for_rich_cell = None
+        if len(provs_in_cell) > 0:
+            ref_for_rich_cell = provs_in_cell[0]
        if len(provs_in_cell) > 1:
            # Cell has multiple elements, we need to group them
            rich_table_cell = True
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            if isinstance(pr_item, TextItem):
                # Cell has only one element and it's just a text
                rich_table_cell = False
-                doc.delete_items(node_items=[pr_item])
+                try:
+                    doc.delete_items(node_items=[pr_item])
+                except Exception as e:
+                    _log.error(f"Error while making rich table: {e}.")
            else:
                rich_table_cell = True
                ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

                provs_in_cell: list[RefItem] = []
                # Parse table cell sub-tree for Rich Cells content:
+                table_level = self.level
                provs_in_cell = self._walk(html_cell, doc)
+                # After walking sub-tree in cell, restore previously set level
+                self.level = table_level

                rich_table_cell = False
                ref_for_rich_cell = None
-                if len(provs_in_cell) > 0:
-                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
-                    rich_table_cell, ref_for_rich_cell = (
-                        HTMLDocumentBackend.process_rich_table_cells(
-                            provs_in_cell, group_name, doc, docling_table
-                        )
+                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
+                rich_table_cell, ref_for_rich_cell = (
+                    HTMLDocumentBackend.process_rich_table_cells(
+                        provs_in_cell, group_name, doc, docling_table
                    )
+                )

                # Extracting text
                text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            for key in self.parents.keys():
                self.parents[key] = None
            self.level = 0
-            docling_title = self.parents[self.level + 1] = doc.add_title(
+            self.parents[self.level + 1] = doc.add_title(
                text_clean,
                content_layer=self.content_layer,
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
-            added_ref = [docling_title.get_ref()]
+            p1 = self.parents[self.level + 1]
+            if p1 is not None:
+                added_ref = [p1.get_ref()]
        # the other levels need to be lowered by 1 if a title was set
        else:
            level -= 1
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        _log.debug(f"Remove the tail of level {key}")
                        self.parents[key] = None
                self.level = level
-            docling_heading = self.parents[self.level + 1] = doc.add_heading(
+            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text_clean,
                orig=annotated_text.text,
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
-            added_ref = [docling_heading.get_ref()]
+            p2 = self.parents[self.level + 1]
+            if p2 is not None:
+                added_ref = [p2.get_ref()]
        self.level += 1
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):