fix: Proper heading support in rich tables for HTML backend (#2394)

* Fix for the proper headers support in rich tables in HTML

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Compatibility with older Python versions

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixing Furniture before the first heading rule

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added minimalistic test case

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* added html for the test

  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-07 15:57:32 +02:00
parent 8a4b946a1a
commit 9705f4020c
5 changed files with 267 additions and 15 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        # set default content layer
-        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+
+        # Furniture before the first heading rule, except for headers in tables
+        header = None
+        # Find all headers first
+        all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+        # Keep only those that do NOT have a <table> in a parent chain
+        clean_headers = [h for h in all_headers if not h.find_parent("table")]
+        # Pick the first header from the remaining
+        if len(clean_headers):
+            header = clean_headers[0]
+        # Set starting content layer
        self.content_layer = (
-            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            ContentLayer.BODY if header is None else ContentLayer.FURNITURE
        )
        # reset context
        self.ctx = _Context()
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        group_name: str,
        doc: DoclingDocument,
        docling_table: TableItem,
-    ) -> tuple[bool, RefItem]:
+    ) -> tuple[bool, Union[RefItem, None]]:
        rich_table_cell = False
-        ref_for_rich_cell = provs_in_cell[0]
+        ref_for_rich_cell = None
+        if len(provs_in_cell) > 0:
+            ref_for_rich_cell = provs_in_cell[0]
        if len(provs_in_cell) > 1:
            # Cell has multiple elements, we need to group them
            rich_table_cell = True
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            if isinstance(pr_item, TextItem):
                # Cell has only one element and it's just a text
                rich_table_cell = False
-                doc.delete_items(node_items=[pr_item])
+                try:
+                    doc.delete_items(node_items=[pr_item])
+                except Exception as e:
+                    _log.error(f"Error while making rich table: {e}.")
            else:
                rich_table_cell = True
                ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

                provs_in_cell: list[RefItem] = []
                # Parse table cell sub-tree for Rich Cells content:
+                table_level = self.level
                provs_in_cell = self._walk(html_cell, doc)
+                # After walking sub-tree in cell, restore previously set level
+                self.level = table_level

                rich_table_cell = False
                ref_for_rich_cell = None
-                if len(provs_in_cell) > 0:
-                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
-                    rich_table_cell, ref_for_rich_cell = (
-                        HTMLDocumentBackend.process_rich_table_cells(
-                            provs_in_cell, group_name, doc, docling_table
-                        )
+                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
+                rich_table_cell, ref_for_rich_cell = (
+                    HTMLDocumentBackend.process_rich_table_cells(
+                        provs_in_cell, group_name, doc, docling_table
                    )
+                )

                # Extracting text
                text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            for key in self.parents.keys():
                self.parents[key] = None
            self.level = 0
-            docling_title = self.parents[self.level + 1] = doc.add_title(
+            self.parents[self.level + 1] = doc.add_title(
                text_clean,
                content_layer=self.content_layer,
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
-            added_ref = [docling_title.get_ref()]
+            p1 = self.parents[self.level + 1]
+            if p1 is not None:
+                added_ref = [p1.get_ref()]
        # the other levels need to be lowered by 1 if a title was set
        else:
            level -= 1
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        _log.debug(f"Remove the tail of level {key}")
                        self.parents[key] = None
                self.level = level
-            docling_heading = self.parents[self.level + 1] = doc.add_heading(
+            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text_clean,
                orig=annotated_text.text,
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
-            added_ref = [docling_heading.get_ref()]
+            p2 = self.parents[self.level + 1]
+            if p2 is not None:
+                added_ref = [p2.get_ref()]
        self.level += 1
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
--- a/tests/data/groundtruth/docling_v2/table_with_heading.html.itxt
+++ b/tests/data/groundtruth/docling_v2/table_with_heading.html.itxt
@@ -0,0 +1,4 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: text: Before tha table
+  item-2 at level 1: table with [2x2]
+  item-3 at level 1: text: After the table
--- a/tests/data/groundtruth/docling_v2/table_with_heading.html.json
+++ b/tests/data/groundtruth/docling_v2/table_with_heading.html.json
@@ -0,0 +1,197 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.7.0",
+  "name": "table_with_heading",
+  "origin": {
+    "mimetype": "text/html",
+    "binary_hash": 5578561753677933781,
+    "filename": "table_with_heading.html"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/tables/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Before tha table",
+      "text": "Before tha table"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "After the table",
+      "text": "After the table"
+    }
+  ],
+  "pictures": [],
+  "tables": [
+    {
+      "self_ref": "#/tables/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "table",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "data": {
+        "table_cells": [
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "A",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false,
+            "fillable": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "B",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false,
+            "fillable": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "1...",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false,
+            "fillable": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "2...",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false,
+            "fillable": false
+          }
+        ],
+        "num_rows": 2,
+        "num_cols": 2,
+        "grid": [
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "A",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false,
+              "fillable": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "B",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false,
+              "fillable": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "1...",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false,
+              "fillable": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "2...",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false,
+              "fillable": false
+            }
+          ]
+        ]
+      },
+      "annotations": []
+    }
+  ],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/table_with_heading.html.md
+++ b/tests/data/groundtruth/docling_v2/table_with_heading.html.md
@@ -0,0 +1,7 @@
+Before tha table
+
+| A    | B    |
+|------|------|
+| 1... | 2... |
+
+After the table
--- a/tests/data/html/table_with_heading.html
+++ b/tests/data/html/table_with_heading.html
@@ -0,0 +1,23 @@
+<html>
+    <head>
+        <style>
+            table, th, td {border: 1px solid black; border-collapse: collapse;}
+            td {padding:30px;}
+            table {margin: 30px;}
+        </style>
+    </head>
+    <body>
+        <p>Before tha table</p>
+        <table>
+            <tr>
+                <td><h1>A</h1></td>
+                <td>B</td>
+            </tr>
+            <tr>
+                <td>1...</td>
+                <td>2...</td>
+            </tr>
+        </table>
+        After the table
+    </body>
+</html>