fix(HTML): handle row headers like in pivot tables

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-07-26 20:14:47 +00:00 · 2025-05-06 17:32:30 +02:00 · 2025-05-06 17:32:30 +02:00 · 6395495824
commit 6395495824
parent e0b77e3173
2 changed files with 184 additions and 16 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@ -391,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            _log.debug(f"list-item has no text: {element}")
    @staticmethod
-    def parse_table_data(element: Tag) -> Optional[TableData]:
+    def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.debug("Skipping nested table.")
            return None
-        # Count the number of rows (number of <tr> elements)
+        # Find the number of rows and columns (taking into account spans)
-        num_rows = len(element("tr"))
+        num_rows = 0
        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in element("tr"):
            col_count = 0
            is_row_header = True
            if not isinstance(row, Tag):
                continue
            for cell in row(["td", "th"]):
                if not isinstance(row, Tag):
                    continue
-                val = cast(Tag, cell).get("colspan", "1")
+                cell_tag = cast(Tag, cell)
                val = cell_tag.get("colspan", "1")
                colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
                col_count += colspan
                if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
                    is_row_header = False
            num_cols = max(num_cols, col_count)
            if not is_row_header:
                num_rows += 1
        _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
        # Iterate over the rows in the table
-        for row_idx, row in enumerate(element("tr")):
+        start_row_span = 0
        row_idx = -1
        for row in element("tr"):
            if not isinstance(row, Tag):
                continue
            # For each row, find all the column cells (both <td> and <th>)
            cells = row(["td", "th"])
-            # Check if each cell in the row is a header -> means it is a column header
+            # Check if cell is in a column header or row header
            col_header = True
            row_header = True
            for html_cell in cells:
-                if isinstance(html_cell, Tag) and html_cell.name == "td":
+                if isinstance(html_cell, Tag):
-                    col_header = False
+                    if html_cell.name == "td":
                        col_header = False
                        row_header = False
                    elif html_cell.get("rowspan") is None:
                        row_header = False
            if not row_header:
                row_idx += 1
                start_row_span = 0
            else:
                start_row_span += 1
            # Extract the text content of each cell
            col_idx = 0
@ -461,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    if isinstance(row_val, str) and row_val.isnumeric()
                    else 1
                )
-
+                if row_header:
-                while grid[row_idx][col_idx] is not None:
+                    row_span -= 1
                while (
                    col_idx < num_cols
                    and grid[row_idx + start_row_span][col_idx] is not None
                ):
                    col_idx += 1
-                for r in range(row_span):
+                for r in range(start_row_span, start_row_span + row_span):
                    for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
+                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text
                table_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
+                    start_row_offset_idx=start_row_span + row_idx,
-                    end_row_offset_idx=row_idx + row_span,
+                    end_row_offset_idx=start_row_span + row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
--- a/tests/data/html/example_8.html
+++ b/tests/data/html/example_8.html
@ -0,0 +1,145 @@
 <!DOCTYPE html>
 <html>
 <head>
    <style>
        table,
        th,
        td {
            border: 1px solid black;
        }
    </style>
 </head>
 <body>
    <h2>Pivot table with 1 row header</h2>
    <table>
        <tr>
            <th>Year</th>
            <th>Month</th>
            <th>Revenue</th>
            <th>Cost</th>
        </tr>
        <tr>
            <th rowspan="6">2025</th>
        </tr>
        <tr>
            <td>January</td>
            <td>$134</td>
            <td>$162</td>
        </tr>
        <tr>
            <td>February</td>
            <td>$150</td>
            <td>$155</td>
        </tr>
        <tr>
            <td>March</td>
            <td>$160</td>
            <td>$143</td>
        </tr>
        <tr>
            <td>April</td>
            <td>$210</td>
            <td>$150</td>
        </tr>
        <tr>
            <td>May</td>
            <td>$280</td>
            <td>$120</td>
        </tr>
    </table>
    <h2>Pivot table with 2 row headers</h2>
    <table>
        <tr>
            <th>Year</th>
            <th>Quarter</th>
            <th>Month</th>
            <th>Revenue</th>
            <th>Cost</th>
        </tr>
        <tr>
            <th rowspan="7">2025</th>
            <th rowspan="4">Q1</th>
        </tr>
        <tr>
            <td>January</td>
            <td>$134</td>
            <td>$162</td>
        </tr>
        <tr>
            <td>February</td>
            <td>$150</td>
            <td>$155</td>
        </tr>
        <tr>
            <td>March</td>
            <td>$160</td>
            <td>$143</td>
        </tr>
        <tr>
            <th rowspan="3">Q2</th>
        </tr>
        <tr>
            <td>April</td>
            <td>$210</td>
            <td>$150</td>
        </tr>
        <tr>
            <td>May</td>
            <td>$280</td>
            <td>$120</td>
        </tr>
    </table>
    <h2>Equivalent pivot table</h2>
    <table>
        <tr>
            <th>Year</th>
            <th>Quarter</th>
            <th>Month</th>
            <th>Revenue</th>
            <th>Cost</th>
        </tr>
        <tr>
            <th rowspan="8">2025</th>
            <th rowspan="4">Q1</th>
        </tr>
        <tr>
            <td>January</td>
            <td>$134</td>
            <td>$162</td>
        </tr>
        <tr>
            <td>February</td>
            <td>$150</td>
            <td>$155</td>
        </tr>
        <tr>
            <td>March</td>
            <td>$160</td>
            <td>$143</td>
        </tr>
        <tr>
            <th rowspan="3">Q2</th>
        </tr>
        <tr>
            <td>April</td>
            <td>$210</td>
            <td>$150</td>
        </tr>
        <tr>
            <td>May</td>
            <td>$280</td>
            <td>$120</td>
        </tr>
    </table>
 </body>
 </html>