fix(HTML): handle row headers like in pivot tables

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-05-06 17:32:30 +02:00
parent e0b77e3173
commit 6395495824
2 changed files with 184 additions and 16 deletions

View File

@ -391,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"list-item has no text: {element}") _log.debug(f"list-item has no text: {element}")
@staticmethod @staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]: def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
nested_tables = element.find("table") nested_tables = element.find("table")
if nested_tables is not None: if nested_tables is not None:
_log.debug("Skipping nested table.") _log.debug("Skipping nested table.")
return None return None
# Count the number of rows (number of <tr> elements) # Find the number of rows and columns (taking into account spans)
num_rows = len(element("tr")) num_rows = 0
# Find the number of columns (taking into account colspan)
num_cols = 0 num_cols = 0
for row in element("tr"): for row in element("tr"):
col_count = 0 col_count = 0
is_row_header = True
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
for cell in row(["td", "th"]): for cell in row(["td", "th"]):
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
val = cast(Tag, cell).get("colspan", "1") cell_tag = cast(Tag, cell)
val = cell_tag.get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan col_count += colspan
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
is_row_header = False
num_cols = max(num_cols, col_count) num_cols = max(num_cols, col_count)
if not is_row_header:
num_rows += 1
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table # Iterate over the rows in the table
for row_idx, row in enumerate(element("tr")): start_row_span = 0
row_idx = -1
for row in element("tr"):
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
# For each row, find all the column cells (both <td> and <th>) # For each row, find all the column cells (both <td> and <th>)
cells = row(["td", "th"]) cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header # Check if cell is in a column header or row header
col_header = True col_header = True
row_header = True
for html_cell in cells: for html_cell in cells:
if isinstance(html_cell, Tag) and html_cell.name == "td": if isinstance(html_cell, Tag):
col_header = False if html_cell.name == "td":
col_header = False
row_header = False
elif html_cell.get("rowspan") is None:
row_header = False
if not row_header:
row_idx += 1
start_row_span = 0
else:
start_row_span += 1
# Extract the text content of each cell # Extract the text content of each cell
col_idx = 0 col_idx = 0
@ -461,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(row_val, str) and row_val.isnumeric() if isinstance(row_val, str) and row_val.isnumeric()
else 1 else 1
) )
if row_header:
while grid[row_idx][col_idx] is not None: row_span -= 1
while (
col_idx < num_cols
and grid[row_idx + start_row_span][col_idx] is not None
):
col_idx += 1 col_idx += 1
for r in range(row_span): for r in range(start_row_span, start_row_span + row_span):
for c in range(col_span): for c in range(col_span):
grid[row_idx + r][col_idx + c] = text if row_idx + r < num_rows and col_idx + c < num_cols:
grid[row_idx + r][col_idx + c] = text
table_cell = TableCell( table_cell = TableCell(
text=text, text=text,
row_span=row_span, row_span=row_span,
col_span=col_span, col_span=col_span,
start_row_offset_idx=row_idx, start_row_offset_idx=start_row_span + row_idx,
end_row_offset_idx=row_idx + row_span, end_row_offset_idx=start_row_span + row_idx + row_span,
start_col_offset_idx=col_idx, start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span, end_col_offset_idx=col_idx + col_span,
column_header=col_header, column_header=col_header,

View File

@ -0,0 +1,145 @@
<!DOCTYPE html>
<html>
<head>
<style>
table,
th,
td {
border: 1px solid black;
}
</style>
</head>
<body>
<h2>Pivot table with 1 row header</h2>
<table>
<tr>
<th>Year</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="6">2025</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Pivot table with 2 row headers</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="7">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Equivalent pivot table</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="8">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
</body>
</html>