fix(docx): merged cells not properly converted

Fix a conversion issue with merged cells in Word tables that led to repeated text.
Simplify Word table conversion code.
Add docx file with several table formats for regression tests.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-01-31 16:03:28 +01:00
parent b1cf796730
commit 40145b59b3
8 changed files with 2612 additions and 63 deletions

View File

@ -2,7 +2,7 @@ import logging
import re import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Optional, Set, Union
import docx import docx
from docling_core.types.doc import ( from docling_core.types.doc import (
@ -14,6 +14,8 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from docx.oxml.table import CT_Tc
from docx.table import Table, _Cell
from lxml import etree from lxml import etree
from lxml.etree import XPath from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
@ -449,30 +451,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return return
def handle_tables(self, element, docx_obj, doc): def handle_tables(self, element, docx_obj, doc):
table: Table = Table(element, docx_obj)
# Function to check if a cell has a colspan (gridSpan)
def get_colspan(cell):
grid_span = cell._element.xpath("@w:gridSpan")
if grid_span:
return int(grid_span[0]) # Return the number of columns spanned
return 1 # Default is 1 (no colspan)
# Function to check if a cell has a rowspan (vMerge)
def get_rowspan(cell):
v_merge = cell._element.xpath("@w:vMerge")
if v_merge:
return v_merge[
0
] # 'restart' indicates the beginning of a rowspan, others are continuation
return 1
table = docx.table.Table(element, docx_obj)
num_rows = len(table.rows) num_rows = len(table.rows)
num_cols = 0 num_cols = len(table.columns)
for row in table.rows: _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
# Calculate the max number of columns
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
if num_rows == 1 and num_cols == 1: if num_rows == 1 and num_cols == 1:
cell_element = table.rows[0].cells[0] cell_element = table.rows[0].cells[0]
@ -481,52 +463,47 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.walk_linear(cell_element._element, docx_obj, doc) self.walk_linear(cell_element._element, docx_obj, doc)
return return
# Initialize the table grid data = TableData(num_rows=num_rows, num_cols=num_cols)
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] cell_set: set[CT_Tc] = set()
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
for row_idx, row in enumerate(table.rows): for row_idx, row in enumerate(table.rows):
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
col_idx = 0 col_idx = 0
for c, cell in enumerate(row.cells): while col_idx < num_cols:
row_span = get_rowspan(cell) cell: _Cell = row.cells[col_idx]
col_span = get_colspan(cell) _log.debug(
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
_log.debug(f" skipped since repeated content")
col_idx += cell.grid_span
continue
else:
cell_set.add(cell._tc)
cell_text = cell.text spanned_idx = row_idx
# In case cell doesn't return text via docx library: spanned_tc: Optional[CT_Tc] = cell._tc
if len(cell_text) == 0: while spanned_tc == cell._tc:
cell_xml = cell._element spanned_idx += 1
spanned_tc = (
table.rows[spanned_idx].cells[col_idx]._tc
if spanned_idx < num_rows
else None
)
_log.debug(f" spanned before row {spanned_idx}")
texts = [""] table_cell = TableCell(
for elem in cell_xml.iter(): text=cell.text,
if elem.tag.endswith("t"): # <w:t> tags that contain text row_span=spanned_idx - row_idx,
if elem.text: col_span=cell.grid_span,
texts.append(elem.text) start_row_offset_idx=row.grid_cols_before + row_idx,
# Join the collected text end_row_offset_idx=row.grid_cols_before + spanned_idx,
cell_text = " ".join(texts).strip()
# Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None:
col_idx += 1
# Fill the grid with the cell value, considering rowspan and colspan
for i in range(row_span if row_span == "restart" else 1):
for j in range(col_span):
table_grid[row_idx + i][col_idx + j] = ""
cell = TableCell(
text=cell_text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx, start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span, end_col_offset_idx=col_idx + cell.grid_span,
col_header=False, col_header=False,
row_header=False, row_header=False,
) )
data.table_cells.append(table_cell)
data.table_cells.append(cell) col_idx += cell.grid_span
level = self.get_level() level = self.get_level()
doc.add_table(data=data, parent=self.parents[level - 1]) doc.add_table(data=data, parent=self.parents[level - 1])

Binary file not shown.

View File

@ -0,0 +1,75 @@
<!DOCTYPE html>
<html lang="en">
<head>
<link rel="icon" type="image/png"
href="https://ds4sd.github.io/docling/assets/logo.png"/>
<meta charset="UTF-8">
<title>
Powered by Docling
</title>
<style>
html {
background-color: LightGray;
}
body {
margin: 0 auto;
width:800px;
padding: 30px;
background-color: White;
font-family: Arial, sans-serif;
box-shadow: 10px 10px 10px grey;
}
figure{
display: block;
width: 100%;
margin: 0px;
margin-top: 10px;
margin-bottom: 10px;
}
img {
display: block;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
max-width: 640px;
max-height: 640px;
}
table {
min-width:500px;
background-color: White;
border-collapse: collapse;
cell-padding: 5px;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
}
th, td {
border: 1px solid black;
padding: 8px;
}
th {
font-weight: bold;
}
table tr:nth-child(even) td{
background-color: LightGray;
}
</style>
</head>
<h2>Test with tables</h2>
<p>A uniform table</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with horizontal spans</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with horizontal spans in inner columns</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with vertical spans</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with all kinds of spans and empty cells</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
<p></p>
<p></p>
</html>

View File

@ -0,0 +1,19 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Test with tables
item-3 at level 3: paragraph: A uniform table
item-4 at level 3: table with [3x3]
item-5 at level 3: paragraph:
item-6 at level 3: paragraph: A non-uniform table with horizontal spans
item-7 at level 3: table with [3x3]
item-8 at level 3: paragraph:
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
item-10 at level 3: table with [3x4]
item-11 at level 3: paragraph:
item-12 at level 3: paragraph: A non-uniform table with vertical spans
item-13 at level 3: table with [5x3]
item-14 at level 3: paragraph:
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
item-16 at level 3: table with [9x5]
item-17 at level 3: paragraph:
item-18 at level 3: paragraph:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,44 @@
## Test with tables
A uniform table
| Header 0.0 | Header 0.1 | Header 0.2 |
|--------------|--------------|--------------|
| Cell 1.0 | Cell 1.1 | Cell 1.2 |
| Cell 2.0 | Cell 2.1 | Cell 2.2 |
A non-uniform table with horizontal spans
| Header 0.0 | Header 0.1 | Header 0.2 |
|--------------|---------------------|---------------------|
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 |
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 |
A non-uniform table with horizontal spans in inner columns
| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 |
|--------------|---------------------|---------------------|--------------|
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 |
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 |
A non-uniform table with vertical spans
| Header 0.0 | Header 0.1 | Header 0.2 |
|--------------|---------------------|--------------|
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 |
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 |
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 |
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 |
A non-uniform table with all kinds of spans and empty cells
| Header 0.0 | Header 0.1 | Header 0.2 | | |
|--------------|---------------------|--------------|----|---------------------|
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | |
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | |
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | |
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 |
| | | | | Merged Cell 4.4 5.4 |
| | | | | |
| | | | | |
| | | | | Cell 8.4 |

View File

@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str):
with open(gtfile, "r") as fr: with open(gtfile, "r") as fr:
true_text = fr.read() true_text = fr.read()
assert pred_text == true_text, "pred_itxt==true_itxt"
return pred_text == true_text return pred_text == true_text
@ -101,3 +100,7 @@ def test_e2e_docx_conversions():
pred_json: str = json.dumps(doc.export_to_dict(), indent=2) pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()
assert verify_export(pred_html, str(gt_path) + ".html"), "export to html"

75
word_tables.html Normal file
View File

@ -0,0 +1,75 @@
<!DOCTYPE html>
<html lang="en">
<head>
<link rel="icon" type="image/png"
href="https://ds4sd.github.io/docling/assets/logo.png"/>
<meta charset="UTF-8">
<title>
Powered by Docling
</title>
<style>
html {
background-color: LightGray;
}
body {
margin: 0 auto;
width:800px;
padding: 30px;
background-color: White;
font-family: Arial, sans-serif;
box-shadow: 10px 10px 10px grey;
}
figure{
display: block;
width: 100%;
margin: 0px;
margin-top: 10px;
margin-bottom: 10px;
}
img {
display: block;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
max-width: 640px;
max-height: 640px;
}
table {
min-width:500px;
background-color: White;
border-collapse: collapse;
cell-padding: 5px;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
}
th, td {
border: 1px solid black;
padding: 8px;
}
th {
font-weight: bold;
}
table tr:nth-child(even) td{
background-color: LightGray;
}
</style>
</head>
<h2>Test with tables</h2>
<p>A uniform table</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with horizontal spans</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with horizontal spans in inner columns</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with vertical spans</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
<p></p>
<p>A non-uniform table with all kinds of spans and empty cells</p>
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
<p></p>
<p></p>
</html>