mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
fix(docx): merged cells not properly converted
Fix the conversion of merged cells in Word tables, which led to repeated text. Simplify the Word table conversion code. Add a docx file with several table formats for regression tests. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
b1cf796730
commit
40145b59b3
@ -2,7 +2,7 @@ import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import Optional, Set, Union
|
||||
|
||||
import docx
|
||||
from docling_core.types.doc import (
|
||||
@ -14,6 +14,8 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docx.oxml.table import CT_Tc
|
||||
from docx.table import Table, _Cell
|
||||
from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
@ -449,30 +451,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
def handle_tables(self, element, docx_obj, doc):
|
||||
|
||||
# Function to check if a cell has a colspan (gridSpan)
|
||||
def get_colspan(cell):
|
||||
grid_span = cell._element.xpath("@w:gridSpan")
|
||||
if grid_span:
|
||||
return int(grid_span[0]) # Return the number of columns spanned
|
||||
return 1 # Default is 1 (no colspan)
|
||||
|
||||
# Function to check if a cell has a rowspan (vMerge)
|
||||
def get_rowspan(cell):
|
||||
v_merge = cell._element.xpath("@w:vMerge")
|
||||
if v_merge:
|
||||
return v_merge[
|
||||
0
|
||||
] # 'restart' indicates the beginning of a rowspan, others are continuation
|
||||
return 1
|
||||
|
||||
table = docx.table.Table(element, docx_obj)
|
||||
|
||||
table: Table = Table(element, docx_obj)
|
||||
num_rows = len(table.rows)
|
||||
num_cols = 0
|
||||
for row in table.rows:
|
||||
# Calculate the max number of columns
|
||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
||||
num_cols = len(table.columns)
|
||||
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
|
||||
|
||||
if num_rows == 1 and num_cols == 1:
|
||||
cell_element = table.rows[0].cells[0]
|
||||
@ -481,52 +463,47 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.walk_linear(cell_element._element, docx_obj, doc)
|
||||
return
|
||||
|
||||
# Initialize the table grid
|
||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||
cell_set: set[CT_Tc] = set()
|
||||
for row_idx, row in enumerate(table.rows):
|
||||
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
||||
col_idx = 0
|
||||
for c, cell in enumerate(row.cells):
|
||||
row_span = get_rowspan(cell)
|
||||
col_span = get_colspan(cell)
|
||||
while col_idx < num_cols:
|
||||
cell: _Cell = row.cells[col_idx]
|
||||
_log.debug(
|
||||
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
||||
)
|
||||
if cell is None or cell._tc in cell_set:
|
||||
_log.debug(f" skipped since repeated content")
|
||||
col_idx += cell.grid_span
|
||||
continue
|
||||
else:
|
||||
cell_set.add(cell._tc)
|
||||
|
||||
cell_text = cell.text
|
||||
# In case cell doesn't return text via docx library:
|
||||
if len(cell_text) == 0:
|
||||
cell_xml = cell._element
|
||||
spanned_idx = row_idx
|
||||
spanned_tc: Optional[CT_Tc] = cell._tc
|
||||
while spanned_tc == cell._tc:
|
||||
spanned_idx += 1
|
||||
spanned_tc = (
|
||||
table.rows[spanned_idx].cells[col_idx]._tc
|
||||
if spanned_idx < num_rows
|
||||
else None
|
||||
)
|
||||
_log.debug(f" spanned before row {spanned_idx}")
|
||||
|
||||
texts = [""]
|
||||
for elem in cell_xml.iter():
|
||||
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
||||
if elem.text:
|
||||
texts.append(elem.text)
|
||||
# Join the collected text
|
||||
cell_text = " ".join(texts).strip()
|
||||
|
||||
# Find the next available column in the grid
|
||||
while table_grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
|
||||
# Fill the grid with the cell value, considering rowspan and colspan
|
||||
for i in range(row_span if row_span == "restart" else 1):
|
||||
for j in range(col_span):
|
||||
table_grid[row_idx + i][col_idx + j] = ""
|
||||
|
||||
cell = TableCell(
|
||||
text=cell_text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
table_cell = TableCell(
|
||||
text=cell.text,
|
||||
row_span=spanned_idx - row_idx,
|
||||
col_span=cell.grid_span,
|
||||
start_row_offset_idx=row.grid_cols_before + row_idx,
|
||||
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
end_col_offset_idx=col_idx + cell.grid_span,
|
||||
col_header=False,
|
||||
row_header=False,
|
||||
)
|
||||
|
||||
data.table_cells.append(cell)
|
||||
data.table_cells.append(table_cell)
|
||||
col_idx += cell.grid_span
|
||||
|
||||
level = self.get_level()
|
||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
|
BIN
tests/data/docx/word_tables.docx
Normal file
BIN
tests/data/docx/word_tables.docx
Normal file
Binary file not shown.
75
tests/data/groundtruth/docling_v2/word_tables.docx.html
Normal file
75
tests/data/groundtruth/docling_v2/word_tables.docx.html
Normal file
@ -0,0 +1,75 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<link rel="icon" type="image/png"
|
||||
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
||||
<meta charset="UTF-8">
|
||||
<title>
|
||||
Powered by Docling
|
||||
</title>
|
||||
<style>
|
||||
html {
|
||||
background-color: LightGray;
|
||||
}
|
||||
body {
|
||||
margin: 0 auto;
|
||||
width:800px;
|
||||
padding: 30px;
|
||||
background-color: White;
|
||||
font-family: Arial, sans-serif;
|
||||
box-shadow: 10px 10px 10px grey;
|
||||
}
|
||||
figure{
|
||||
display: block;
|
||||
width: 100%;
|
||||
margin: 0px;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
img {
|
||||
display: block;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
max-width: 640px;
|
||||
max-height: 640px;
|
||||
}
|
||||
table {
|
||||
min-width:500px;
|
||||
background-color: White;
|
||||
border-collapse: collapse;
|
||||
cell-padding: 5px;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
th, td {
|
||||
border: 1px solid black;
|
||||
padding: 8px;
|
||||
}
|
||||
th {
|
||||
font-weight: bold;
|
||||
}
|
||||
table tr:nth-child(even) td{
|
||||
background-color: LightGray;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<h2>Test with tables</h2>
|
||||
<p>A uniform table</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with horizontal spans</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with horizontal spans in inner columns</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with vertical spans</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with all kinds of spans and empty cells</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p></p>
|
||||
</html>
|
19
tests/data/groundtruth/docling_v2/word_tables.docx.itxt
Normal file
19
tests/data/groundtruth/docling_v2/word_tables.docx.itxt
Normal file
@ -0,0 +1,19 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group header-0
|
||||
item-2 at level 2: section_header: Test with tables
|
||||
item-3 at level 3: paragraph: A uniform table
|
||||
item-4 at level 3: table with [3x3]
|
||||
item-5 at level 3: paragraph:
|
||||
item-6 at level 3: paragraph: A non-uniform table with horizontal spans
|
||||
item-7 at level 3: table with [3x3]
|
||||
item-8 at level 3: paragraph:
|
||||
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
|
||||
item-10 at level 3: table with [3x4]
|
||||
item-11 at level 3: paragraph:
|
||||
item-12 at level 3: paragraph: A non-uniform table with vertical spans
|
||||
item-13 at level 3: table with [5x3]
|
||||
item-14 at level 3: paragraph:
|
||||
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
|
||||
item-16 at level 3: table with [9x5]
|
||||
item-17 at level 3: paragraph:
|
||||
item-18 at level 3: paragraph:
|
2356
tests/data/groundtruth/docling_v2/word_tables.docx.json
Normal file
2356
tests/data/groundtruth/docling_v2/word_tables.docx.json
Normal file
File diff suppressed because it is too large
Load Diff
44
tests/data/groundtruth/docling_v2/word_tables.docx.md
Normal file
44
tests/data/groundtruth/docling_v2/word_tables.docx.md
Normal file
@ -0,0 +1,44 @@
|
||||
## Test with tables
|
||||
|
||||
A uniform table
|
||||
|
||||
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||
|--------------|--------------|--------------|
|
||||
| Cell 1.0 | Cell 1.1 | Cell 1.2 |
|
||||
| Cell 2.0 | Cell 2.1 | Cell 2.2 |
|
||||
|
||||
A non-uniform table with horizontal spans
|
||||
|
||||
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||
|--------------|---------------------|---------------------|
|
||||
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 |
|
||||
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 |
|
||||
|
||||
A non-uniform table with horizontal spans in inner columns
|
||||
|
||||
| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 |
|
||||
|--------------|---------------------|---------------------|--------------|
|
||||
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 |
|
||||
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 |
|
||||
|
||||
A non-uniform table with vertical spans
|
||||
|
||||
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||
|--------------|---------------------|--------------|
|
||||
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 |
|
||||
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 |
|
||||
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 |
|
||||
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 |
|
||||
|
||||
A non-uniform table with all kinds of spans and empty cells
|
||||
|
||||
| Header 0.0 | Header 0.1 | Header 0.2 | | |
|
||||
|--------------|---------------------|--------------|----|---------------------|
|
||||
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | |
|
||||
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | |
|
||||
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | |
|
||||
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 |
|
||||
| | | | | Merged Cell 4.4 5.4 |
|
||||
| | | | | |
|
||||
| | | | | |
|
||||
| | | | | Cell 8.4 |
|
@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
with open(gtfile, "r") as fr:
|
||||
true_text = fr.read()
|
||||
|
||||
assert pred_text == true_text, "pred_itxt==true_itxt"
|
||||
return pred_text == true_text
|
||||
|
||||
|
||||
@ -101,3 +100,7 @@ def test_e2e_docx_conversions():
|
||||
|
||||
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
||||
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
pred_html: str = doc.export_to_html()
|
||||
assert verify_export(pred_html, str(gt_path) + ".html"), "export to html"
|
||||
|
75
word_tables.html
Normal file
75
word_tables.html
Normal file
@ -0,0 +1,75 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<link rel="icon" type="image/png"
|
||||
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
||||
<meta charset="UTF-8">
|
||||
<title>
|
||||
Powered by Docling
|
||||
</title>
|
||||
<style>
|
||||
html {
|
||||
background-color: LightGray;
|
||||
}
|
||||
body {
|
||||
margin: 0 auto;
|
||||
width:800px;
|
||||
padding: 30px;
|
||||
background-color: White;
|
||||
font-family: Arial, sans-serif;
|
||||
box-shadow: 10px 10px 10px grey;
|
||||
}
|
||||
figure{
|
||||
display: block;
|
||||
width: 100%;
|
||||
margin: 0px;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
img {
|
||||
display: block;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
max-width: 640px;
|
||||
max-height: 640px;
|
||||
}
|
||||
table {
|
||||
min-width:500px;
|
||||
background-color: White;
|
||||
border-collapse: collapse;
|
||||
cell-padding: 5px;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
th, td {
|
||||
border: 1px solid black;
|
||||
padding: 8px;
|
||||
}
|
||||
th {
|
||||
font-weight: bold;
|
||||
}
|
||||
table tr:nth-child(even) td{
|
||||
background-color: LightGray;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<h2>Test with tables</h2>
|
||||
<p>A uniform table</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with horizontal spans</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with horizontal spans in inner columns</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with vertical spans</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p>A non-uniform table with all kinds of spans and empty cells</p>
|
||||
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p></p>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user