mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix: use first table row as col headers (#1156)
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -380,7 +380,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False,
|
||||
column_header=row_idx == 0,
|
||||
row_header=False,
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
|
||||
@@ -111,7 +111,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=row_idx + 1,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + 1,
|
||||
col_header=row_idx == 0, # First row as header
|
||||
column_header=row_idx == 0, # First row as header
|
||||
row_header=False,
|
||||
)
|
||||
table_data.table_cells.append(cell)
|
||||
|
||||
@@ -457,7 +457,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=col_header,
|
||||
column_header=col_header,
|
||||
row_header=((not col_header) and html_cell.name == "th"),
|
||||
)
|
||||
data.table_cells.append(table_cell)
|
||||
|
||||
@@ -136,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=trow_ind + row_span,
|
||||
start_col_offset_idx=tcol_ind,
|
||||
end_col_offset_idx=tcol_ind + col_span,
|
||||
col_header=False,
|
||||
column_header=trow_ind == 0,
|
||||
row_header=False,
|
||||
)
|
||||
tcells.append(icell)
|
||||
|
||||
@@ -164,7 +164,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
||||
start_col_offset_idx=excel_cell.col,
|
||||
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
||||
col_header=False,
|
||||
column_header=excel_cell.row == 0,
|
||||
row_header=False,
|
||||
)
|
||||
table_data.table_cells.append(cell)
|
||||
@@ -173,7 +173,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return doc
|
||||
|
||||
def _find_data_tables(self, sheet: Worksheet):
|
||||
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
|
||||
"""
|
||||
Find all compact rectangular data tables in a sheet.
|
||||
"""
|
||||
@@ -340,47 +340,4 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
except:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
|
||||
"""
|
||||
for idx, chart in enumerate(sheet._charts): # type: ignore
|
||||
try:
|
||||
chart_path = f"chart_{idx + 1}.png"
|
||||
_log.info(
|
||||
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
||||
)
|
||||
|
||||
_log.info(f"Chart {idx + 1}:")
|
||||
|
||||
# Chart type
|
||||
# _log.info(f"Type: {type(chart).__name__}")
|
||||
print(f"Type: {type(chart).__name__}")
|
||||
|
||||
# Extract series data
|
||||
for series_idx, series in enumerate(chart.series):
|
||||
#_log.info(f"Series {series_idx + 1}:")
|
||||
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
||||
#print(f"x-values: {series.xVal}")
|
||||
#print(f"y-values: {series.yVal}")
|
||||
|
||||
print(f"xval type: {type(series.xVal).__name__}")
|
||||
|
||||
xvals = []
|
||||
for _ in series.xVal.numLit.pt:
|
||||
print(f"xval type: {type(_).__name__}")
|
||||
if hasattr(_, 'v'):
|
||||
xvals.append(_.v)
|
||||
|
||||
print(f"x-values: {xvals}")
|
||||
|
||||
yvals = []
|
||||
for _ in series.yVal:
|
||||
if hasattr(_, 'v'):
|
||||
yvals.append(_.v)
|
||||
|
||||
print(f"y-values: {yvals}")
|
||||
|
||||
except Exception as exc:
|
||||
print(exc)
|
||||
continue
|
||||
"""
|
||||
|
||||
return doc
|
||||
|
||||
@@ -346,7 +346,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False,
|
||||
column_header=row_idx == 0,
|
||||
row_header=False,
|
||||
)
|
||||
if len(cell.text.strip()) > 0:
|
||||
|
||||
@@ -601,7 +601,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + cell.grid_span,
|
||||
col_header=False,
|
||||
column_header=row.grid_cols_before + row_idx == 0,
|
||||
row_header=False,
|
||||
)
|
||||
data.table_cells.append(table_cell)
|
||||
|
||||
Reference in New Issue
Block a user