diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py
index 48b11296..74bf8da4 100644
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -52,6 +52,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
             ) from e
 
     def is_valid(self) -> bool:
+        _log.debug(f"valid: {self.valid}")
         return self.valid
 
     @classmethod
@@ -71,20 +72,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
     def convert(self) -> DoclingDocument:
         # Parses the DOCX into a structured document model.
 
+        _log.info("starting to convert excel ...")
+
         origin = DocumentOrigin(
             filename=self.file.name or "file",
             mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
             binary_hash=self.document_hash,
         )
+
         doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
         if self.is_valid():
-            return self.convert_workbook(doc)
+            doc = self.convert_workbook(doc)
         else:
+            _log.warning("file is not valid")
             raise RuntimeError(
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
 
-        return DoclingDocument(name=self.file.stem or "file", origin=origin)
+        return doc
 
     def get_level(self) -> int:
         """Return the first None index."""
@@ -93,40 +99,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                 return k
         return 0
 
-    def convert_workbook(self) -> DoclingDocument:
-
-        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+    def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
+        """Add a section group per sheet to *doc* and convert its tables."""
+        _log.info("starting to convert_workbook excel ...")
 
         # Iterate over all sheets
-        for sheet_name in workbook.sheetnames:
+        for sheet_name in self.workbook.sheetnames:
             _log.info(f"Processing sheet: {sheet_name}")
-            sheet = workbook[sheet_name]  # Access the sheet by name
+
+            sheet = self.workbook[sheet_name]  # Access the sheet by name
             # level = self.get_level()
-
             self.parents[0] = doc.add_group(
                 parent=None,  # self.parents[level-1],
                 label=GroupLabel.SECTION,
                 name=f"sheet: {sheet_name}",
             )
+
+            doc = self.convert_sheet(doc, sheet)
 
-            self.convert_sheet(doc, sheet)
+        return doc
 
     def convert_sheet(self, doc: DoclingDocument, sheet):
-
+        """Add every data table found in *sheet* to *doc* and return *doc*."""
+        _log.debug(" => convert_sheet")
+
         tables = self.find_data_tables(sheet)
 
         for excel_table in tables:
+            num_rows = excel_table["num_rows"]
+            num_cols = excel_table["num_cols"]
 
             table_data = TableData(
-                num_rows=excel_table["num_rows"],
-                num_cols=excel_table["num_cols"],
+                num_rows=num_rows,
+                num_cols=num_cols,
                 table_cells=[],
             )
 
             for excel_cell in excel_table["data"]:
+                _log.debug(excel_cell)
 
                 cell = TableCell(
-                    text=str(excel_cell.value),
+                    text=str(excel_cell["cell"].value),
                     row_span=excel_cell["row_span"],
                     col_span=excel_cell["col_span"],
                     start_row_offset_idx=excel_cell["row"],
@@ -136,29 +149,46 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                     col_header=False,  # col_header,
                     row_header=False,  # ((not col_header) and html_cell.name=='th')
                 )
-                data.table_cells.append(cell)
+                _log.debug(cell)
+                table_data.table_cells.append(cell)
 
+            _log.info(f" --> adding a table ({num_rows}, {num_cols})!")
+
-            doc.append_table(data=table_data, parent=self.parents[0])
+            try:
+                doc.add_table(data=table_data, parent=self.parents[0])
+            except Exception as e:
+                # Best effort: keep converting the remaining tables, but keep the traceback.
+                _log.warning(f"Could not add table: {e}", exc_info=True)
+            else:
+                _log.info(f" --> added the table ({num_rows}, {num_cols})!")
+
+        return doc
 
     def find_data_tables(self, sheet):
         """
         Find all compact rectangular data tables in a sheet.
         """
+        _log.debug("find_data_tables")
+
         tables = []  # List to store found tables
         visited = set()  # Track already visited cells
 
         # Iterate over all cells in the sheet
         for ri, row in enumerate(sheet.iter_rows(values_only=False)):
             for rj, cell in enumerate(row):
                 # Skip empty or already visited cells
                 if cell.value is None or (ri, rj) in visited:
                     continue
 
                 # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = find_table_bounds(sheet, ri, rj, visited)
+                table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited)
+                _log.debug(table_bounds)
+
                 visited.update(visited_cells)  # Mark these cells as visited
                 tables.append(table_bounds)
 
+        _log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
+
         return tables
 
     def find_table_bounds(self, sheet, start_row, start_col, visited):
@@ -168,6 +198,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
         - A dictionary with the bounds and data.
         - A set of visited cell coordinates.
         """
+        _log.debug("find_table_bounds")
+
         max_row = start_row
         max_col = start_col
 
@@ -189,7 +221,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
         data = []
         visited_cells = set()
         for ri in range(start_row, max_row + 1):
-            row_data = []
             for rj in range(start_col, max_col + 1):
 
                 cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
@@ -205,7 +236,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                         break
 
                 # Register the cell with spans
-                row_data.append(
+                data.append(
                     {
                         "row": ri - start_row,
                         "col": rj - start_col,
@@ -220,14 +251,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                     for span_col in range(rj, rj + col_span):
                         visited_cells.add((span_row, span_col))
 
-            data.append(row_data)
-
         return {
             "beg_row": start_row,
             "beg_col": start_col,
             "end_row": max_row,
             "end_col": max_col,
-            "num_rows": max_row - start_row,
-            "num_cols": max_col - start_col,
+            "num_rows": max_row + 1 - start_row,
+            "num_cols": max_col + 1 - start_col,
             "data": data,
         }, visited_cells