diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 74bf8da4..2e6e2077 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -1,11 +1,7 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union - -from lxml import etree -from openpyxl import load_workbook -from openpyxl.cell.cell import Cell +from typing import Set, Tuple, Union from docling_core.types.doc import ( DocItemLabel, @@ -15,6 +11,10 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from lxml import etree +from openpyxl import Workbook, load_workbook +from openpyxl.cell.cell import Cell +from openpyxl.worksheet.worksheet import Worksheet from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -73,16 +73,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): # Parses the DOCX into a structured document model. _log.info("starting to convert excel ...") - + origin = DocumentOrigin( filename=self.file.name or "file", - #mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + # mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", binary_hash=self.document_hash, ) doc = DoclingDocument(name=self.file.stem or "file", origin=origin) - + if self.is_valid(): doc = self.convert_workbook(doc) else: @@ -101,36 +101,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return 0 def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument: - _log.info("starting to convert_workbook excel ...") + _log.info("starting to convert_workbook excel ...") - # Iterate over all sheets - for sheet_name in self.workbook.sheetnames: - _log.info(f"Processing sheet: {sheet_name}") - - sheet = self.workbook[sheet_name] # Access the sheet by name + if self.workbook is not None: - # level = self.get_level() - self.parents[0] = doc.add_group( - parent=None, # self.parents[level-1], - label=GroupLabel.SECTION, - name=f"sheet: {sheet_name}", - ) - - doc = self.convert_sheet(doc, sheet) + # Iterate over all sheets + for sheet_name in self.workbook.sheetnames: + _log.info(f"Processing sheet: {sheet_name}") + + sheet = self.workbook[sheet_name] # Access the sheet by name + + # level = self.get_level() + self.parents[0] = doc.add_group( + parent=None, # self.parents[level-1], + label=GroupLabel.SECTION, + name=f"sheet: {sheet_name}", + ) + + doc = self.convert_sheet(doc, sheet) + else: + _log.error("Workbook is not initialized.") return doc - def convert_sheet(self, doc: DoclingDocument, sheet): + def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet): _log.info(" => convert_sheet") - + tables = self.find_data_tables(sheet) for excel_table in tables: print(excel_table) - + num_rows = excel_table["num_rows"] num_cols = excel_table["num_cols"] - + _log.info(f"({num_rows}, {num_cols})") table_data = TableData( @@ -142,7 +146,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): for excel_cell in excel_table["data"]: _log.info(excel_cell) - + cell = TableCell( text=str(excel_cell["cell"].value), row_span=excel_cell["row_span"], @@ -156,48 +160,56 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): ) _log.info(cell) table_data.table_cells.append(cell) - + _log.info(f" --> adding a table ({num_rows}, {num_cols})!") try: doc.add_table(data=table_data, parent=self.parents[0]) except Exception as e: _log.warning(f"Could not add table: {str(e)}") - + _log.info(f" --> added the table ({num_rows}, {num_cols})!") return doc - def find_data_tables(self, sheet): + def find_data_tables(self, sheet: Worksheet): """ Find all compact rectangular data tables in a sheet. """ _log.info("find_data_tables") - + tables = [] # List to store found tables - visited = set() # Track already visited cells + visited: set[Tuple[int, int]] = set() # Track already visited cells # Iterate over all cells in the sheet for ri, row in enumerate(sheet.iter_rows(values_only=False)): for rj, cell in enumerate(row): _log.info(f"({ri}, {rj}): {cell}") - + # Skip empty or already visited cells if cell.value is None or (ri, rj) in visited: continue # If the cell starts a new table, find its bounds - table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited) + table_bounds, visited_cells = self.find_table_bounds( + sheet, ri, rj, visited + ) _log.info(table_bounds) - + visited.update(visited_cells) # Mark these cells as visited tables.append(table_bounds) _log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}") - + return tables - def find_table_bounds(self, sheet, start_row, start_col, visited): + def find_table_bounds( + self, + sheet: Worksheet, + start_row: int, + start_col: int, + visited: set[Tuple[int, int]], + ): """ Determine the bounds of a compact rectangular table. Returns: @@ -205,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): - A set of visited cell coordinates. """ _log.info("find_table_bounds") - + max_row = start_row max_col = start_col @@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): data = [] visited_cells = set() for ri in range(start_row, max_row + 1): - #row_data = [] + # row_data = [] for rj in range(start_col, max_col + 1): cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing @@ -251,7 +263,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): "col_span": col_span, } ) - + # Mark all cells in the span as visited for span_row in range(ri, ri + row_span): for span_col in range(rj, rj + col_span): diff --git a/docling/document_converter.py b/docling/document_converter.py index dfdda69e..01c9a58d 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend +from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.document import ( ConversionResult, @@ -48,8 +48,9 @@ class FormatOption(BaseModel): class ExcelFormatOption(FormatOption): pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend - -class WordFormatOption(FormatOption): + + +class WordFormatOption(FormatOption): pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend diff --git a/poetry.lock b/poetry.lock index fe17acb9..a7d76902 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6574,6 +6574,17 @@ rich = ">=10.11.0" shellingham = ">=1.3.0" typing-extensions = ">=3.7.4.3" +[[package]] +name = "types-openpyxl" +version = "3.1.5.20241114" +description = "Typing stubs for openpyxl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"}, + {file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"}, +] + [[package]] name = "types-pytz" version = "2024.2.0.20241003" @@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386" +content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34" diff --git a/pyproject.toml b/pyproject.toml index ce4f5eab..b55246ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3" pandas = "^2.1.4" marko = "^2.1.2" openpyxl = "^3.1.5" +types-openpyxl = "^3.1.5.20241114" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"}