Add proper type annotations for mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-30 22:14:37 +00:00 · 2024-11-16 07:58:20 +01:00 · 2024-11-16 07:58:20 +01:00 · b8f1439880
commit b8f1439880
parent b1c654c5ef
4 changed files with 69 additions and 44 deletions
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -1,11 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
-
-from lxml import etree
-from openpyxl import load_workbook
-from openpyxl.cell.cell import Cell
+from typing import Set, Tuple, Union

 from docling_core.types.doc import (
    DocItemLabel,
@@ -15,6 +11,10 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
 )
+from lxml import etree
+from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import Cell
+from openpyxl.worksheet.worksheet import Worksheet

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -73,16 +73,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        # Parses the DOCX into a structured document model.

        _log.info("starting to convert excel ...")
-        
+
        origin = DocumentOrigin(
            filename=self.file.name or "file",
-            #mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            # mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
-        
+
        if self.is_valid():
            doc = self.convert_workbook(doc)
        else:
@@ -101,36 +101,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        return 0

    def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
-        _log.info("starting to convert_workbook excel ...")        
+        _log.info("starting to convert_workbook excel ...")

-        # Iterate over all sheets
-        for sheet_name in self.workbook.sheetnames:            
-            _log.info(f"Processing sheet: {sheet_name}")
-            
-            sheet = self.workbook[sheet_name]  # Access the sheet by name
+        if self.workbook is not None:

-            # level = self.get_level()
-            self.parents[0] = doc.add_group(
-                parent=None,  # self.parents[level-1],
-                label=GroupLabel.SECTION,
-                name=f"sheet: {sheet_name}",
-            )
-            
-            doc = self.convert_sheet(doc, sheet)
+            # Iterate over all sheets
+            for sheet_name in self.workbook.sheetnames:
+                _log.info(f"Processing sheet: {sheet_name}")
+
+                sheet = self.workbook[sheet_name]  # Access the sheet by name
+
+                # level = self.get_level()
+                self.parents[0] = doc.add_group(
+                    parent=None,  # self.parents[level-1],
+                    label=GroupLabel.SECTION,
+                    name=f"sheet: {sheet_name}",
+                )
+
+                doc = self.convert_sheet(doc, sheet)
+        else:
+            _log.error("Workbook is not initialized.")

        return doc

-    def convert_sheet(self, doc: DoclingDocument, sheet):
+    def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
        _log.info(" => convert_sheet")
-        
+
        tables = self.find_data_tables(sheet)

        for excel_table in tables:
            print(excel_table)
-            
+
            num_rows = excel_table["num_rows"]
            num_cols = excel_table["num_cols"]
-            
+
            _log.info(f"({num_rows}, {num_cols})")

            table_data = TableData(
@@ -142,7 +146,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):

            for excel_cell in excel_table["data"]:
                _log.info(excel_cell)
-                
+
                cell = TableCell(
                    text=str(excel_cell["cell"].value),
                    row_span=excel_cell["row_span"],
@@ -156,48 +160,56 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                )
                _log.info(cell)
                table_data.table_cells.append(cell)
-            
+
            _log.info(f" --> adding a table ({num_rows}, {num_cols})!")

            try:
                doc.add_table(data=table_data, parent=self.parents[0])
            except Exception as e:
                _log.warning(f"Could not add table: {str(e)}")
-                
+
            _log.info(f" --> added the table ({num_rows}, {num_cols})!")

        return doc

-    def find_data_tables(self, sheet):
+    def find_data_tables(self, sheet: Worksheet):
        """
        Find all compact rectangular data tables in a sheet.
        """
        _log.info("find_data_tables")
-        
+
        tables = []  # List to store found tables
-        visited = set()  # Track already visited cells
+        visited: set[Tuple[int, int]] = set()  # Track already visited cells

        # Iterate over all cells in the sheet
        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
            for rj, cell in enumerate(row):
                _log.info(f"({ri}, {rj}): {cell}")
-                
+
                # Skip empty or already visited cells
                if cell.value is None or (ri, rj) in visited:
                    continue

                # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited)
+                table_bounds, visited_cells = self.find_table_bounds(
+                    sheet, ri, rj, visited
+                )
                _log.info(table_bounds)
-                
+
                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)

        _log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
-                
+
        return tables

-    def find_table_bounds(self, sheet, start_row, start_col, visited):
+    def find_table_bounds(
+        self,
+        sheet: Worksheet,
+        start_row: int,
+        start_col: int,
+        visited: set[Tuple[int, int]],
+    ):
        """
        Determine the bounds of a compact rectangular table.
        Returns:
@@ -205,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        - A set of visited cell coordinates.
        """
        _log.info("find_table_bounds")
-        
+
        max_row = start_row
        max_col = start_col

@@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        data = []
        visited_cells = set()
        for ri in range(start_row, max_row + 1):
-            #row_data = []
+            # row_data = []
            for rj in range(start_col, max_col + 1):

                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
@@ -251,7 +263,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                        "col_span": col_span,
                    }
                )
-                
+
                # Mark all cells in the span as visited
                for span_row in range(ri, ri + row_span):
                    for span_col in range(rj, rj + col_span):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
    ConversionResult,
@@ -48,8 +48,9 @@ class FormatOption(BaseModel):
 class ExcelFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
-    
-class WordFormatOption(FormatOption):    
+
+
+class WordFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

--- a/poetry.lock
+++ b/poetry.lock
@@ -6574,6 +6574,17 @@ rich = ">=10.11.0"
 shellingham = ">=1.3.0"
 typing-extensions = ">=3.7.4.3"

+[[package]]
+name = "types-openpyxl"
+version = "3.1.5.20241114"
+description = "Typing stubs for openpyxl"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"},
+    {file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"},
+]
+
 [[package]]
 name = "types-pytz"
 version = "2024.2.0.20241003"
@@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386"
+content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
 marko = "^2.1.2"
 openpyxl = "^3.1.5"
+types-openpyxl = "^3.1.5.20241114"

 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}