style(xlsx): enforce type hints in XLSX backend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-04-08 11:06:38 +02:00
parent 355d8dc7a6
commit e813f02943

View File

@ -1,7 +1,7 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Dict, Set, Tuple, Union from typing import Union
from docling_core.types.doc import ( from docling_core.types.doc import (
DoclingDocument, DoclingDocument,
@ -11,12 +11,9 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from openpyxl import load_workbook
# from lxml import etree
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.drawing.image import Image
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -45,13 +42,16 @@ class ExcelTable(BaseModel):
class MsExcelDocumentBackend(DeclarativeDocumentBackend): class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): @override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels = 10
self.parents: Dict[int, Any] = {} self.parents: dict[int, Any] = {}
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@ -71,24 +71,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
@override
def is_valid(self) -> bool: def is_valid(self) -> bool:
_log.info(f"valid: {self.valid}") _log.info(f"valid: {self.valid}")
return self.valid return self.valid
@classmethod @classmethod
@override
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return True return True
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: @override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XLSX} return {InputFormat.XLSX}
@override
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the XLSX into a structured document model. # Parses the XLSX into a structured document model.
@ -132,7 +130,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return doc return doc
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet): def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
doc = self._find_tables_in_sheet(doc, sheet) doc = self._find_tables_in_sheet(doc, sheet)
@ -140,7 +138,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return doc return doc
def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet): def _find_tables_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
tables = self._find_data_tables(sheet) tables = self._find_data_tables(sheet)
@ -180,7 +180,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# _log.info("find_data_tables") # _log.info("find_data_tables")
tables = [] # List to store found tables tables = [] # List to store found tables
visited: set[Tuple[int, int]] = set() # Track already visited cells visited: set[tuple[int, int]] = set() # Track already visited cells
# Iterate over all cells in the sheet # Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)): for ri, row in enumerate(sheet.iter_rows(values_only=False)):
@ -191,9 +191,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
continue continue
# If the cell starts a new table, find its bounds # If the cell starts a new table, find its bounds
table_bounds, visited_cells = self._find_table_bounds( table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
sheet, ri, rj, visited
)
visited.update(visited_cells) # Mark these cells as visited visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds) tables.append(table_bounds)
@ -205,8 +203,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
sheet: Worksheet, sheet: Worksheet,
start_row: int, start_row: int,
start_col: int, start_col: int,
visited: set[Tuple[int, int]], ) -> tuple[ExcelTable, set[tuple[int, int]]]:
):
""" """
Determine the bounds of a compact rectangular table. Determine the bounds of a compact rectangular table.
Returns: Returns:
@ -220,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# Collect the data within the bounds # Collect the data within the bounds
data = [] data = []
visited_cells = set() visited_cells: set[tuple[int, int]] = set()
for ri in range(start_row, max_row + 1): for ri in range(start_row, max_row + 1):
for rj in range(start_col, max_col + 1): for rj in range(start_col, max_col + 1):
@ -270,10 +267,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
visited_cells, visited_cells,
) )
def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int): def _find_table_bottom(
self, sheet: Worksheet, start_row: int, start_col: int
) -> int:
"""Function to find the bottom boundary of the table""" """Function to find the bottom boundary of the table"""
max_row = start_row max_row: int = start_row
while max_row < sheet.max_row - 1: while max_row < sheet.max_row - 1:
# Get the cell value or check if it is part of a merged cell # Get the cell value or check if it is part of a merged cell
@ -296,10 +295,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return max_row return max_row
def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int): def _find_table_right(
self, sheet: Worksheet, start_row: int, start_col: int
) -> int:
"""Function to find the right boundary of the table""" """Function to find the right boundary of the table"""
max_col = start_col max_col: int = start_col
while max_col < sheet.max_column - 1: while max_col < sheet.max_column - 1:
# Get the cell value or check if it is part of a merged cell # Get the cell value or check if it is part of a merged cell