mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
docs(xlsx): add docstrings to XLSX backend module.
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
06a0ae8294
commit
81f38e960a
@ -32,6 +32,16 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class ExcelCell(BaseModel):
|
class ExcelCell(BaseModel):
|
||||||
|
"""Represents an Excel cell.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
row: The row number of the cell.
|
||||||
|
col: The column number of the cell.
|
||||||
|
text: The text content of the cell.
|
||||||
|
row_span: The number of rows the cell spans.
|
||||||
|
col_span: The number of columns the cell spans.
|
||||||
|
"""
|
||||||
|
|
||||||
row: int
|
row: int
|
||||||
col: int
|
col: int
|
||||||
text: str
|
text: str
|
||||||
@ -40,16 +50,35 @@ class ExcelCell(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class ExcelTable(BaseModel):
|
class ExcelTable(BaseModel):
|
||||||
|
"""Represents an Excel table.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
num_rows: The number of rows in the table.
|
||||||
|
num_cols: The number of columns in the table.
|
||||||
|
data: The data in the table, represented as a list of ExcelCell objects.
|
||||||
|
"""
|
||||||
|
|
||||||
num_rows: int
|
num_rows: int
|
||||||
num_cols: int
|
num_cols: int
|
||||||
data: list[ExcelCell]
|
data: list[ExcelCell]
|
||||||
|
|
||||||
|
|
||||||
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||||
|
"""Backend for parsing Excel workbooks."""
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def __init__(
|
def __init__(
|
||||||
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize the MsExcelDocumentBackend object.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
in_doc: The input document object.
|
||||||
|
path_or_stream: The path or stream to the Excel file.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: An error occurred parsing the file.
|
||||||
|
"""
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
# Initialise the parents for the hierarchy
|
# Initialise the parents for the hierarchy
|
||||||
@ -85,6 +114,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@override
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
if self.is_valid() and self.workbook:
|
if self.is_valid() and self.workbook:
|
||||||
return len(self.workbook.sheetnames)
|
return len(self.workbook.sheetnames)
|
||||||
@ -98,8 +128,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
|
|
||||||
@override
|
@override
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the XLSX into a structured document model.
|
"""Parse the Excel workbook into a DoclingDocument object.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: Unable to run the conversion since the backend object failed to
|
||||||
|
initialize.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The DoclingDocument object representing the Excel workbook.
|
||||||
|
"""
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=self.file.name or "file.xlsx",
|
filename=self.file.name or "file.xlsx",
|
||||||
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
@ -118,6 +155,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
||||||
|
"""Parse the Excel workbook and attach its structure to a DoclingDocument.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doc: A DoclingDocument object.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A DoclingDocument object with the parsed items.
|
||||||
|
"""
|
||||||
|
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
|
|
||||||
@ -144,6 +189,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
||||||
|
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doc: The DoclingDocument to be updated.
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The updated DoclingDocument.
|
||||||
|
"""
|
||||||
|
|
||||||
doc = self._find_tables_in_sheet(doc, sheet)
|
doc = self._find_tables_in_sheet(doc, sheet)
|
||||||
|
|
||||||
@ -154,6 +208,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
def _find_tables_in_sheet(
|
def _find_tables_in_sheet(
|
||||||
self, doc: DoclingDocument, sheet: Worksheet
|
self, doc: DoclingDocument, sheet: Worksheet
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doc: The DoclingDocument to be updated.
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The updated DoclingDocument.
|
||||||
|
"""
|
||||||
|
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
tables = self._find_data_tables(sheet)
|
tables = self._find_data_tables(sheet)
|
||||||
@ -199,12 +262,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
||||||
"""
|
"""Find all compact rectangular data tables in an Excel worksheet.
|
||||||
Find all compact rectangular data tables in a sheet.
|
|
||||||
"""
|
|
||||||
# _log.info("find_data_tables")
|
|
||||||
|
|
||||||
tables = [] # List to store found tables
|
Args:
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of ExcelTable objects representing the data tables.
|
||||||
|
"""
|
||||||
|
tables: list[ExcelTable] = [] # List to store found tables
|
||||||
visited: set[tuple[int, int]] = set() # Track already visited cells
|
visited: set[tuple[int, int]] = set() # Track already visited cells
|
||||||
|
|
||||||
# Iterate over all cells in the sheet
|
# Iterate over all cells in the sheet
|
||||||
@ -229,13 +295,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
start_row: int,
|
start_row: int,
|
||||||
start_col: int,
|
start_col: int,
|
||||||
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
||||||
"""
|
"""Determine the bounds of a compact rectangular table.
|
||||||
Determine the bounds of a compact rectangular table.
|
|
||||||
|
Args:
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
start_row: The row number of the starting cell.
|
||||||
|
start_col: The column number of the starting cell.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
- A dictionary with the bounds and data.
|
A tuple with an Excel table and a set of cell coordinates.
|
||||||
- A set of visited cell coordinates.
|
|
||||||
"""
|
"""
|
||||||
_log.info("find_table_bounds")
|
_log.debug("find_table_bounds")
|
||||||
|
|
||||||
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
||||||
max_col = self._find_table_right(sheet, start_row, start_col)
|
max_col = self._find_table_right(sheet, start_row, start_col)
|
||||||
@ -295,8 +365,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
def _find_table_bottom(
|
def _find_table_bottom(
|
||||||
self, sheet: Worksheet, start_row: int, start_col: int
|
self, sheet: Worksheet, start_row: int, start_col: int
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Function to find the bottom boundary of the table"""
|
"""Find the bottom boundary of a table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
start_row: The starting row of the table.
|
||||||
|
start_col: The starting column of the table.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The row index representing the bottom boundary of the table.
|
||||||
|
"""
|
||||||
max_row: int = start_row
|
max_row: int = start_row
|
||||||
|
|
||||||
while max_row < sheet.max_row - 1:
|
while max_row < sheet.max_row - 1:
|
||||||
@ -323,8 +401,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
def _find_table_right(
|
def _find_table_right(
|
||||||
self, sheet: Worksheet, start_row: int, start_col: int
|
self, sheet: Worksheet, start_row: int, start_col: int
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Function to find the right boundary of the table"""
|
"""Find the right boundary of a table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
start_row: The starting row of the table.
|
||||||
|
start_col: The starting column of the table.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The column index representing the right boundary of the table."
|
||||||
|
"""
|
||||||
max_col: int = start_col
|
max_col: int = start_col
|
||||||
|
|
||||||
while max_col < sheet.max_column - 1:
|
while max_col < sheet.max_column - 1:
|
||||||
@ -351,6 +437,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
def _find_images_in_sheet(
|
def _find_images_in_sheet(
|
||||||
self, doc: DoclingDocument, sheet: Worksheet
|
self, doc: DoclingDocument, sheet: Worksheet
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
"""Find images in the Excel sheet and attach them to the DoclingDocument.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doc: The DoclingDocument to be updated.
|
||||||
|
sheet: The Excel worksheet to be parsed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The updated DoclingDocument.
|
||||||
|
"""
|
||||||
|
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
# Iterate over byte images in the sheet
|
# Iterate over byte images in the sheet
|
||||||
|
Loading…
Reference in New Issue
Block a user