added proper typing for mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-16 07:58:20 +01:00
parent b1c654c5ef
commit b8f1439880
4 changed files with 69 additions and 44 deletions

View File

@ -1,11 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from lxml import etree
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
from typing import Set, Tuple, Union
from docling_core.types.doc import (
DocItemLabel,
@ -15,6 +11,10 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from lxml import etree
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -73,16 +73,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# Parses the Excel workbook (XLSX) into a structured document model.
_log.info("starting to convert excel ...")
origin = DocumentOrigin(
filename=self.file.name or "file",
#mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
# mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
doc = self.convert_workbook(doc)
else:
@ -101,36 +101,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return 0
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
_log.info("starting to convert_workbook excel ...")
_log.info("starting to convert_workbook excel ...")
# Iterate over all sheets
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
sheet = self.workbook[sheet_name] # Access the sheet by name
if self.workbook is not None:
# level = self.get_level()
self.parents[0] = doc.add_group(
parent=None, # self.parents[level-1],
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}",
)
doc = self.convert_sheet(doc, sheet)
# Iterate over all sheets
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
sheet = self.workbook[sheet_name] # Access the sheet by name
# level = self.get_level()
self.parents[0] = doc.add_group(
parent=None, # self.parents[level-1],
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}",
)
doc = self.convert_sheet(doc, sheet)
else:
_log.error("Workbook is not initialized.")
return doc
def convert_sheet(self, doc: DoclingDocument, sheet):
def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
_log.info(" => convert_sheet")
tables = self.find_data_tables(sheet)
for excel_table in tables:
print(excel_table)
num_rows = excel_table["num_rows"]
num_cols = excel_table["num_cols"]
_log.info(f"({num_rows}, {num_cols})")
table_data = TableData(
@ -142,7 +146,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
for excel_cell in excel_table["data"]:
_log.info(excel_cell)
cell = TableCell(
text=str(excel_cell["cell"].value),
row_span=excel_cell["row_span"],
@ -156,48 +160,56 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
)
_log.info(cell)
table_data.table_cells.append(cell)
_log.info(f" --> adding a table ({num_rows}, {num_cols})!")
try:
doc.add_table(data=table_data, parent=self.parents[0])
except Exception as e:
_log.warning(f"Could not add table: {str(e)}")
_log.info(f" --> added the table ({num_rows}, {num_cols})!")
return doc
def find_data_tables(self, sheet):
def find_data_tables(self, sheet: Worksheet):
"""
Find all compact rectangular data tables in a sheet.
"""
_log.info("find_data_tables")
tables = [] # List to store found tables
visited = set() # Track already visited cells
visited: set[Tuple[int, int]] = set() # Track already visited cells
# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
_log.info(f"({ri}, {rj}): {cell}")
# Skip empty or already visited cells
if cell.value is None or (ri, rj) in visited:
continue
# If the cell starts a new table, find its bounds
table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited)
table_bounds, visited_cells = self.find_table_bounds(
sheet, ri, rj, visited
)
_log.info(table_bounds)
visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)
_log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
return tables
def find_table_bounds(self, sheet, start_row, start_col, visited):
def find_table_bounds(
self,
sheet: Worksheet,
start_row: int,
start_col: int,
visited: set[Tuple[int, int]],
):
"""
Determine the bounds of a compact rectangular table.
Returns:
@ -205,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
- A set of visited cell coordinates.
"""
_log.info("find_table_bounds")
max_row = start_row
max_col = start_col
@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
data = []
visited_cells = set()
for ri in range(start_row, max_row + 1):
#row_data = []
# row_data = []
for rj in range(start_col, max_col + 1):
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
@ -251,7 +263,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
"col_span": col_span,
}
)
# Mark all cells in the span as visited
for span_row in range(ri, ri + row_span):
for span_col in range(rj, rj + col_span):

View File

@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import (
ConversionResult,
@ -48,8 +48,9 @@ class FormatOption(BaseModel):
class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
class WordFormatOption(FormatOption):
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

13
poetry.lock generated
View File

@ -6574,6 +6574,17 @@ rich = ">=10.11.0"
shellingham = ">=1.3.0"
typing-extensions = ">=3.7.4.3"
[[package]]
name = "types-openpyxl"
version = "3.1.5.20241114"
description = "Typing stubs for openpyxl"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"},
{file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"},
]
[[package]]
name = "types-pytz"
version = "2024.2.0.20241003"
@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386"
content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34"

View File

@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
types-openpyxl = "^3.1.5.20241114"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}