added proper typing for mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-16 07:58:20 +01:00
parent b1c654c5ef
commit b8f1439880
4 changed files with 69 additions and 44 deletions

View File

@ -1,11 +1,7 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Tuple, Union
from lxml import etree
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
@ -15,6 +11,10 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from lxml import etree
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -73,16 +73,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# Parses the Excel workbook into a structured document model. # Parses the Excel workbook into a structured document model.
_log.info("starting to convert excel ...") _log.info("starting to convert excel ...")
origin = DocumentOrigin( origin = DocumentOrigin(
filename=self.file.name or "file", filename=self.file.name or "file",
#mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )
doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid(): if self.is_valid():
doc = self.convert_workbook(doc) doc = self.convert_workbook(doc)
else: else:
@ -101,36 +101,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return 0 return 0
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument: def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
_log.info("starting to convert_workbook excel ...") _log.info("starting to convert_workbook excel ...")
# Iterate over all sheets if self.workbook is not None:
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
sheet = self.workbook[sheet_name] # Access the sheet by name
# level = self.get_level() # Iterate over all sheets
self.parents[0] = doc.add_group( for sheet_name in self.workbook.sheetnames:
parent=None, # self.parents[level-1], _log.info(f"Processing sheet: {sheet_name}")
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}", sheet = self.workbook[sheet_name] # Access the sheet by name
)
# level = self.get_level()
doc = self.convert_sheet(doc, sheet) self.parents[0] = doc.add_group(
parent=None, # self.parents[level-1],
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}",
)
doc = self.convert_sheet(doc, sheet)
else:
_log.error("Workbook is not initialized.")
return doc return doc
def convert_sheet(self, doc: DoclingDocument, sheet): def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
_log.info(" => convert_sheet") _log.info(" => convert_sheet")
tables = self.find_data_tables(sheet) tables = self.find_data_tables(sheet)
for excel_table in tables: for excel_table in tables:
print(excel_table) print(excel_table)
num_rows = excel_table["num_rows"] num_rows = excel_table["num_rows"]
num_cols = excel_table["num_cols"] num_cols = excel_table["num_cols"]
_log.info(f"({num_rows}, {num_cols})") _log.info(f"({num_rows}, {num_cols})")
table_data = TableData( table_data = TableData(
@ -142,7 +146,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
for excel_cell in excel_table["data"]: for excel_cell in excel_table["data"]:
_log.info(excel_cell) _log.info(excel_cell)
cell = TableCell( cell = TableCell(
text=str(excel_cell["cell"].value), text=str(excel_cell["cell"].value),
row_span=excel_cell["row_span"], row_span=excel_cell["row_span"],
@ -156,48 +160,56 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
) )
_log.info(cell) _log.info(cell)
table_data.table_cells.append(cell) table_data.table_cells.append(cell)
_log.info(f" --> adding a table ({num_rows}, {num_cols})!") _log.info(f" --> adding a table ({num_rows}, {num_cols})!")
try: try:
doc.add_table(data=table_data, parent=self.parents[0]) doc.add_table(data=table_data, parent=self.parents[0])
except Exception as e: except Exception as e:
_log.warning(f"Could not add table: {str(e)}") _log.warning(f"Could not add table: {str(e)}")
_log.info(f" --> added the table ({num_rows}, {num_cols})!") _log.info(f" --> added the table ({num_rows}, {num_cols})!")
return doc return doc
def find_data_tables(self, sheet): def find_data_tables(self, sheet: Worksheet):
""" """
Find all compact rectangular data tables in a sheet. Find all compact rectangular data tables in a sheet.
""" """
_log.info("find_data_tables") _log.info("find_data_tables")
tables = [] # List to store found tables tables = [] # List to store found tables
visited = set() # Track already visited cells visited: set[Tuple[int, int]] = set() # Track already visited cells
# Iterate over all cells in the sheet # Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)): for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row): for rj, cell in enumerate(row):
_log.info(f"({ri}, {rj}): {cell}") _log.info(f"({ri}, {rj}): {cell}")
# Skip empty or already visited cells # Skip empty or already visited cells
if cell.value is None or (ri, rj) in visited: if cell.value is None or (ri, rj) in visited:
continue continue
# If the cell starts a new table, find its bounds # If the cell starts a new table, find its bounds
table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited) table_bounds, visited_cells = self.find_table_bounds(
sheet, ri, rj, visited
)
_log.info(table_bounds) _log.info(table_bounds)
visited.update(visited_cells) # Mark these cells as visited visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds) tables.append(table_bounds)
_log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}") _log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
return tables return tables
def find_table_bounds(self, sheet, start_row, start_col, visited): def find_table_bounds(
self,
sheet: Worksheet,
start_row: int,
start_col: int,
visited: set[Tuple[int, int]],
):
""" """
Determine the bounds of a compact rectangular table. Determine the bounds of a compact rectangular table.
Returns: Returns:
@ -205,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
- A set of visited cell coordinates. - A set of visited cell coordinates.
""" """
_log.info("find_table_bounds") _log.info("find_table_bounds")
max_row = start_row max_row = start_row
max_col = start_col max_col = start_col
@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
data = [] data = []
visited_cells = set() visited_cells = set()
for ri in range(start_row, max_row + 1): for ri in range(start_row, max_row + 1):
#row_data = [] # row_data = []
for rj in range(start_col, max_col + 1): for rj in range(start_col, max_col + 1):
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
@ -251,7 +263,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
"col_span": col_span, "col_span": col_span,
} }
) )
# Mark all cells in the span as visited # Mark all cells in the span as visited
for span_row in range(ri, ri + row_span): for span_row in range(ri, ri + row_span):
for span_col in range(rj, rj + col_span): for span_col in range(rj, rj + col_span):

View File

@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,
@ -48,8 +48,9 @@ class FormatOption(BaseModel):
class ExcelFormatOption(FormatOption): class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
class WordFormatOption(FormatOption):
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

13
poetry.lock generated
View File

@ -6574,6 +6574,17 @@ rich = ">=10.11.0"
shellingham = ">=1.3.0" shellingham = ">=1.3.0"
typing-extensions = ">=3.7.4.3" typing-extensions = ">=3.7.4.3"
[[package]]
name = "types-openpyxl"
version = "3.1.5.20241114"
description = "Typing stubs for openpyxl"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"},
{file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"},
]
[[package]] [[package]]
name = "types-pytz" name = "types-pytz"
version = "2024.2.0.20241003" version = "2024.2.0.20241003"
@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386" content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34"

View File

@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4" pandas = "^2.1.4"
marko = "^2.1.2" marko = "^2.1.2"
openpyxl = "^3.1.5" openpyxl = "^3.1.5"
types-openpyxl = "^3.1.5.20241114"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"} black = {extras = ["jupyter"], version = "^24.4.2"}