mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
added proper typing for mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b1c654c5ef
commit
b8f1439880
@ -1,11 +1,7 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from lxml import etree
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.cell.cell import Cell
|
||||
from typing import Set, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@ -15,6 +11,10 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from lxml import etree
|
||||
from openpyxl import Workbook, load_workbook
|
||||
from openpyxl.cell.cell import Cell
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -73,16 +73,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Parses the Excel workbook into a structured document model.
|
||||
|
||||
_log.info("starting to convert excel ...")
|
||||
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file",
|
||||
#mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
# mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
|
||||
|
||||
if self.is_valid():
|
||||
doc = self.convert_workbook(doc)
|
||||
else:
|
||||
@ -101,36 +101,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
return 0
|
||||
|
||||
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
||||
_log.info("starting to convert_workbook excel ...")
|
||||
_log.info("starting to convert_workbook excel ...")
|
||||
|
||||
# Iterate over all sheets
|
||||
for sheet_name in self.workbook.sheetnames:
|
||||
_log.info(f"Processing sheet: {sheet_name}")
|
||||
|
||||
sheet = self.workbook[sheet_name] # Access the sheet by name
|
||||
if self.workbook is not None:
|
||||
|
||||
# level = self.get_level()
|
||||
self.parents[0] = doc.add_group(
|
||||
parent=None, # self.parents[level-1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"sheet: {sheet_name}",
|
||||
)
|
||||
|
||||
doc = self.convert_sheet(doc, sheet)
|
||||
# Iterate over all sheets
|
||||
for sheet_name in self.workbook.sheetnames:
|
||||
_log.info(f"Processing sheet: {sheet_name}")
|
||||
|
||||
sheet = self.workbook[sheet_name] # Access the sheet by name
|
||||
|
||||
# level = self.get_level()
|
||||
self.parents[0] = doc.add_group(
|
||||
parent=None, # self.parents[level-1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"sheet: {sheet_name}",
|
||||
)
|
||||
|
||||
doc = self.convert_sheet(doc, sheet)
|
||||
else:
|
||||
_log.error("Workbook is not initialized.")
|
||||
|
||||
return doc
|
||||
|
||||
def convert_sheet(self, doc: DoclingDocument, sheet):
|
||||
def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
||||
_log.info(" => convert_sheet")
|
||||
|
||||
|
||||
tables = self.find_data_tables(sheet)
|
||||
|
||||
for excel_table in tables:
|
||||
print(excel_table)
|
||||
|
||||
|
||||
num_rows = excel_table["num_rows"]
|
||||
num_cols = excel_table["num_cols"]
|
||||
|
||||
|
||||
_log.info(f"({num_rows}, {num_cols})")
|
||||
|
||||
table_data = TableData(
|
||||
@ -142,7 +146,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
for excel_cell in excel_table["data"]:
|
||||
_log.info(excel_cell)
|
||||
|
||||
|
||||
cell = TableCell(
|
||||
text=str(excel_cell["cell"].value),
|
||||
row_span=excel_cell["row_span"],
|
||||
@ -156,48 +160,56 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
_log.info(cell)
|
||||
table_data.table_cells.append(cell)
|
||||
|
||||
|
||||
_log.info(f" --> adding a table ({num_rows}, {num_cols})!")
|
||||
|
||||
try:
|
||||
doc.add_table(data=table_data, parent=self.parents[0])
|
||||
except Exception as e:
|
||||
_log.warning(f"Could not add table: {str(e)}")
|
||||
|
||||
|
||||
_log.info(f" --> added the table ({num_rows}, {num_cols})!")
|
||||
|
||||
return doc
|
||||
|
||||
def find_data_tables(self, sheet):
|
||||
def find_data_tables(self, sheet: Worksheet):
|
||||
"""
|
||||
Find all compact rectangular data tables in a sheet.
|
||||
"""
|
||||
_log.info("find_data_tables")
|
||||
|
||||
|
||||
tables = [] # List to store found tables
|
||||
visited = set() # Track already visited cells
|
||||
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
||||
|
||||
# Iterate over all cells in the sheet
|
||||
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
||||
for rj, cell in enumerate(row):
|
||||
_log.info(f"({ri}, {rj}): {cell}")
|
||||
|
||||
|
||||
# Skip empty or already visited cells
|
||||
if cell.value is None or (ri, rj) in visited:
|
||||
continue
|
||||
|
||||
# If the cell starts a new table, find its bounds
|
||||
table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited)
|
||||
table_bounds, visited_cells = self.find_table_bounds(
|
||||
sheet, ri, rj, visited
|
||||
)
|
||||
_log.info(table_bounds)
|
||||
|
||||
|
||||
visited.update(visited_cells) # Mark these cells as visited
|
||||
tables.append(table_bounds)
|
||||
|
||||
_log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
|
||||
|
||||
|
||||
return tables
|
||||
|
||||
def find_table_bounds(self, sheet, start_row, start_col, visited):
|
||||
def find_table_bounds(
|
||||
self,
|
||||
sheet: Worksheet,
|
||||
start_row: int,
|
||||
start_col: int,
|
||||
visited: set[Tuple[int, int]],
|
||||
):
|
||||
"""
|
||||
Determine the bounds of a compact rectangular table.
|
||||
Returns:
|
||||
@ -205,7 +217,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
- A set of visited cell coordinates.
|
||||
"""
|
||||
_log.info("find_table_bounds")
|
||||
|
||||
|
||||
max_row = start_row
|
||||
max_col = start_col
|
||||
|
||||
@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
data = []
|
||||
visited_cells = set()
|
||||
for ri in range(start_row, max_row + 1):
|
||||
#row_data = []
|
||||
# row_data = []
|
||||
for rj in range(start_col, max_col + 1):
|
||||
|
||||
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
||||
@ -251,7 +263,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
"col_span": col_span,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
# Mark all cells in the span as visited
|
||||
for span_row in range(ri, ri + row_span):
|
||||
for span_col in range(rj, rj + col_span):
|
||||
|
@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
@ -48,8 +48,9 @@ class FormatOption(BaseModel):
|
||||
class ExcelFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||
|
||||
|
13
poetry.lock
generated
13
poetry.lock
generated
@ -6574,6 +6574,17 @@ rich = ">=10.11.0"
|
||||
shellingham = ">=1.3.0"
|
||||
typing-extensions = ">=3.7.4.3"
|
||||
|
||||
[[package]]
|
||||
name = "types-openpyxl"
|
||||
version = "3.1.5.20241114"
|
||||
description = "Typing stubs for openpyxl"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"},
|
||||
{file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-pytz"
|
||||
version = "2024.2.0.20241003"
|
||||
@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386"
|
||||
content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34"
|
||||
|
@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
|
||||
pandas = "^2.1.4"
|
||||
marko = "^2.1.2"
|
||||
openpyxl = "^3.1.5"
|
||||
types-openpyxl = "^3.1.5.20241114"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
|
Loading…
Reference in New Issue
Block a user