mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
added proper typing for mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b1c654c5ef
commit
b8f1439880
@ -1,11 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Tuple, Union
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from openpyxl import load_workbook
|
|
||||||
from openpyxl.cell.cell import Cell
|
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -15,6 +11,10 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
from lxml import etree
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
from openpyxl.cell.cell import Cell
|
||||||
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -76,7 +76,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=self.file.name or "file",
|
filename=self.file.name or "file",
|
||||||
#mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
# mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
@ -103,24 +103,28 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
||||||
_log.info("starting to convert_workbook excel ...")
|
_log.info("starting to convert_workbook excel ...")
|
||||||
|
|
||||||
# Iterate over all sheets
|
if self.workbook is not None:
|
||||||
for sheet_name in self.workbook.sheetnames:
|
|
||||||
_log.info(f"Processing sheet: {sheet_name}")
|
|
||||||
|
|
||||||
sheet = self.workbook[sheet_name] # Access the sheet by name
|
# Iterate over all sheets
|
||||||
|
for sheet_name in self.workbook.sheetnames:
|
||||||
|
_log.info(f"Processing sheet: {sheet_name}")
|
||||||
|
|
||||||
# level = self.get_level()
|
sheet = self.workbook[sheet_name] # Access the sheet by name
|
||||||
self.parents[0] = doc.add_group(
|
|
||||||
parent=None, # self.parents[level-1],
|
|
||||||
label=GroupLabel.SECTION,
|
|
||||||
name=f"sheet: {sheet_name}",
|
|
||||||
)
|
|
||||||
|
|
||||||
doc = self.convert_sheet(doc, sheet)
|
# level = self.get_level()
|
||||||
|
self.parents[0] = doc.add_group(
|
||||||
|
parent=None, # self.parents[level-1],
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
name=f"sheet: {sheet_name}",
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = self.convert_sheet(doc, sheet)
|
||||||
|
else:
|
||||||
|
_log.error("Workbook is not initialized.")
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def convert_sheet(self, doc: DoclingDocument, sheet):
|
def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
||||||
_log.info(" => convert_sheet")
|
_log.info(" => convert_sheet")
|
||||||
|
|
||||||
tables = self.find_data_tables(sheet)
|
tables = self.find_data_tables(sheet)
|
||||||
@ -168,14 +172,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def find_data_tables(self, sheet):
|
def find_data_tables(self, sheet: Worksheet):
|
||||||
"""
|
"""
|
||||||
Find all compact rectangular data tables in a sheet.
|
Find all compact rectangular data tables in a sheet.
|
||||||
"""
|
"""
|
||||||
_log.info("find_data_tables")
|
_log.info("find_data_tables")
|
||||||
|
|
||||||
tables = [] # List to store found tables
|
tables = [] # List to store found tables
|
||||||
visited = set() # Track already visited cells
|
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
||||||
|
|
||||||
# Iterate over all cells in the sheet
|
# Iterate over all cells in the sheet
|
||||||
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
||||||
@ -187,7 +191,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# If the cell starts a new table, find its bounds
|
# If the cell starts a new table, find its bounds
|
||||||
table_bounds, visited_cells = self.find_table_bounds(sheet, ri, rj, visited)
|
table_bounds, visited_cells = self.find_table_bounds(
|
||||||
|
sheet, ri, rj, visited
|
||||||
|
)
|
||||||
_log.info(table_bounds)
|
_log.info(table_bounds)
|
||||||
|
|
||||||
visited.update(visited_cells) # Mark these cells as visited
|
visited.update(visited_cells) # Mark these cells as visited
|
||||||
@ -197,7 +203,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return tables
|
return tables
|
||||||
|
|
||||||
def find_table_bounds(self, sheet, start_row, start_col, visited):
|
def find_table_bounds(
|
||||||
|
self,
|
||||||
|
sheet: Worksheet,
|
||||||
|
start_row: int,
|
||||||
|
start_col: int,
|
||||||
|
visited: set[Tuple[int, int]],
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Determine the bounds of a compact rectangular table.
|
Determine the bounds of a compact rectangular table.
|
||||||
Returns:
|
Returns:
|
||||||
@ -227,7 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
data = []
|
data = []
|
||||||
visited_cells = set()
|
visited_cells = set()
|
||||||
for ri in range(start_row, max_row + 1):
|
for ri in range(start_row, max_row + 1):
|
||||||
#row_data = []
|
# row_data = []
|
||||||
for rj in range(start_col, max_col + 1):
|
for rj in range(start_col, max_col + 1):
|
||||||
|
|
||||||
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
||||||
|
@ -12,9 +12,9 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
|
|||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
@ -49,6 +49,7 @@ class ExcelFormatOption(FormatOption):
|
|||||||
pipeline_cls: Type = SimplePipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class WordFormatOption(FormatOption):
|
class WordFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimplePipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||||
|
13
poetry.lock
generated
13
poetry.lock
generated
@ -6574,6 +6574,17 @@ rich = ">=10.11.0"
|
|||||||
shellingham = ">=1.3.0"
|
shellingham = ">=1.3.0"
|
||||||
typing-extensions = ">=3.7.4.3"
|
typing-extensions = ">=3.7.4.3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "types-openpyxl"
|
||||||
|
version = "3.1.5.20241114"
|
||||||
|
description = "Typing stubs for openpyxl"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "types-openpyxl-3.1.5.20241114.tar.gz", hash = "sha256:caeb9aafed8a5ffabdc74f880b148d90375364a1cfe7915d5065c5d79f3fe6a2"},
|
||||||
|
{file = "types_openpyxl-3.1.5.20241114-py3-none-any.whl", hash = "sha256:f2925f595b08f5aef1baa725c9ee40baaf51beb05d98ac150593d3bdd37b1029"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "types-pytz"
|
name = "types-pytz"
|
||||||
version = "2024.2.0.20241003"
|
version = "2024.2.0.20241003"
|
||||||
@ -7177,4 +7188,4 @@ tesserocr = ["tesserocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "de6821640f0f67bdcfc0a6484cac9c243f9207163b2af90f3a3e2c04f6f13386"
|
content-hash = "95357a52d305fc7dda3da7e397f20d6fe0d4050a90d904c1714536c5a005ea34"
|
||||||
|
@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
|
|||||||
pandas = "^2.1.4"
|
pandas = "^2.1.4"
|
||||||
marko = "^2.1.2"
|
marko = "^2.1.2"
|
||||||
openpyxl = "^3.1.5"
|
openpyxl = "^3.1.5"
|
||||||
|
types-openpyxl = "^3.1.5.20241114"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
|
Loading…
Reference in New Issue
Block a user