mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
adding images to output [WIP]
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
4698459c59
commit
5b6090bee3
@ -15,6 +15,7 @@ from lxml import etree
|
|||||||
from openpyxl import Workbook, load_workbook
|
from openpyxl import Workbook, load_workbook
|
||||||
from openpyxl.cell.cell import Cell
|
from openpyxl.cell.cell import Cell
|
||||||
from openpyxl.worksheet.worksheet import Worksheet
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
from openpyxl.drawing.image import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -72,8 +73,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the XLSX into a structured document model.
|
# Parses the XLSX into a structured document model.
|
||||||
|
|
||||||
_log.info("starting to convert excel ...")
|
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=self.file.name or "file",
|
filename=self.file.name or "file",
|
||||||
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
@ -85,7 +84,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
doc = self.convert_workbook(doc)
|
doc = self.convert_workbook(doc)
|
||||||
else:
|
else:
|
||||||
_log.warning("file is not valid")
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||||
)
|
)
|
||||||
@ -100,7 +98,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
def convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
||||||
_log.info("starting to convert_workbook excel ...")
|
|
||||||
|
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
|
|
||||||
@ -124,27 +121,28 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
def convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
||||||
_log.info(" => convert_sheet")
|
|
||||||
|
doc = self.find_tables_in_sheet(doc, sheet)
|
||||||
|
|
||||||
|
doc = self.find_images_in_sheet(doc, sheet)
|
||||||
|
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
||||||
|
|
||||||
tables = self.find_data_tables(sheet)
|
tables = self.find_data_tables(sheet)
|
||||||
|
|
||||||
for excel_table in tables:
|
for excel_table in tables:
|
||||||
print(excel_table)
|
|
||||||
|
|
||||||
num_rows = excel_table["num_rows"]
|
num_rows = excel_table["num_rows"]
|
||||||
num_cols = excel_table["num_cols"]
|
num_cols = excel_table["num_cols"]
|
||||||
|
|
||||||
_log.info(f"({num_rows}, {num_cols})")
|
|
||||||
|
|
||||||
table_data = TableData(
|
table_data = TableData(
|
||||||
num_rows=num_rows,
|
num_rows=num_rows,
|
||||||
num_cols=num_cols,
|
num_cols=num_cols,
|
||||||
table_cells=[],
|
table_cells=[],
|
||||||
)
|
)
|
||||||
_log.info(f"({num_rows}, {num_cols})")
|
|
||||||
|
|
||||||
for excel_cell in excel_table["data"]:
|
for excel_cell in excel_table["data"]:
|
||||||
_log.info(excel_cell)
|
|
||||||
|
|
||||||
cell = TableCell(
|
cell = TableCell(
|
||||||
text=str(excel_cell["cell"].value),
|
text=str(excel_cell["cell"].value),
|
||||||
@ -157,17 +155,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
col_header=False, # col_header,
|
col_header=False, # col_header,
|
||||||
row_header=False, # ((not col_header) and html_cell.name=='th')
|
row_header=False, # ((not col_header) and html_cell.name=='th')
|
||||||
)
|
)
|
||||||
_log.info(cell)
|
|
||||||
table_data.table_cells.append(cell)
|
table_data.table_cells.append(cell)
|
||||||
|
|
||||||
_log.info(f" --> adding a table ({num_rows}, {num_cols})!")
|
doc.add_table(data=table_data, parent=self.parents[0])
|
||||||
|
|
||||||
try:
|
|
||||||
doc.add_table(data=table_data, parent=self.parents[0])
|
|
||||||
except Exception as e:
|
|
||||||
_log.warning(f"Could not add table: {str(e)}")
|
|
||||||
|
|
||||||
_log.info(f" --> added the table ({num_rows}, {num_cols})!")
|
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -175,7 +165,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"""
|
"""
|
||||||
Find all compact rectangular data tables in a sheet.
|
Find all compact rectangular data tables in a sheet.
|
||||||
"""
|
"""
|
||||||
_log.info("find_data_tables")
|
|
||||||
|
|
||||||
tables = [] # List to store found tables
|
tables = [] # List to store found tables
|
||||||
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
||||||
@ -183,7 +172,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Iterate over all cells in the sheet
|
# Iterate over all cells in the sheet
|
||||||
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
||||||
for rj, cell in enumerate(row):
|
for rj, cell in enumerate(row):
|
||||||
_log.info(f"({ri}, {rj}): {cell}")
|
|
||||||
|
|
||||||
# Skip empty or already visited cells
|
# Skip empty or already visited cells
|
||||||
if cell.value is None or (ri, rj) in visited:
|
if cell.value is None or (ri, rj) in visited:
|
||||||
@ -193,13 +181,10 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
table_bounds, visited_cells = self.find_table_bounds(
|
table_bounds, visited_cells = self.find_table_bounds(
|
||||||
sheet, ri, rj, visited
|
sheet, ri, rj, visited
|
||||||
)
|
)
|
||||||
_log.info(table_bounds)
|
|
||||||
|
|
||||||
visited.update(visited_cells) # Mark these cells as visited
|
visited.update(visited_cells) # Mark these cells as visited
|
||||||
tables.append(table_bounds)
|
tables.append(table_bounds)
|
||||||
|
|
||||||
_log.info(f"#-tables: {len(tables)}, #-cells: {len(visited)}")
|
|
||||||
|
|
||||||
return tables
|
return tables
|
||||||
|
|
||||||
def find_table_bounds(
|
def find_table_bounds(
|
||||||
@ -277,3 +262,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"num_cols": max_col + 1 - start_col,
|
"num_cols": max_col + 1 - start_col,
|
||||||
"data": data,
|
"data": data,
|
||||||
}, visited_cells
|
}, visited_cells
|
||||||
|
|
||||||
|
def find_images_in_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
||||||
|
|
||||||
|
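# FIXME: image extraction is not implemented yet. The draft below is disabled (kept inside a string literal) and references names that are not defined in this method (output_folder, sheet_name); until it is finished, doc is returned unchanged.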
|
||||||
|
"""
|
||||||
|
# Iterate over images in the sheet
|
||||||
|
for idx, image in enumerate(sheet._images): # Access embedded images
|
||||||
|
# Save the image to the output folder
|
||||||
|
image_path = f"{output_folder}/{sheet_name}_image_{idx + 1}.png"
|
||||||
|
with open(image_path, "wb") as img_file:
|
||||||
|
img_file.write(image.ref.blob)
|
||||||
|
print(f"Image saved to: {image_path}")
|
||||||
|
"""
|
||||||
|
|
||||||
|
return doc
|
||||||
|
Loading…
Reference in New Issue
Block a user