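Fixed some issues in the MS Excel backend: sheet images are now opened with PIL directly from `image.ref` and extraction failures are logged, the experimental chart parsing is disabled, and the xlsx groundtruth is updated to include the extracted picture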

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-01-24 16:25:10 +01:00
parent 8ec3f176eb
commit 60c2a860c4
6 changed files with 84 additions and 37 deletions

View File

@ -26,7 +26,7 @@ _log = logging.getLogger(__name__)
from typing import Any, List from typing import Any, List
from PIL import Image from PIL import Image as PILImage
from pydantic import BaseModel from pydantic import BaseModel
@ -327,19 +327,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument: ) -> DoclingDocument:
# Iterate over images in the sheet # Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore for idx, image in enumerate(sheet._images): # type: ignore
image_bytes = BytesIO(image.ref.blob) try:
pil_image = Image.open(image_bytes) pil_image = PILImage.open(image.ref)
doc.add_picture( doc.add_picture(
parent=self.parents[0], parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72), image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None, caption=None,
) )
except:
_log.error("could not extract the image from excel sheets")
"""
for idx, chart in enumerate(sheet._charts): # type: ignore for idx, chart in enumerate(sheet._charts): # type: ignore
try:
chart_path = f"chart_{idx + 1}.png" chart_path = f"chart_{idx + 1}.png"
_log.info( _log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}" f"Chart found, but dynamic rendering is required for: {chart_path}"
@ -348,22 +352,36 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
_log.info(f"Chart {idx + 1}:") _log.info(f"Chart {idx + 1}:")
# Chart type # Chart type
_log.info(f"Type: {type(chart).__name__}") # _log.info(f"Type: {type(chart).__name__}")
print(f"Type: {type(chart).__name__}")
# Title # Extract series data
if chart.title: for series_idx, series in enumerate(chart.series):
_log.info(f"Title: {chart.title}") #_log.info(f"Series {series_idx + 1}:")
else: print(f"Series {series_idx + 1} type: {type(series).__name__}")
_log.info("No title") #print(f"x-values: {series.xVal}")
#print(f"y-values: {series.yVal}")
# Data series print(f"xval type: {type(series.xVal).__name__}")
for series in chart.series:
_log.info(" => series ...")
_log.info(f"Data Series: {series.title}")
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")
# Position xvals = []
# _log.info(f"Anchor Cell: {chart.anchor}") for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)
print(f"x-values: {xvals}")
yvals = []
for _ in series.yVal:
if hasattr(_, 'v'):
yvals.append(_.v)
print(f"y-values: {yvals}")
except Exception as exc:
print(exc)
continue
"""
return doc return doc

View File

@ -8,3 +8,4 @@ item-0 at level 0: unspecified: group _root_
item-7 at level 1: section: group sheet: Sheet3 item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3] item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3] item-9 at level 2: table with [7x3]
item-10 at level 2: picture

File diff suppressed because one or more lines are too long

View File

@ -49,3 +49,5 @@
| 3 | 6 | 7 | | 3 | 6 | 7 |
| 8 | 9 | 9 | | 8 | 9 | 9 |
| 10 | 9 | 9 | | 10 | 9 | 9 |
<!-- image -->

Binary file not shown.

View File

@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
converter = get_converter() converter = get_converter()
for xlsx_path in xlsx_paths: for xlsx_path in xlsx_paths:
# print(f"converting {xlsx_path}") print(f"converting {xlsx_path}")
gt_path = ( gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name