mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
fixed some issues
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
8ec3f176eb
commit
60c2a860c4
@ -26,7 +26,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image as PILImage
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
@ -327,19 +327,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self, doc: DoclingDocument, sheet: Worksheet
|
self, doc: DoclingDocument, sheet: Worksheet
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
|
||||||
# Iterate over images in the sheet
|
# Iterate over byte images in the sheet
|
||||||
for idx, image in enumerate(sheet._images): # type: ignore
|
for idx, image in enumerate(sheet._images): # type: ignore
|
||||||
|
|
||||||
image_bytes = BytesIO(image.ref.blob)
|
try:
|
||||||
pil_image = Image.open(image_bytes)
|
pil_image = PILImage.open(image.ref)
|
||||||
|
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[0],
|
parent=self.parents[0],
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
)
|
)
|
||||||
|
except:
|
||||||
|
_log.error("could not extract the image from excel sheets")
|
||||||
|
|
||||||
|
"""
|
||||||
for idx, chart in enumerate(sheet._charts): # type: ignore
|
for idx, chart in enumerate(sheet._charts): # type: ignore
|
||||||
|
try:
|
||||||
chart_path = f"chart_{idx + 1}.png"
|
chart_path = f"chart_{idx + 1}.png"
|
||||||
_log.info(
|
_log.info(
|
||||||
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
||||||
@ -348,22 +352,36 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.info(f"Chart {idx + 1}:")
|
_log.info(f"Chart {idx + 1}:")
|
||||||
|
|
||||||
# Chart type
|
# Chart type
|
||||||
_log.info(f"Type: {type(chart).__name__}")
|
# _log.info(f"Type: {type(chart).__name__}")
|
||||||
|
print(f"Type: {type(chart).__name__}")
|
||||||
|
|
||||||
# Title
|
# Extract series data
|
||||||
if chart.title:
|
for series_idx, series in enumerate(chart.series):
|
||||||
_log.info(f"Title: {chart.title}")
|
#_log.info(f"Series {series_idx + 1}:")
|
||||||
else:
|
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
||||||
_log.info("No title")
|
#print(f"x-values: {series.xVal}")
|
||||||
|
#print(f"y-values: {series.yVal}")
|
||||||
|
|
||||||
# Data series
|
print(f"xval type: {type(series.xVal).__name__}")
|
||||||
for series in chart.series:
|
|
||||||
_log.info(" => series ...")
|
|
||||||
_log.info(f"Data Series: {series.title}")
|
|
||||||
_log.info(f"Values: {series.values}")
|
|
||||||
_log.info(f"Categories: {series.categories}")
|
|
||||||
|
|
||||||
# Position
|
xvals = []
|
||||||
# _log.info(f"Anchor Cell: {chart.anchor}")
|
for _ in series.xVal.numLit.pt:
|
||||||
|
print(f"xval type: {type(_).__name__}")
|
||||||
|
if hasattr(_, 'v'):
|
||||||
|
xvals.append(_.v)
|
||||||
|
|
||||||
|
print(f"x-values: {xvals}")
|
||||||
|
|
||||||
|
yvals = []
|
||||||
|
for _ in series.yVal:
|
||||||
|
if hasattr(_, 'v'):
|
||||||
|
yvals.append(_.v)
|
||||||
|
|
||||||
|
print(f"y-values: {yvals}")
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
print(exc)
|
||||||
|
continue
|
||||||
|
"""
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
@ -8,3 +8,4 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-7 at level 1: section: group sheet: Sheet3
|
item-7 at level 1: section: group sheet: Sheet3
|
||||||
item-8 at level 2: table with [7x3]
|
item-8 at level 2: table with [7x3]
|
||||||
item-9 at level 2: table with [7x3]
|
item-9 at level 2: table with [7x3]
|
||||||
|
item-10 at level 2: picture
|
File diff suppressed because one or more lines are too long
@ -49,3 +49,5 @@
|
|||||||
| 3 | 6 | 7 |
|
| 3 | 6 | 7 |
|
||||||
| 8 | 9 | 9 |
|
| 8 | 9 | 9 |
|
||||||
| 10 | 9 | 9 |
|
| 10 | 9 | 9 |
|
||||||
|
|
||||||
|
<!-- image -->
|
Binary file not shown.
@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
|
|||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
for xlsx_path in xlsx_paths:
|
for xlsx_path in xlsx_paths:
|
||||||
# print(f"converting {xlsx_path}")
|
print(f"converting {xlsx_path}")
|
||||||
|
|
||||||
gt_path = (
|
gt_path = (
|
||||||
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
||||||
|
Loading…
Reference in New Issue
Block a user