feat(xlsx): create a page for each worksheet in XLSX backend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-04-08 14:04:11 +02:00
parent e813f02943
commit 06a0ae8294
3 changed files with 247 additions and 61 deletions

View File

@ -1,31 +1,35 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Union
from typing import Any, Union
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
ProvenanceItem,
Size,
TableCell,
TableData,
)
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image as PILImage
from pydantic import BaseModel
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
from typing import Any, List
from PIL import Image as PILImage
from pydantic import BaseModel
class ExcelCell(BaseModel):
row: int
@ -38,10 +42,10 @@ class ExcelCell(BaseModel):
class ExcelTable(BaseModel):
num_rows: int
num_cols: int
data: List[ExcelCell]
data: list[ExcelCell]
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@ -63,12 +67,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(self.path_or_stream, Path):
self.workbook = load_workbook(filename=str(self.path_or_stream))
self.valid = True
self.valid = self.workbook is not None
except Exception as e:
self.valid = False
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
@ -81,6 +85,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def supports_pagination(cls) -> bool:
return True
def page_count(self) -> int:
    """Return the number of pages in the document, one per worksheet.

    Returns:
        The number of sheet names in the loaded workbook, or 0 when the
        backend is not valid or no workbook is available.
    """
    if not (self.is_valid() and self.workbook):
        return 0
    return len(self.workbook.sheetnames)
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
@ -117,6 +127,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# Access the sheet by name
sheet = self.workbook[sheet_name]
idx = self.workbook.index(sheet)
# TODO: check concept of Size as number of rows and cols
doc.add_page(page_no=idx + 1, size=Size())
self.parents[0] = doc.add_group(
parent=None,
@ -142,38 +155,50 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
tables = self._find_data_tables(sheet)
if self.workbook is not None:
tables = self._find_data_tables(sheet)
for excel_table in tables:
num_rows = excel_table.num_rows
num_cols = excel_table.num_cols
for excel_table in tables:
num_rows = excel_table.num_rows
num_cols = excel_table.num_cols
table_data = TableData(
num_rows=num_rows,
num_cols=num_cols,
table_cells=[],
)
for excel_cell in excel_table.data:
cell = TableCell(
text=excel_cell.text,
row_span=excel_cell.row_span,
col_span=excel_cell.col_span,
start_row_offset_idx=excel_cell.row,
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
start_col_offset_idx=excel_cell.col,
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
column_header=excel_cell.row == 0,
row_header=False,
table_data = TableData(
num_rows=num_rows,
num_cols=num_cols,
table_cells=[],
)
table_data.table_cells.append(cell)
doc.add_table(data=table_data, parent=self.parents[0])
for excel_cell in excel_table.data:
cell = TableCell(
text=excel_cell.text,
row_span=excel_cell.row_span,
col_span=excel_cell.col_span,
start_row_offset_idx=excel_cell.row,
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
start_col_offset_idx=excel_cell.col,
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
column_header=excel_cell.row == 0,
row_header=False,
)
table_data.table_cells.append(cell)
page_no = self.workbook.index(sheet) + 1
doc.add_table(
data=table_data,
parent=self.parents[0],
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT
),
),
)
return doc
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""
Find all compact rectangular data tables in a sheet.
"""
@ -327,18 +352,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore
try:
pil_image = PILImage.open(image.ref)
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")
if self.workbook is not None:
# Iterate over byte images in the sheet
for image in sheet._images: # type: ignore[attr-defined]
try:
pil_image = PILImage.open(image.ref)
page_no = self.workbook.index(sheet) + 1
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.TOPLEFT
),
),
)
except:
_log.error("could not extract the image from excel sheets")
return doc

View File

@ -97,7 +97,22 @@
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -122,7 +137,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 1,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -661,7 +691,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -1564,7 +1609,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -1955,7 +2015,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -2346,7 +2421,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -2813,7 +2903,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -3275,5 +3380,27 @@
],
"key_value_items": [],
"form_items": [],
"pages": {}
"pages": {
"1": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 1
},
"2": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 2
},
"3": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 3
}
}
}

View File

@ -1,13 +1,18 @@
import os
import logging
from pathlib import Path
import pytest
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
_log = logging.getLogger(__name__)
GENERATE = GEN_TEST_DATA
@ -28,13 +33,15 @@ def get_converter():
return converter
def test_e2e_xlsx_conversions():
@pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
# Each entry pairs a ground-truth path with its converted document, so the
# element type is a tuple, matching the fixture's declared return type.
documents: list[tuple[Path, DoclingDocument]] = []
xlsx_paths = get_xlsx_paths()
converter = get_converter()
for xlsx_path in xlsx_paths:
print(f"converting {xlsx_path}")
_log.debug(f"converting {xlsx_path}")
gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions():
doc: DoclingDocument = conv_result.document
assert doc, f"Failed to convert document from file {gt_path}"
documents.append((gt_path, doc))
return documents
def test_e2e_xlsx_conversions(documents):
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
@ -57,3 +72,15 @@ def test_e2e_xlsx_conversions():
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"
def test_page_count():
    """Check that the XLSX backend reports one page per worksheet.

    Builds the backend directly on the "test-01" sample workbook and expects
    a page count of 3.
    """
    candidates = [
        xlsx_path for xlsx_path in get_xlsx_paths() if xlsx_path.stem == "test-01"
    ]
    sample_path = candidates[0]
    input_doc = InputDocument(
        path_or_stream=sample_path,
        format=InputFormat.XLSX,
        filename=sample_path.stem,
        backend=MsExcelDocumentBackend,
    )
    xlsx_backend = MsExcelDocumentBackend(
        in_doc=input_doc, path_or_stream=sample_path
    )
    assert xlsx_backend.page_count() == 3