feat(xlsx): create a page for each worksheet in XLSX backend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-04-08 14:04:11 +02:00
parent e813f02943
commit 06a0ae8294
3 changed files with 247 additions and 61 deletions

View File

@ -1,31 +1,35 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Union from typing import Any, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupLabel, GroupLabel,
ImageRef, ImageRef,
ProvenanceItem,
Size,
TableCell, TableCell,
TableData, TableData,
) )
from openpyxl import load_workbook from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image as PILImage
from pydantic import BaseModel
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
from typing import Any, List
from PIL import Image as PILImage
from pydantic import BaseModel
class ExcelCell(BaseModel): class ExcelCell(BaseModel):
row: int row: int
@ -38,10 +42,10 @@ class ExcelCell(BaseModel):
class ExcelTable(BaseModel): class ExcelTable(BaseModel):
num_rows: int num_rows: int
num_cols: int num_cols: int
data: List[ExcelCell] data: list[ExcelCell]
class MsExcelDocumentBackend(DeclarativeDocumentBackend): class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
@override @override
def __init__( def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@ -63,12 +67,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(self.path_or_stream, Path): elif isinstance(self.path_or_stream, Path):
self.workbook = load_workbook(filename=str(self.path_or_stream)) self.workbook = load_workbook(filename=str(self.path_or_stream))
self.valid = True self.valid = self.workbook is not None
except Exception as e: except Exception as e:
self.valid = False self.valid = False
raise RuntimeError( raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
@override @override
@ -81,6 +85,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return True return True
def page_count(self) -> int:
    """Return the number of pages, counting one page per worksheet.

    Returns:
        The number of sheet names in the loaded workbook, or 0 when the
        backend is not valid or no workbook is loaded.
    """
    # Guard clause: nothing to count without a valid, loaded workbook.
    if not (self.is_valid() and self.workbook):
        return 0
    return len(self.workbook.sheetnames)
@classmethod @classmethod
@override @override
def supported_formats(cls) -> set[InputFormat]: def supported_formats(cls) -> set[InputFormat]:
@ -117,6 +127,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
# Access the sheet by name # Access the sheet by name
sheet = self.workbook[sheet_name] sheet = self.workbook[sheet_name]
idx = self.workbook.index(sheet)
# TODO: check concept of Size as number of rows and cols
doc.add_page(page_no=idx + 1, size=Size())
self.parents[0] = doc.add_group( self.parents[0] = doc.add_group(
parent=None, parent=None,
@ -142,6 +155,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument: ) -> DoclingDocument:
if self.workbook is not None:
tables = self._find_data_tables(sheet) tables = self._find_data_tables(sheet)
for excel_table in tables: for excel_table in tables:
@ -169,11 +183,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
) )
table_data.table_cells.append(cell) table_data.table_cells.append(cell)
doc.add_table(data=table_data, parent=self.parents[0]) page_no = self.workbook.index(sheet) + 1
doc.add_table(
data=table_data,
parent=self.parents[0],
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT
),
),
)
return doc return doc
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]: def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
""" """
Find all compact rectangular data tables in a sheet. Find all compact rectangular data tables in a sheet.
""" """
@ -327,16 +352,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument: ) -> DoclingDocument:
if self.workbook is not None:
# Iterate over byte images in the sheet # Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore for image in sheet._images: # type: ignore[attr-defined]
try: try:
pil_image = PILImage.open(image.ref) pil_image = PILImage.open(image.ref)
page_no = self.workbook.index(sheet) + 1
doc.add_picture( doc.add_picture(
parent=self.parents[0], parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72), image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None, caption=None,
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.TOPLEFT
),
),
) )
except: except:
_log.error("could not extract the image from excel sheets") _log.error("could not extract the image from excel sheets")

View File

@ -97,7 +97,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "picture", "label": "picture",
"prov": [], "prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -122,7 +137,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 1,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -661,7 +691,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -1564,7 +1609,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -1955,7 +2015,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -2346,7 +2421,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -2813,7 +2903,22 @@
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "table", "label": "table",
"prov": [], "prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [], "captions": [],
"references": [], "references": [],
"footnotes": [], "footnotes": [],
@ -3275,5 +3380,27 @@
], ],
"key_value_items": [], "key_value_items": [],
"form_items": [], "form_items": [],
"pages": {} "pages": {
"1": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 1
},
"2": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 2
},
"3": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 3
}
}
} }

View File

@ -1,13 +1,18 @@
import os import logging
from pathlib import Path from pathlib import Path
import pytest
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export from .verify_utils import verify_document, verify_export
_log = logging.getLogger(__name__)
GENERATE = GEN_TEST_DATA GENERATE = GEN_TEST_DATA
@ -28,13 +33,15 @@ def get_converter():
return converter return converter
def test_e2e_xlsx_conversions(): @pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
documents: list[dict[Path, DoclingDocument]] = []
xlsx_paths = get_xlsx_paths() xlsx_paths = get_xlsx_paths()
converter = get_converter() converter = get_converter()
for xlsx_path in xlsx_paths: for xlsx_path in xlsx_paths:
print(f"converting {xlsx_path}") _log.debug(f"converting {xlsx_path}")
gt_path = ( gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions():
doc: DoclingDocument = conv_result.document doc: DoclingDocument = conv_result.document
assert doc, f"Failed to convert document from file {gt_path}"
documents.append((gt_path, doc))
return documents
def test_e2e_xlsx_conversions(documents):
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown() pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
@ -57,3 +72,15 @@ def test_e2e_xlsx_conversions():
assert verify_document( assert verify_document(
doc, str(gt_path) + ".json", GENERATE doc, str(gt_path) + ".json", GENERATE
), "document document" ), "document document"
def test_page_count():
    """Check that the XLSX backend reports one page per worksheet.

    Loads the test input whose stem is ``test-01`` directly through
    ``MsExcelDocumentBackend`` and checks the reported page count.
    """
    matches = [item for item in get_xlsx_paths() if item.stem == "test-01"]
    # Fail with a readable message instead of a bare IndexError when the
    # expected input file is missing from the test data.
    assert matches, "no XLSX test input with stem 'test-01' was found"
    path = matches[0]

    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.XLSX,
        filename=path.stem,
        backend=MsExcelDocumentBackend,
    )
    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
    # The expected value 3 is taken from the original assertion for this input.
    assert backend.page_count() == 3, "unexpected page count for test-01"