From 06a0ae8294fbc1106a4448131bc1bde73c3d7db8 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 8 Apr 2025 14:04:11 +0200 Subject: [PATCH] feat(xlsx): create a page for each worksheet in XLSX backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msexcel_backend.py | 130 ++++++++++------ .../groundtruth/docling_v2/test-01.xlsx.json | 143 +++++++++++++++++- tests/test_backend_msexcel.py | 35 ++++- 3 files changed, 247 insertions(+), 61 deletions(-) diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index d1cac1bd..fae6c783 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -1,31 +1,35 @@ import logging from io import BytesIO from pathlib import Path -from typing import Union +from typing import Any, Union from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, + ProvenanceItem, + Size, TableCell, TableData, ) from openpyxl import load_workbook from openpyxl.worksheet.worksheet import Worksheet +from PIL import Image as PILImage +from pydantic import BaseModel from typing_extensions import override -from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.abstract_backend import ( + DeclarativeDocumentBackend, + PaginatedDocumentBackend, +) from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) -from typing import Any, List - -from PIL import Image as PILImage -from pydantic import BaseModel - class ExcelCell(BaseModel): row: int @@ -38,10 +42,10 @@ class ExcelCell(BaseModel): class ExcelTable(BaseModel): num_rows: int num_cols: int - data: List[ExcelCell] + data: list[ExcelCell] -class MsExcelDocumentBackend(DeclarativeDocumentBackend): +class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): @override def __init__( self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] @@ -63,12 +67,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): elif isinstance(self.path_or_stream, Path): self.workbook = load_workbook(filename=str(self.path_or_stream)) - self.valid = True + self.valid = self.workbook is not None except Exception as e: self.valid = False raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsExcelDocumentBackend could not load document with hash {self.document_hash}" ) from e @override @@ -81,6 +85,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): def supports_pagination(cls) -> bool: return True + def page_count(self) -> int: + if self.is_valid() and self.workbook: + return len(self.workbook.sheetnames) + else: + return 0 + @classmethod @override def supported_formats(cls) -> set[InputFormat]: @@ -117,6 +127,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): # Access the sheet by name sheet = self.workbook[sheet_name] + idx = self.workbook.index(sheet) + # TODO: check concept of Size as number of rows and cols + doc.add_page(page_no=idx + 1, size=Size()) self.parents[0] = doc.add_group( parent=None, @@ -142,38 +155,50 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): self, doc: DoclingDocument, sheet: Worksheet ) -> DoclingDocument: - tables = self._find_data_tables(sheet) + if self.workbook is not None: + tables = self._find_data_tables(sheet) - for excel_table in tables: - num_rows = excel_table.num_rows - num_cols = excel_table.num_cols + for excel_table in tables: + num_rows = excel_table.num_rows + num_cols = excel_table.num_cols - table_data = TableData( - num_rows=num_rows, - num_cols=num_cols, - table_cells=[], - ) - - for excel_cell in excel_table.data: - - cell = TableCell( - text=excel_cell.text, - row_span=excel_cell.row_span, - col_span=excel_cell.col_span, - start_row_offset_idx=excel_cell.row, - end_row_offset_idx=excel_cell.row + excel_cell.row_span, - start_col_offset_idx=excel_cell.col, - end_col_offset_idx=excel_cell.col + excel_cell.col_span, - column_header=excel_cell.row == 0, - row_header=False, + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], ) - table_data.table_cells.append(cell) - doc.add_table(data=table_data, parent=self.parents[0]) + for excel_cell in excel_table.data: + + cell = TableCell( + text=excel_cell.text, + row_span=excel_cell.row_span, + col_span=excel_cell.col_span, + start_row_offset_idx=excel_cell.row, + end_row_offset_idx=excel_cell.row + excel_cell.row_span, + start_col_offset_idx=excel_cell.col, + end_col_offset_idx=excel_cell.col + excel_cell.col_span, + column_header=excel_cell.row == 0, + row_header=False, + ) + table_data.table_cells.append(cell) + + page_no = self.workbook.index(sheet) + 1 + doc.add_table( + data=table_data, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + (0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT + ), + ), + ) return doc - def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]: + def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]: """ Find all compact rectangular data tables in a sheet. """ @@ -327,18 +352,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): self, doc: DoclingDocument, sheet: Worksheet ) -> DoclingDocument: - # Iterate over byte images in the sheet - for idx, image in enumerate(sheet._images): # type: ignore - - try: - pil_image = PILImage.open(image.ref) - - doc.add_picture( - parent=self.parents[0], - image=ImageRef.from_pil(image=pil_image, dpi=72), - caption=None, - ) - except: - _log.error("could not extract the image from excel sheets") + if self.workbook is not None: + # Iterate over byte images in the sheet + for image in sheet._images: # type: ignore[attr-defined] + try: + pil_image = PILImage.open(image.ref) + page_no = self.workbook.index(sheet) + 1 + doc.add_picture( + parent=self.parents[0], + image=ImageRef.from_pil(image=pil_image, dpi=72), + caption=None, + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + (0, 0, 0, 0), origin=CoordOrigin.TOPLEFT + ), + ), + ) + except: + _log.error("could not extract the image from excel sheets") return doc diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index 173cd5fb..bf6c19c6 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -97,7 +97,22 @@ "children": [], "content_layer": "body", "label": "picture", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -122,7 +137,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -661,7 +691,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -1564,7 +1609,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -1955,7 +2015,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -2346,7 +2421,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -2813,7 +2903,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -3275,5 +3380,27 @@ ], "key_value_items": [], "form_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 0.0, + "height": 0.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 0.0, + "height": 0.0 + }, + "page_no": 2 + }, + "3": { + "size": { + "width": 0.0, + "height": 0.0 + }, + "page_no": 3 + } + } } \ No newline at end of file diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 1844dff1..86b0c013 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -1,13 +1,18 @@ -import os +import logging from pathlib import Path +import pytest + +from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import ConversionResult, DoclingDocument +from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument from docling.document_converter import DocumentConverter from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export +_log = logging.getLogger(__name__) + GENERATE = GEN_TEST_DATA @@ -28,13 +33,15 @@ def get_converter(): return converter -def test_e2e_xlsx_conversions(): +@pytest.fixture(scope="module") +def documents() -> list[tuple[Path, DoclingDocument]]: + documents: list[dict[Path, DoclingDocument]] = [] xlsx_paths = get_xlsx_paths() converter = get_converter() for xlsx_path in xlsx_paths: - print(f"converting {xlsx_path}") + _log.debug(f"converting {xlsx_path}") gt_path = ( xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name @@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions(): doc: DoclingDocument = conv_result.document + assert doc, f"Failed to convert document from file {gt_path}" + documents.append((gt_path, doc)) + + return documents + + +def test_e2e_xlsx_conversions(documents): + for gt_path, doc in documents: pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" @@ -57,3 +72,15 @@ def test_e2e_xlsx_conversions(): assert verify_document( doc, str(gt_path) + ".json", GENERATE ), "document document" + + +def test_page_count(): + path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0] + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.XLSX, + filename=path.stem, + backend=MsExcelDocumentBackend, + ) + backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) + assert backend.page_count() == 3