diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 9ad2a360..90cac049 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -18,6 +18,7 @@ from docling_core.types.doc import ( TableData, ) from openpyxl import load_workbook +from openpyxl.chartsheet.chartsheet import Chartsheet from openpyxl.drawing.image import Image from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor from openpyxl.worksheet.worksheet import Worksheet @@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken if self.workbook is not None: # Iterate over all sheets - for sheet_name in self.workbook.sheetnames: - _log.info(f"Processing sheet: {sheet_name}") + for idx, name in enumerate(self.workbook.sheetnames): + _log.info(f"Processing sheet {idx}: {name}") - sheet = self.workbook[sheet_name] - page_no = self.workbook.index(sheet) + 1 + sheet = self.workbook[name] + page_no = idx + 1 # do not rely on sheet.max_column, sheet.max_row if there are images page = doc.add_page(page_no=page_no, size=Size(width=0, height=0)) self.parents[0] = doc.add_group( parent=None, label=GroupLabel.SECTION, - name=f"sheet: {sheet_name}", + name=f"sheet: {name}", content_layer=self._get_sheet_content_layer(sheet), ) doc = self._convert_sheet(doc, sheet) @@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken return doc - def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument: + def _convert_sheet( + self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet] + ) -> DoclingDocument: """Parse an Excel worksheet and attach its structure to a DoclingDocument Args: @@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken Returns: The updated DoclingDocument. """ + if isinstance(sheet, Worksheet): + doc = self._find_tables_in_sheet(doc, sheet) + doc = self._find_images_in_sheet(doc, sheet) - doc = self._find_tables_in_sheet(doc, sheet) - - doc = self._find_images_in_sheet(doc, sheet) + # TODO: parse charts in sheet return doc diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.itxt rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.json rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.md similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.md rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.md diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.md similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.md diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt new file mode 100644 index 00000000..8b435a35 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt @@ -0,0 +1,4 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Duck Observations + item-2 at level 2: table with [7x3] + item-3 at level 1: section: group sheet: Duck Chart \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json new file mode 100644 index 00000000..449b5f40 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json @@ -0,0 +1,676 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", + "name": "xlsx_03_chartsheet", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 548415533138925042, + "filename": "xlsx_03_chartsheet.xlsx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "name": "sheet: Duck Observations", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "name": "sheet: Duck Chart", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 3.0, + "b": 7.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Saltwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "80", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "95", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "100", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "110", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "130", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 7, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Saltwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "80", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "95", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "100", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "110", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "130", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 3.0, + "height": 7.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 0.0, + "height": 0.0 + }, + "page_no": 2 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md new file mode 100644 index 00000000..3638d491 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md @@ -0,0 +1,8 @@ +| Year | Freshwater Ducks | Saltwater Ducks | +|--------|--------------------|-------------------| +| 2019 | 120 | 80 | +| 2020 | 135 | 95 | +| 2021 | 150 | 100 | +| 2022 | 170 | 110 | +| 2023 | 160 | 120 | +| 2024 | 180 | 130 | \ No newline at end of file diff --git a/tests/data/xlsx/test-01.xlsx b/tests/data/xlsx/xlsx_01.xlsx similarity index 100% rename from tests/data/xlsx/test-01.xlsx rename to tests/data/xlsx/xlsx_01.xlsx diff --git a/tests/data/xlsx/sample_sales_data.xlsm b/tests/data/xlsx/xlsx_02_sample_sales_data.xlsm similarity index 100% rename from tests/data/xlsx/sample_sales_data.xlsm rename to tests/data/xlsx/xlsx_02_sample_sales_data.xlsm diff --git a/tests/data/xlsx/xlsx_03_chartsheet.xlsx b/tests/data/xlsx/xlsx_03_chartsheet.xlsx new file mode 100644 index 00000000..366859a2 Binary files /dev/null and b/tests/data/xlsx/xlsx_03_chartsheet.xlsx differ diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 25004c7a..d4d790c0 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -79,7 +79,7 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - path = next(item for item in get_excel_paths() if item.stem == "test-01") + path = next(item for item in get_excel_paths() if item.stem == "xlsx_01") in_doc = InputDocument( path_or_stream=path, format=InputFormat.XLSX, @@ -90,7 +90,7 @@ def test_pages(documents) -> None: assert backend.page_count() == 4 # number of pages from the converted document - doc = next(item for path, item in documents if path.stem == "test-01") + doc = next(item for path, item in documents if path.stem == "xlsx_01") assert len(doc.pages) == 4 # page sizes as number of cells @@ -98,3 +98,18 @@ def test_pages(documents) -> None: assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0) + + +def test_chartsheet(documents) -> None: + """Test the conversion of Chartsheets. + + Args: + documents: The paths and converted documents. + """ + doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet") + assert len(doc.pages) == 2 + + # Chartseet content is for now ignored + assert doc.groups[1].name == "sheet: Duck Chart" + assert doc.pages[2].size.height == 0 + assert doc.pages[2].size.width == 0