From cce18b2ff7f14ab0643364055a1ec8a77479a211 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 10 Oct 2025 15:06:38 +0200 Subject: [PATCH] fix: deal with chartsheets in workbooks (#2433) * fix(xlsx): deal with chartsheets in workbooks Signed-off-by: Cesar Berrospi Ramis * tests(xlsx): align test file names Signed-off-by: Cesar Berrospi Ramis --------- Signed-off-by: Cesar Berrospi Ramis --- docling/backend/msexcel_backend.py | 22 +- .../{test-01.xlsx.itxt => xlsx_01.xlsx.itxt} | 0 .../{test-01.xlsx.json => xlsx_01.xlsx.json} | 0 .../{test-01.xlsx.md => xlsx_01.xlsx.md} | 0 ...xt => xlsx_02_sample_sales_data.xlsm.itxt} | 0 ...on => xlsx_02_sample_sales_data.xlsm.json} | 0 ...m.md => xlsx_02_sample_sales_data.xlsm.md} | 0 .../docling_v2/xlsx_03_chartsheet.xlsx.itxt | 4 + .../docling_v2/xlsx_03_chartsheet.xlsx.json | 676 ++++++++++++++++++ .../docling_v2/xlsx_03_chartsheet.xlsx.md | 8 + .../data/xlsx/{test-01.xlsx => xlsx_01.xlsx} | Bin ...ta.xlsm => xlsx_02_sample_sales_data.xlsm} | Bin tests/data/xlsx/xlsx_03_chartsheet.xlsx | Bin 0 -> 11896 bytes tests/test_backend_msexcel.py | 19 +- 14 files changed, 718 insertions(+), 11 deletions(-) rename tests/data/groundtruth/docling_v2/{test-01.xlsx.itxt => xlsx_01.xlsx.itxt} (100%) rename tests/data/groundtruth/docling_v2/{test-01.xlsx.json => xlsx_01.xlsx.json} (100%) rename tests/data/groundtruth/docling_v2/{test-01.xlsx.md => xlsx_01.xlsx.md} (100%) rename tests/data/groundtruth/docling_v2/{sample_sales_data.xlsm.itxt => xlsx_02_sample_sales_data.xlsm.itxt} (100%) rename tests/data/groundtruth/docling_v2/{sample_sales_data.xlsm.json => xlsx_02_sample_sales_data.xlsm.json} (100%) rename tests/data/groundtruth/docling_v2/{sample_sales_data.xlsm.md => xlsx_02_sample_sales_data.xlsm.md} (100%) create mode 100644 tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt create mode 100644 tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json create mode 
100644 tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md rename tests/data/xlsx/{test-01.xlsx => xlsx_01.xlsx} (100%) rename tests/data/xlsx/{sample_sales_data.xlsm => xlsx_02_sample_sales_data.xlsm} (100%) create mode 100644 tests/data/xlsx/xlsx_03_chartsheet.xlsx diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 9ad2a360..90cac049 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -18,6 +18,7 @@ from docling_core.types.doc import ( TableData, ) from openpyxl import load_workbook +from openpyxl.chartsheet.chartsheet import Chartsheet from openpyxl.drawing.image import Image from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor from openpyxl.worksheet.worksheet import Worksheet @@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken if self.workbook is not None: # Iterate over all sheets - for sheet_name in self.workbook.sheetnames: - _log.info(f"Processing sheet: {sheet_name}") + for idx, name in enumerate(self.workbook.sheetnames): + _log.info(f"Processing sheet {idx}: {name}") - sheet = self.workbook[sheet_name] - page_no = self.workbook.index(sheet) + 1 + sheet = self.workbook[name] + page_no = idx + 1 # do not rely on sheet.max_column, sheet.max_row if there are images page = doc.add_page(page_no=page_no, size=Size(width=0, height=0)) self.parents[0] = doc.add_group( parent=None, label=GroupLabel.SECTION, - name=f"sheet: {sheet_name}", + name=f"sheet: {name}", content_layer=self._get_sheet_content_layer(sheet), ) doc = self._convert_sheet(doc, sheet) @@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken return doc - def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument: + def _convert_sheet( + self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet] + ) -> DoclingDocument: """Parse an Excel worksheet and attach its structure to a 
DoclingDocument Args: @@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken Returns: The updated DoclingDocument. """ + if isinstance(sheet, Worksheet): + doc = self._find_tables_in_sheet(doc, sheet) + doc = self._find_images_in_sheet(doc, sheet) - doc = self._find_tables_in_sheet(doc, sheet) - - doc = self._find_images_in_sheet(doc, sheet) + # TODO: parse charts in sheet return doc diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.itxt rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.json rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.md similarity index 100% rename from tests/data/groundtruth/docling_v2/test-01.xlsx.md rename to tests/data/groundtruth/docling_v2/xlsx_01.xlsx.md diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md 
b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.md similarity index 100% rename from tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md rename to tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.md diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt new file mode 100644 index 00000000..8b435a35 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt @@ -0,0 +1,4 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Duck Observations + item-2 at level 2: table with [7x3] + item-3 at level 1: section: group sheet: Duck Chart \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json new file mode 100644 index 00000000..449b5f40 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json @@ -0,0 +1,676 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", + "name": "xlsx_03_chartsheet", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 548415533138925042, + "filename": "xlsx_03_chartsheet.xlsx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "name": "sheet: Duck Observations", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", 
+ "name": "sheet: Duck Chart", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 3.0, + "b": 7.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Saltwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + 
"text": "80", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "95", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "100", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + 
"col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "110", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": 
"130", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 7, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Saltwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "80", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": 
false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "95", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "100", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + 
"end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "110", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "130", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 3.0, + "height": 7.0 + }, + "page_no": 1 + }, + "2": { + "size": 
{ + "width": 0.0, + "height": 0.0 + }, + "page_no": 2 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md new file mode 100644 index 00000000..3638d491 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md @@ -0,0 +1,8 @@ +| Year | Freshwater Ducks | Saltwater Ducks | +|--------|--------------------|-------------------| +| 2019 | 120 | 80 | +| 2020 | 135 | 95 | +| 2021 | 150 | 100 | +| 2022 | 170 | 110 | +| 2023 | 160 | 120 | +| 2024 | 180 | 130 | \ No newline at end of file diff --git a/tests/data/xlsx/test-01.xlsx b/tests/data/xlsx/xlsx_01.xlsx similarity index 100% rename from tests/data/xlsx/test-01.xlsx rename to tests/data/xlsx/xlsx_01.xlsx diff --git a/tests/data/xlsx/sample_sales_data.xlsm b/tests/data/xlsx/xlsx_02_sample_sales_data.xlsm similarity index 100% rename from tests/data/xlsx/sample_sales_data.xlsm rename to tests/data/xlsx/xlsx_02_sample_sales_data.xlsm diff --git a/tests/data/xlsx/xlsx_03_chartsheet.xlsx b/tests/data/xlsx/xlsx_03_chartsheet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..366859a27c6db40a021077f95505028a17569f38 GIT binary patch literal 11896 zcmeHN1y@{Kvc}zAgS)#12<{djxVyVML4vzO2oT)e3GVLh5Zs-F*GcY7a+y27;LTa9 z`*g3<^{uYGt7?BG@=~B+Xh0A^P(VOHL_i)x8(0CrKtKo(KtQNKP#_vYHr9?t){Z(# zZnj1aTJ)|~mW0`0Ae31^Ab{Wh@A`kd0yXhNvR#ZwBDdmCLfdqb%T)qUY{&k6DAWpk zU*_X_&Gj{2-Co~HFGE5FLQGeDh?{pf9WXQ&jg!$Lqd6-m9avz*tpE$JkxX+NT{V{5 zr;}oY$`Lk#-?oJ9C*K+ho3K0tLJeh4u4V>v>EjwiG51Sx(lyPmXv6S(^(~!;2vZ;7 z$(^xbV=@(pJonRt?$!h-n!Z}#ww+M93+XFr!+~!nx?2e>8=9X#LkSPQ6ZLYCxJAGg z>19zh4}SN)!?jqb`m(x%?<7!htJdF;+S3(-P47&mXKt2ZwN%S6UqqeKO2HI*aL`Bw zyMNpb2}a6S{+*g;wYsUpI^PBXB1zOxg-P`8W40}1FfA3`GxUYo_#2dh3*WxO`+*Q< z{2;74blo*@B$4KHs{JhOfc#g-`0K^akii%FD}-m?M(*9#V80c8b)}717UZ=7- zSS+kZF!Of1d;G``S3u6~L<+`d${ATur^3%YdBoE^ot07sqZGywHRbsP@X+UHP$2og 
z`EIQ;BgqxOAf*AW2@CLD9eX272L}2d@Bj1J|HqE{hw3GJJT#G8qfynQ8>5o;rINO4!%De#b#Z~{QZd|JJq2A5WNBM$nBZZ=s^3bvkLu8dv0Pk9Yf16m%?f7sNX#4WYhZJbj@#ApIRkE^KWJLPT zXZGB2;bP81LKTKnl!ys$#$c+k6cr}{S1Eop?Ce`~;Iqa>R1?pjh3l6wG%)ts<@S!A z-o?m|y;HSN*?$4MN4!AEM6ndb^U`AxSJx2#9X3M^3UnJLh7JP1cf-7SwYF)iC{UKA zzr$WpGE&-Z_V$hRJ3&8_LYV>lWMdIaBd{?``u8-jDDpTlu~LB=@8??}gcy>T(G4^=bxOgT4Y<8*|;tb)9 z{Dd7y_LbQ0kW`84ZBD6n6N@i7-G1B?Hd#1+g072tz#P%_2V~rR4_95IY>ui>1l0*t ztejI(1Hw{K8YZ;U2-6Ke4c}l)DYcL#m8Ot)GcMW>vglao%S{sG8T}}p2zjfaIf!{> zWfp3_NfTqN-G;8wm&?Aut?kt5jG;VClY@TKH`8l0#Sk9x7D!AFRNrSOp&EDyiNqmQ zGJ8Gs_Lg@F*hTyYA_AYpsWA*o>Wjv0efn88&fg|H2Vc1!uuw!>4HbC#q;Mgz*in=> zvOukxY?aGz>Ue^J2iF-tK5_K0^eZET2Y7!S^DB8>4MxD^PwlGy z)-`n}1ldir5IbC;bRovPo40I=?V%C~*>VvXvTJ@!PRBGT=7Sd}%TTi1>GnG1Uhiuj zV3I>yVpH|C9%;9jQTU6r;Jk?Hw6iWBfPMaFnD3+I3kL?+wge~;5DpL&Fu;O;MfyMO z_+L>U7{Js3zy1GxmBaycSHr-jSfI2sEys7~n-HwF{RFnCN`#BGlaxXR0`tJyWiA2nZmD01CR%*oM6s(>v9tU z!e_95G9byv&5=Vu2=ssi0>T1V#$O@O!PLme(ShOTjp;`$OplYX$zenaI;FYhS8am? zHEV|US5huA_b6^!IbKL6Qay*X;L$01SaTtSR~jUvX70_b{OHpA>Y}+-#SnH>$^s@; zn2AjHeQB;5`qkLsmZ9he#APK-HBj)^zWFX;#RT`GO|>>aAaDH7!L0;MS$bPc1sn(# zFyP$H4_Og8ooIG7AX^SN%z7p{+l*4BgreRLYjsvQ(Sh@-R!-=4B5w+u`<_3F)_oVA z+C)j;wQe<|=sdtc{;b4|=T1qcZYW1%fX{%w+bUPqNaztv*W=i)G0Jb&w#H2S z*K!XxD|znuz7m&{uG%J{+sR{I@eL&MQ)z5_eS1o%xYX$-J-gtbL;{v4`IJjDHo53j ze+DhXgyPknMPiS2d3z3RAncHYYq`-#ZmUyFanBajJsa<a$E0NDqy6n4%2f$f5xlSebg`rV6oiOEaj4O?sx@yJRYQfu(bd82i~35iC)p|V*bx3S z*4zr;;?o5Yx%Y0kauTaZ83E8EQkcy`XeV|gnFWYQF%5T-3auGt@S9>zwE-hQe zuSGC;TT)$J$&v!%0TbVn?sn^G%r>aW{rJt`I1zH}MG3kR!#fb+s%SCRFpZ83@6$n{L15Qx1G)~9Eko-}ZxtEdOXHbNb}#5vOcWF$ z+Y|FhX9*3f1D}1kQ~1a$53H|B?K%NNy^C66sjbz4+e4n0ctm%6m?BqMK93Zk0xyS* zXe?das(v|m6W#{QSQ~Qa+?-+UBsvCUi{+oElE^%wZqvG$IZ5kxG$*o{A=#?va8CAM z0n5y>bnM)!*028oC;FSZ#+UMUIJKq9kTwDsIyztG7Bwj!fu#jh6X( zgZWnR)49ryBOMA-r?MO^*Cawnc3*X+bTSQu2tMEwAa72by!sre1@E-$Ppc#jhy$}p2u9M9FPZJr zqPU@D9mHquUi&2pDJ!PJYEUf2KPpaUjj|o^kx?< z%6^kEyJ^xNrc~N;>2Y=Ew(6MJmimx^O6H1vqND0sf6lo?f^--LIo90tSat&OeBabe 
zn>40wwFi!oy6vW8*|hCw7RQ?#j$Z&x!tr@Ojtmb&0TDw6)F!WCVLYDVtd{ZwfgWT2 z=#77IRDmP{O1e;#78{8UE$n2aFjo@eQi>6aXDn*X2&crS5%xGetueT|SK=kkZiY#7 zgVh`1;VEbaEIzW>rDh-|Ms?0R$H~g2N>U@cMJjCBDT;dq>~FC>8wzk}M@yi!p!`?C zOO}0+wT2L?p^Zr=o$DPgr|D9!1DnLjIf(5jeILMad*&Kh^ zT>)lZ+y|6uc9sb1J3G~v$TEuoTX8{UXJ@mXWd$q&1<89=WV>i#G1D)-`UC-wP6hsa z$O*PQder{jzQwFoR+ocxqau?D3>9#rWd?~94V3lVBKawn8{|v zJnhuvI}G9p_E^tjN)vm~nwTXbi=afy+c4pdZFL~@N}ZozcA()sK=l>ij8YUBB2|0SFRUb?nW9k% zxgA~6R^|uqRR>AwC(kjHO9=|$!u)-Ck2nCL(%SxeVMdh9?qwZh6Hf(Q;)+kNcCHHb z!{?l={nwAua%?n^_qrKlBWu3M0xa%goKgjM)IgG71C|0G(!qgb_ z+LzknX$4|2DetEiJ4LqhC3L`%92OK6j)(5Jj?uPJt+o%9EWzG1=nWJ3%Cq<3?$tR6 zTGr~tF4D9b!L+e1iXZ4&%ywJD=hR*Y=Mvgs$$rSy+yK_2FMbA&ndYru%9^y7!q>6u zSZV9)6j!-?tep%h1Cu|_hAx%Il#vPNc95R*(Akb@VO~7w)dHrUkm;hiiM5tg6f9;z zA#6vWk`%|%nO;qKM=TSf$8)mNB{FNeLtErnj28ke>;=0!Ch7C;^YeCwkW$P7fvFg{ zE3tHF$)q6ewgC56%~J)XDx%Fqm^S9+U^XTDN`mD|mn(CEP14|14~lbMgs?y%%@EY$ zc~l^=7(dTQY~@Y}{G#p6ho(nk}Nuji^?sR&&^YfKAGuRgE^l(7sufeuy=^Ye^>uavgrI-KRVhj(&C-#+7vV z)4b4!C=;DXu{1w#{u}|1;nxxQ8-1iO0T3>eVu8KdNYHgYRQ`AggnRq}n;id0fO9U^LTdp}E&(sJzw8-6 zfQ<~_I@+6AoBT9u%gCWn5GFwRAN0FWL2I5emT+tX);Z*K4{g{94v%#HWL}ZiQ!8DU zW%o{FFzB3>bs-BTH42MI@)$K^x05yTM_xR{=6#rOO8WpU;|p5~Kbd4tPGmNNq0LXq z$8)oiD~f_CJDIMm{Eej3$U_5?I#9S~A6T876_h`+>lrdSy;6Nm+X06+2I}?A!4@T8 z>P$v8eP;bA?zv8j$gI3TY1p~IXJ_WLa;knYGQaC*`tXuX#=?CWI0x{2Q9zpfS0Zd^ zujleZLx#UT{7QtC6l|6Vk-R_FJo~!3gyWCNlyJ{XcE9~>e$aITiq*@M3JfcL@S`T^ zH-Kj3F+T{($9b}K@`$juDq}A|VO=s_)VtuX+l@X|Dj$sF^E^vR6JC;2^f^GKrI%rO zvw5TsvHq<^kx4TP6i|GruG-dWW2NNVA$R&ht*D$DaJ0)#cT@JPTF*xF7D?I;5n z)jKkxW<_A}BMQzfrnZ;l9`nR;Dq{UD!wj1lW+mbn-aS6$HI60iq(zMmh5}n4fO(jX zSsLvScuD}fsjoDGPchGEQDE_3A|9ED;M~EewZ(Fx+?72Eqkvd~h50IZx43anMt{4< zEfV*lc^_&b+R?K6UL^^Q&-`Y}y#&kEi~Hy~W4@4O@)BZ4(+v9pMp6_$z#aGi56$81 zd}3Ka%;fuXvs20(>Xoi?MK;nAz7-FNh_8>kNOdyLNMtMx2R~U?@HlE6W=>DPQrqLO zP@2dLJvG9?H*ykrM=CibJ0#y@Xl@#(Z+U`Q&)}Bc0Au*i=zds0b72Jz1hfRmIR5g6 zU+Eyj&)cu8Lv_h|i4)07NB8*+4cON>3yC&TWPQxKbTpATa??qSW}A57rDgeCNyW%k 
zk!P=9%z>otG)IpP{Rv*qNmI^LE?(a4uiHIUS*swh5}deq8<$^h8HZeD=PlRUo4G9T z?=zJ`c0%lfAiu*%J@=f=>Bv@brjdLqlZw|v^k>920re?U!meLpPTK+**lX#A%am!P zjP}=Hve*Mzd2{ApmqYlqUIu9mT;Q2lzz?dpFDyTNZ;EQ*?7fMyUrLo4lynRT*min+ zRb*nI7xGG&YP7WNK>mSSZ&)3?3MP0*r*?CpITYcJ_{JIMS9e%pNbmLf+8AQ?YKUR= z)3Q>9v(Oz*xdvTEeBAFM7*PeC{%;E2vw*Adw1iR+X%pIMB+!Z1zbAex_(46T>e~i% zx9l$7+1cu56gsjh!V~RL8PQj7DxeO%#{&FDw?O?aYucGmeGPdm;ig6BOYOK$>}*z% zh02`m^%MIwzx0&T-U@h1^N<|hK(T!BKpIcBK zbRzR@BIKHxu$voDGBYdv)`l@cXU;%BHF6YIX`C`AsM6Bmqn?CQ)ht!0{fP|yM{LTjJkK$WiSyvGQX9Gnr?HQcY+Okf_Cj1j-8nl^68 zMA8#8N>@)D@iitZ;Wx=k05jfL=F=psl;qsexC)@^kQc&;^K482!Q^t@z%RE_R{T)! zS|lbNJaz(nbyOd4{-&UVk$yd$P#*SJy9jL7*Iv4kDq%3-_8=kWHAWXp*4HNl6@SKg zDST7C3OcssRDLG`4+BLrq$KMyDi&0R+*CrYdYkxk6i_hfB=535To=lW4(Y;oQyi`k zQ!TcJ&dMhJUq*^HKMuXZ7e4R!x?WL;FUnf0@j==(GMIr03MjjG1S-GsWOK>~i$FeW z%SxKH45MpY4+~tOVvIulFS+(kvbi;8)7S63@N+q#}lxm1LUYt5EEva{xL(9%YD z(t##UYFENLXy!A1vW!uMXwnrGn@N5vr+}?oHojxw(y1;}6np|__YBhemdX_GbCt!F zd65UZp8h_Vc6;VKDqEjY@BAezkyh(zoa;^n8u16QjH#)u9EtYFg(EgKIIgrE&3B*P z)y>psnZX_xEk?7OxMkOn=@41Xg%o`pWIywvUW$vSn;1;=G%;FI4t-1;INxf*;4QP* z*lMcNF~HS2eDZOmd*ap_8hcAUV&^`3I(iOQ{lvD)Yw8cu8Fven+8)s>yo8dwnx+33 zl5;$#%4^!*Njyb2V4ZH4kcXxd*Um2FcfnpbB7g6xjHnt8oo)Hb zaYLMud2jzw$S8<1-I)zV6G?n3ebTB7To2Q9zb*v6ZyB$?{@~3J&51g|(C)V$!fmEy zzDVqs$ge)6bDgccLB;XZ?!e1Lbq;g5gVLp;9sj7g9DQ^H@*oxl5g-S2LZtUdK#c|V zWIGU2HmP268ierole*)V9J{+a&v+VwkBemVi z>**Nm_JRW9Jy;lQD-LydE8oW3`z_|j8!-EbWWU?bgj{E+Qd2G8IAvQ4L(#Y?`AX9Y z%B@wN)2Bf4}9VsSHTlkFn8aUDLF~AC? z>O9}ZhTh+Q-O42Cn{nPEf4;&p)x~;7kRk?QKei3zoLK{F-wXyAg(HU8sd$$(+@pJA z!6>W#pxiQ|pGY3(?07z!vt;GInP4Jc^ia}lAN1SA%D2EH=sb#t+4BW=1HKPoRI(~U zybCD8EWt-lACH%u86D4d*<+;mY2zdA`bU^i48$Y?;!pA0gB5rCvkoM^luLB}QGEpA ztF;!u6EcAMB)`=EH;MVB`=3lZR5@l^+H+$XhnvB*UYMTD$1R#Hya|+ph}V%;sK*gp*(FTv$=`mqoO8Ue8-i*MzC3p5ingFiUT0_UpNXn&1N%i3Kye@dze5J}!VGN;EXG#iQYeRf=DV167Qe3xf^@rc+leXM$eyl61SI zP?~Fo#EdK0Jw8EDD|>m+Ldhc=Uf2Qk`gg7gEZr5vL`#UU;K+-hHhBa!N(9W;c00=1 zJn0MQ)XuPFg&)D_Utz&L5|tA9thpK1MA+n#%fF+NzkXIESACpEm?$t~yRV-4XBSqt`54m! 
zu;O)q6(jv=#d@~3|1)BM_5O90yww6!Qqa5zj`*QkIqaJhMWdBx4RZnlgUF9FmG8MR zD#>z+lMUOw=h*Esz;EOqocR&N_A))lP7#y`aG*?ErS2VLZ6QYQhMTMgKXUMwwb=(L zG)pI~z8Q@2eUkP1PWrB64k;VlPU@ZGgQzelec>Y7xl+tfi<(nuwFhopD$;Uo^enPhDZx$Bjip_Elg z*elqZZ@O9W`fzc%4)DeVmb^v6p?1`BqDhST6iQ`MU{tFx7r>4dYm!U8lI(V~z90A) zYkc~@;Ermu3DVH+L?Ea%i-TyWesr``Gk`m&N93UeB9?sP+cqC)aKu^1VZI1VdB;$( zmx{^l3FcN@_Ayrb^+z8A1L(Wt^2H?jK^B}kJ{b<~h(VZ% zu-ALL$7}W=)}zSN8YmD!l=@gSds5)N%CLY+_*EB%N>=%Ps0p8M4!3QQ1&of# literal 0 HcmV?d00001 diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 25004c7a..d4d790c0 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -79,7 +79,7 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - path = next(item for item in get_excel_paths() if item.stem == "test-01") + path = next(item for item in get_excel_paths() if item.stem == "xlsx_01") in_doc = InputDocument( path_or_stream=path, format=InputFormat.XLSX, @@ -90,7 +90,7 @@ def test_pages(documents) -> None: assert backend.page_count() == 4 # number of pages from the converted document - doc = next(item for path, item in documents if path.stem == "test-01") + doc = next(item for path, item in documents if path.stem == "xlsx_01") assert len(doc.pages) == 4 # page sizes as number of cells @@ -98,3 +98,18 @@ def test_pages(documents) -> None: assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0) + + +def test_chartsheet(documents) -> None: + """Test the conversion of Chartsheets. + + Args: + documents: The paths and converted documents. 
+ """ + doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet") + assert len(doc.pages) == 2 + + # Chartsheet content is ignored for now + assert doc.groups[1].name == "sheet: Duck Chart" + assert doc.pages[2].size.height == 0 + assert doc.pages[2].size.width == 0