diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 220c05fa..9ad2a360 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -1,10 +1,11 @@ import logging from io import BytesIO from pathlib import Path -from typing import Any, Union, cast +from typing import Any, Optional, Union, cast from docling_core.types.doc import ( BoundingBox, + ContentLayer, CoordOrigin, DocItem, DoclingDocument, @@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken parent=None, label=GroupLabel.SECTION, name=f"sheet: {sheet_name}", + content_layer=self._get_sheet_content_layer(sheet), ) doc = self._convert_sheet(doc, sheet) width, height = self._find_page_size(doc, page_no) @@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken """ if self.workbook is not None: + content_layer = self._get_sheet_content_layer(sheet) tables = self._find_data_tables(sheet) for excel_table in tables: @@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken origin=CoordOrigin.TOPLEFT, ), ), + content_layer=content_layer, ) return doc @@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken The updated DoclingDocument. """ if self.workbook is not None: + content_layer = self._get_sheet_content_layer(sheet) # Iterate over byte images in the sheet for item in sheet._images: # type: ignore[attr-defined] try: @@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken anchor, origin=CoordOrigin.TOPLEFT ), ), + content_layer=content_layer, ) except Exception: _log.error("could not extract the image from excel sheets") @@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b return (right - left, bottom - top) + + @staticmethod + def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]: + return ( + None + if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE + else ContentLayer.INVISIBLE + ) diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index f72f62c1..cfa618d0 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -4,7 +4,7 @@ "name": "test-01", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "binary_hash": 13665052226482254103, + "binary_hash": 5648670872883457266, "filename": "test-01.xlsx" }, "furniture": { @@ -25,6 +25,9 @@ }, { "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" } ], "content_layer": "body", @@ -85,6 +88,20 @@ "content_layer": "body", "name": "sheet: Sheet3", "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/6" + } + ], + "content_layer": "invisible", + "name": "sheet: Sheet4", + "label": "section" } ], "texts": [], @@ -3382,6 +3399,95 @@ ] }, "annotations": [] + }, + { + "self_ref": "#/tables/6", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "invisible", + "label": "table", + "prov": [ + { + "page_no": 4, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 1.0, + "b": 2.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "header", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 2, + "num_cols": 1, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "header", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + }, + "annotations": [] } ], "key_value_items": [], @@ -3407,6 +3513,13 @@ "height": 36.0 }, "page_no": 3 + }, + "4": { + "size": { + "width": 0.0, + "height": 0.0 + }, + "page_no": 4 } } } \ No newline at end of file diff --git a/tests/data/xlsx/test-01.xlsx b/tests/data/xlsx/test-01.xlsx index ea35b723..5ce87531 100644 Binary files a/tests/data/xlsx/test-01.xlsx and b/tests/data/xlsx/test-01.xlsx differ diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 6dd73425..25004c7a 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -87,13 +87,14 @@ def test_pages(documents) -> None: backend=MsExcelDocumentBackend, ) backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) - assert backend.page_count() == 3 + assert backend.page_count() == 4 # number of pages from the converted document doc = next(item for path, item in documents if path.stem == "test-01") - assert len(doc.pages) == 3 + assert len(doc.pages) == 4 # page sizes as number of cells assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) + assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)