feat(msexcel): set ContentLayer.INVISIBLE for invisible sheet (#1876)

* feat(msexcel): ignore invisible sheet

* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>

I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: ca391f4908f44f301de54a97057f0b809f5ce66c

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* retain invisible sheet with ContentLayer.INVISIBLE

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* update UT

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* fix: use Optional for python3.9

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>

I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: a34371a90e

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

---------

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
This commit is contained in:
Qiefan Jiang
2025-09-01 19:53:45 +08:00
committed by GitHub
parent be26044f14
commit a283ccff25
4 changed files with 132 additions and 4 deletions

View File

@@ -1,10 +1,11 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Any, Union, cast from typing import Any, Optional, Union, cast
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox, BoundingBox,
ContentLayer,
CoordOrigin, CoordOrigin,
DocItem, DocItem,
DoclingDocument, DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
parent=None, parent=None,
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}", name=f"sheet: {sheet_name}",
content_layer=self._get_sheet_content_layer(sheet),
) )
doc = self._convert_sheet(doc, sheet) doc = self._convert_sheet(doc, sheet)
width, height = self._find_page_size(doc, page_no) width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
""" """
if self.workbook is not None: if self.workbook is not None:
content_layer = self._get_sheet_content_layer(sheet)
tables = self._find_data_tables(sheet) tables = self._find_data_tables(sheet)
for excel_table in tables: for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
origin=CoordOrigin.TOPLEFT, origin=CoordOrigin.TOPLEFT,
), ),
), ),
content_layer=content_layer,
) )
return doc return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
The updated DoclingDocument. The updated DoclingDocument.
""" """
if self.workbook is not None: if self.workbook is not None:
content_layer = self._get_sheet_content_layer(sheet)
# Iterate over byte images in the sheet # Iterate over byte images in the sheet
for item in sheet._images: # type: ignore[attr-defined] for item in sheet._images: # type: ignore[attr-defined]
try: try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
anchor, origin=CoordOrigin.TOPLEFT anchor, origin=CoordOrigin.TOPLEFT
), ),
), ),
content_layer=content_layer,
) )
except Exception: except Exception:
_log.error("could not extract the image from excel sheets") _log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
return (right - left, bottom - top) return (right - left, bottom - top)
@staticmethod
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
    """Map a worksheet's visibility state to a content layer.

    Args:
        sheet: The worksheet whose ``sheet_state`` is inspected.

    Returns:
        ``None`` when the sheet state equals
        ``Worksheet.SHEETSTATE_VISIBLE``; ``ContentLayer.INVISIBLE`` for
        any other state.
    """
    if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
        # Visible sheets get no explicit layer override.
        return None
    return ContentLayer.INVISIBLE

View File

@@ -4,7 +4,7 @@
"name": "test-01", "name": "test-01",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"binary_hash": 13665052226482254103, "binary_hash": 5648670872883457266,
"filename": "test-01.xlsx" "filename": "test-01.xlsx"
}, },
"furniture": { "furniture": {
@@ -25,6 +25,9 @@
}, },
{ {
"$ref": "#/groups/2" "$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
} }
], ],
"content_layer": "body", "content_layer": "body",
@@ -85,6 +88,20 @@
"content_layer": "body", "content_layer": "body",
"name": "sheet: Sheet3", "name": "sheet: Sheet3",
"label": "section" "label": "section"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/6"
}
],
"content_layer": "invisible",
"name": "sheet: Sheet4",
"label": "section"
} }
], ],
"texts": [], "texts": [],
@@ -3382,6 +3399,95 @@
] ]
}, },
"annotations": [] "annotations": []
},
{
"self_ref": "#/tables/6",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "invisible",
"label": "table",
"prov": [
{
"page_no": 4,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 1.0,
"b": 2.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "header",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 2,
"num_cols": 1,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "header",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
} }
], ],
"key_value_items": [], "key_value_items": [],
@@ -3407,6 +3513,13 @@
"height": 36.0 "height": 36.0
}, },
"page_no": 3 "page_no": 3
},
"4": {
"size": {
"width": 0.0,
"height": 0.0
},
"page_no": 4
} }
} }
} }

Binary file not shown.

View File

@@ -87,13 +87,14 @@ def test_pages(documents) -> None:
backend=MsExcelDocumentBackend, backend=MsExcelDocumentBackend,
) )
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
assert backend.page_count() == 3 assert backend.page_count() == 4
# number of pages from the converted document # number of pages from the converted document
doc = next(item for path, item in documents if path.stem == "test-01") doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 3 assert len(doc.pages) == 4
# page sizes as number of cells # page sizes as number of cells
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)