feat(msexcel): set ContentLayer.INVISIBLE for invisible sheet (#1876)

* feat(msexcel): ignore invisible sheet

* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>

I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: ca391f4908f44f301de54a97057f0b809f5ce66c

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* retain invisible sheet with ContentLayer.INVISIBLE

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* update UT

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* fix: use Optional for python3.9

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>

I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: a34371a90e

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>

---------

Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
This commit is contained in:
Qiefan Jiang
2025-09-01 19:53:45 +08:00
committed by GitHub
parent be26044f14
commit a283ccff25
4 changed files with 132 additions and 4 deletions

View File

@@ -1,10 +1,11 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Any, Union, cast
from typing import Any, Optional, Union, cast
from docling_core.types.doc import (
BoundingBox,
ContentLayer,
CoordOrigin,
DocItem,
DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
parent=None,
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}",
content_layer=self._get_sheet_content_layer(sheet),
)
doc = self._convert_sheet(doc, sheet)
width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
"""
if self.workbook is not None:
content_layer = self._get_sheet_content_layer(sheet)
tables = self._find_data_tables(sheet)
for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
origin=CoordOrigin.TOPLEFT,
),
),
content_layer=content_layer,
)
return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
The updated DoclingDocument.
"""
if self.workbook is not None:
content_layer = self._get_sheet_content_layer(sheet)
# Iterate over byte images in the sheet
for item in sheet._images: # type: ignore[attr-defined]
try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
anchor, origin=CoordOrigin.TOPLEFT
),
),
content_layer=content_layer,
)
except Exception:
_log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
return (right - left, bottom - top)
@staticmethod
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
    """Map a worksheet's visibility state to a document content layer.

    Args:
        sheet: The worksheet whose ``sheet_state`` attribute is inspected.

    Returns:
        ``None`` when ``sheet.sheet_state`` equals
        ``Worksheet.SHEETSTATE_VISIBLE``; ``ContentLayer.INVISIBLE`` for
        any other state.
    """
    # Visible sheets carry no explicit layer override.
    if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
        return None
    # Every non-visible state is tagged as invisible content.
    return ContentLayer.INVISIBLE