mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat(msexcel): set ContentLayer.INVISIBLE for invisible sheet (#1876)
* feat(msexcel): ignore invisible sheet
* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>
I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: ca391f4908f44f301de54a97057f0b809f5ce66c
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* retain invisible sheet with ContentLayer.INVISIBLE
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* update unit tests
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* fix: use Optional for python3.9
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>
I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: a34371a90e
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
---------
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Union, cast
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
ContentLayer,
|
||||
CoordOrigin,
|
||||
DocItem,
|
||||
DoclingDocument,
|
||||
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
parent=None,
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"sheet: {sheet_name}",
|
||||
content_layer=self._get_sheet_content_layer(sheet),
|
||||
)
|
||||
doc = self._convert_sheet(doc, sheet)
|
||||
width, height = self._find_page_size(doc, page_no)
|
||||
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
"""
|
||||
|
||||
if self.workbook is not None:
|
||||
content_layer = self._get_sheet_content_layer(sheet)
|
||||
tables = self._find_data_tables(sheet)
|
||||
|
||||
for excel_table in tables:
|
||||
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
),
|
||||
content_layer=content_layer,
|
||||
)
|
||||
|
||||
return doc
|
||||
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
The updated DoclingDocument.
|
||||
"""
|
||||
if self.workbook is not None:
|
||||
content_layer = self._get_sheet_content_layer(sheet)
|
||||
# Iterate over byte images in the sheet
|
||||
for item in sheet._images: # type: ignore[attr-defined]
|
||||
try:
|
||||
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
anchor, origin=CoordOrigin.TOPLEFT
|
||||
),
|
||||
),
|
||||
content_layer=content_layer,
|
||||
)
|
||||
except Exception:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
||||
|
||||
return (right - left, bottom - top)
|
||||
|
||||
@staticmethod
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
    """Choose the content layer for items originating from a worksheet.

    Args:
        sheet: The worksheet whose ``sheet_state`` is inspected.

    Returns:
        ``None`` when the sheet state equals ``Worksheet.SHEETSTATE_VISIBLE``;
        ``ContentLayer.INVISIBLE`` for any other state.
    """
    # Visible sheets get no explicit layer; the caller receives None.
    if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
        return None
    return ContentLayer.INVISIBLE
|
||||
|
||||
Reference in New Issue
Block a user