mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(msexcel): set ContentLayer.INVISIBLE for invisible sheet (#1876)
* feat(msexcel): ignore invisible sheet
* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>
I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: ca391f4908f44f301de54a97057f0b809f5ce66c
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* retain invisible sheet with ContentLayer.INVISIBLE
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* update unit tests
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* fix: use Optional for Python 3.9 compatibility
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
* DCO Remediation Commit for Qiefan Jiang <jiangqiefan@bytedance.com>
I, Qiefan Jiang <jiangqiefan@bytedance.com>, hereby add my Signed-off-by to this commit: a34371a90e
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
---------
Signed-off-by: Qiefan Jiang <jiangqiefan@bytedance.com>
This commit is contained in:
@@ -1,10 +1,11 @@
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Union, cast
|
from typing import Any, Optional, Union, cast
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
|
ContentLayer,
|
||||||
CoordOrigin,
|
CoordOrigin,
|
||||||
DocItem,
|
DocItem,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
parent=None,
|
parent=None,
|
||||||
label=GroupLabel.SECTION,
|
label=GroupLabel.SECTION,
|
||||||
name=f"sheet: {sheet_name}",
|
name=f"sheet: {sheet_name}",
|
||||||
|
content_layer=self._get_sheet_content_layer(sheet),
|
||||||
)
|
)
|
||||||
doc = self._convert_sheet(doc, sheet)
|
doc = self._convert_sheet(doc, sheet)
|
||||||
width, height = self._find_page_size(doc, page_no)
|
width, height = self._find_page_size(doc, page_no)
|
||||||
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
|
content_layer = self._get_sheet_content_layer(sheet)
|
||||||
tables = self._find_data_tables(sheet)
|
tables = self._find_data_tables(sheet)
|
||||||
|
|
||||||
for excel_table in tables:
|
for excel_table in tables:
|
||||||
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
content_layer=content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
The updated DoclingDocument.
|
The updated DoclingDocument.
|
||||||
"""
|
"""
|
||||||
if self.workbook is not None:
|
if self.workbook is not None:
|
||||||
|
content_layer = self._get_sheet_content_layer(sheet)
|
||||||
# Iterate over byte images in the sheet
|
# Iterate over byte images in the sheet
|
||||||
for item in sheet._images: # type: ignore[attr-defined]
|
for item in sheet._images: # type: ignore[attr-defined]
|
||||||
try:
|
try:
|
||||||
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
anchor, origin=CoordOrigin.TOPLEFT
|
anchor, origin=CoordOrigin.TOPLEFT
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
content_layer=content_layer,
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.error("could not extract the image from excel sheets")
|
_log.error("could not extract the image from excel sheets")
|
||||||
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|||||||
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
||||||
|
|
||||||
return (right - left, bottom - top)
|
return (right - left, bottom - top)
|
||||||
|
|
||||||
|
@staticmethod
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
    """Map a worksheet's visibility state to a content layer.

    Args:
        sheet: The worksheet whose ``sheet_state`` attribute is inspected.

    Returns:
        ``None`` when the sheet state equals
        ``Worksheet.SHEETSTATE_VISIBLE``; ``ContentLayer.INVISIBLE`` for
        any other state.
    """
    # Visible sheets get no explicit layer; everything else is flagged
    # as invisible content.
    if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
        return None
    return ContentLayer.INVISIBLE
|
||||||
|
|||||||
115
tests/data/groundtruth/docling_v2/test-01.xlsx.json
vendored
115
tests/data/groundtruth/docling_v2/test-01.xlsx.json
vendored
@@ -4,7 +4,7 @@
|
|||||||
"name": "test-01",
|
"name": "test-01",
|
||||||
"origin": {
|
"origin": {
|
||||||
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
"binary_hash": 13665052226482254103,
|
"binary_hash": 5648670872883457266,
|
||||||
"filename": "test-01.xlsx"
|
"filename": "test-01.xlsx"
|
||||||
},
|
},
|
||||||
"furniture": {
|
"furniture": {
|
||||||
@@ -25,6 +25,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/2"
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/3"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@@ -85,6 +88,20 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"name": "sheet: Sheet3",
|
"name": "sheet: Sheet3",
|
||||||
"label": "section"
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/tables/6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "invisible",
|
||||||
|
"name": "sheet: Sheet4",
|
||||||
|
"label": "section"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"texts": [],
|
"texts": [],
|
||||||
@@ -3382,6 +3399,95 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"annotations": []
|
"annotations": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/tables/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "invisible",
|
||||||
|
"label": "table",
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 4,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 1.0,
|
||||||
|
"b": 2.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"data": {
|
||||||
|
"table_cells": [
|
||||||
|
{
|
||||||
|
"row_span": 1,
|
||||||
|
"col_span": 1,
|
||||||
|
"start_row_offset_idx": 0,
|
||||||
|
"end_row_offset_idx": 1,
|
||||||
|
"start_col_offset_idx": 0,
|
||||||
|
"end_col_offset_idx": 1,
|
||||||
|
"text": "header",
|
||||||
|
"column_header": true,
|
||||||
|
"row_header": false,
|
||||||
|
"row_section": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"row_span": 1,
|
||||||
|
"col_span": 1,
|
||||||
|
"start_row_offset_idx": 1,
|
||||||
|
"end_row_offset_idx": 2,
|
||||||
|
"start_col_offset_idx": 0,
|
||||||
|
"end_col_offset_idx": 1,
|
||||||
|
"text": "1",
|
||||||
|
"column_header": false,
|
||||||
|
"row_header": false,
|
||||||
|
"row_section": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"num_rows": 2,
|
||||||
|
"num_cols": 1,
|
||||||
|
"grid": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"row_span": 1,
|
||||||
|
"col_span": 1,
|
||||||
|
"start_row_offset_idx": 0,
|
||||||
|
"end_row_offset_idx": 1,
|
||||||
|
"start_col_offset_idx": 0,
|
||||||
|
"end_col_offset_idx": 1,
|
||||||
|
"text": "header",
|
||||||
|
"column_header": true,
|
||||||
|
"row_header": false,
|
||||||
|
"row_section": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"row_span": 1,
|
||||||
|
"col_span": 1,
|
||||||
|
"start_row_offset_idx": 1,
|
||||||
|
"end_row_offset_idx": 2,
|
||||||
|
"start_col_offset_idx": 0,
|
||||||
|
"end_col_offset_idx": 1,
|
||||||
|
"text": "1",
|
||||||
|
"column_header": false,
|
||||||
|
"row_header": false,
|
||||||
|
"row_section": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"annotations": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"key_value_items": [],
|
"key_value_items": [],
|
||||||
@@ -3407,6 +3513,13 @@
|
|||||||
"height": 36.0
|
"height": 36.0
|
||||||
},
|
},
|
||||||
"page_no": 3
|
"page_no": 3
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"size": {
|
||||||
|
"width": 0.0,
|
||||||
|
"height": 0.0
|
||||||
|
},
|
||||||
|
"page_no": 4
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
BIN
tests/data/xlsx/test-01.xlsx
vendored
BIN
tests/data/xlsx/test-01.xlsx
vendored
Binary file not shown.
@@ -87,13 +87,14 @@ def test_pages(documents) -> None:
|
|||||||
backend=MsExcelDocumentBackend,
|
backend=MsExcelDocumentBackend,
|
||||||
)
|
)
|
||||||
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
|
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
|
||||||
assert backend.page_count() == 3
|
assert backend.page_count() == 4
|
||||||
|
|
||||||
# number of pages from the converted document
|
# number of pages from the converted document
|
||||||
doc = next(item for path, item in documents if path.stem == "test-01")
|
doc = next(item for path, item in documents if path.stem == "test-01")
|
||||||
assert len(doc.pages) == 3
|
assert len(doc.pages) == 4
|
||||||
|
|
||||||
# page sizes as number of cells
|
# page sizes as number of cells
|
||||||
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
|
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
|
||||||
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
|
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
|
||||||
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
|
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
|
||||||
|
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)
|
||||||
|
|||||||
Reference in New Issue
Block a user