mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
feat(xlsx): create a page for each worksheet in XLSX backend
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
e813f02943
commit
06a0ae8294
@ -1,31 +1,35 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
from typing import Any, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
ProvenanceItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
from PIL import Image as PILImage
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
from typing import Any, List
|
||||
|
||||
from PIL import Image as PILImage
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ExcelCell(BaseModel):
|
||||
row: int
|
||||
@ -38,10 +42,10 @@ class ExcelCell(BaseModel):
|
||||
class ExcelTable(BaseModel):
|
||||
num_rows: int
|
||||
num_cols: int
|
||||
data: List[ExcelCell]
|
||||
data: list[ExcelCell]
|
||||
|
||||
|
||||
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||
@override
|
||||
def __init__(
|
||||
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
||||
@ -63,12 +67,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
||||
|
||||
self.valid = True
|
||||
self.valid = self.workbook is not None
|
||||
except Exception as e:
|
||||
self.valid = False
|
||||
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
@ -81,6 +85,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True
|
||||
|
||||
def page_count(self) -> int:
    """Return the number of pages of the document, one per worksheet.

    Returns:
        The number of sheet names in the loaded workbook, or 0 when the
        backend is not valid or no workbook is loaded.
    """
    # Compare against None explicitly, as the other workbook checks in this
    # backend do, rather than relying on the workbook object's truthiness.
    if not self.is_valid() or self.workbook is None:
        return 0
    return len(self.workbook.sheetnames)
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
@ -117,6 +127,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Access the sheet by name
|
||||
sheet = self.workbook[sheet_name]
|
||||
idx = self.workbook.index(sheet)
|
||||
# TODO: check concept of Size as number of rows and cols
|
||||
doc.add_page(page_no=idx + 1, size=Size())
|
||||
|
||||
self.parents[0] = doc.add_group(
|
||||
parent=None,
|
||||
@ -142,6 +155,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
self, doc: DoclingDocument, sheet: Worksheet
|
||||
) -> DoclingDocument:
|
||||
|
||||
if self.workbook is not None:
|
||||
tables = self._find_data_tables(sheet)
|
||||
|
||||
for excel_table in tables:
|
||||
@ -169,11 +183,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
table_data.table_cells.append(cell)
|
||||
|
||||
doc.add_table(data=table_data, parent=self.parents[0])
|
||||
page_no = self.workbook.index(sheet) + 1
|
||||
doc.add_table(
|
||||
data=table_data,
|
||||
parent=self.parents[0],
|
||||
prov=ProvenanceItem(
|
||||
page_no=page_no,
|
||||
charspan=(0, 0),
|
||||
bbox=BoundingBox.from_tuple(
|
||||
(0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
return doc
|
||||
|
||||
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
|
||||
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
||||
"""
|
||||
Find all compact rectangular data tables in a sheet.
|
||||
"""
|
||||
@ -327,16 +352,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
self, doc: DoclingDocument, sheet: Worksheet
|
||||
) -> DoclingDocument:
|
||||
|
||||
if self.workbook is not None:
|
||||
# Iterate over byte images in the sheet
|
||||
for idx, image in enumerate(sheet._images): # type: ignore
|
||||
|
||||
for image in sheet._images: # type: ignore[attr-defined]
|
||||
try:
|
||||
pil_image = PILImage.open(image.ref)
|
||||
|
||||
page_no = self.workbook.index(sheet) + 1
|
||||
doc.add_picture(
|
||||
parent=self.parents[0],
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
prov=ProvenanceItem(
|
||||
page_no=page_no,
|
||||
charspan=(0, 0),
|
||||
bbox=BoundingBox.from_tuple(
|
||||
(0, 0, 0, 0), origin=CoordOrigin.TOPLEFT
|
||||
),
|
||||
),
|
||||
)
|
||||
except:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
|
@ -97,7 +97,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 3,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -122,7 +137,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -661,7 +691,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -1564,7 +1609,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -1955,7 +2015,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -2346,7 +2421,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 3,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -2813,7 +2903,22 @@
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 3,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -3275,5 +3380,27 @@
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 0.0,
|
||||
"height": 0.0
|
||||
},
|
||||
"page_no": 1
|
||||
},
|
||||
"2": {
|
||||
"size": {
|
||||
"width": 0.0,
|
||||
"height": 0.0
|
||||
},
|
||||
"page_no": 2
|
||||
},
|
||||
"3": {
|
||||
"size": {
|
||||
"width": 0.0,
|
||||
"height": 0.0
|
||||
},
|
||||
"page_no": 3
|
||||
}
|
||||
}
|
||||
}
|
@ -1,13 +1,18 @@
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
@ -28,13 +33,15 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def test_e2e_xlsx_conversions():
|
||||
@pytest.fixture(scope="module")
|
||||
def documents() -> list[tuple[Path, DoclingDocument]]:
|
||||
documents: list[dict[Path, DoclingDocument]] = []
|
||||
|
||||
xlsx_paths = get_xlsx_paths()
|
||||
converter = get_converter()
|
||||
|
||||
for xlsx_path in xlsx_paths:
|
||||
print(f"converting {xlsx_path}")
|
||||
_log.debug(f"converting {xlsx_path}")
|
||||
|
||||
gt_path = (
|
||||
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
||||
@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions():
|
||||
|
||||
doc: DoclingDocument = conv_result.document
|
||||
|
||||
assert doc, f"Failed to convert document from file {gt_path}"
|
||||
documents.append((gt_path, doc))
|
||||
|
||||
return documents
|
||||
|
||||
|
||||
def test_e2e_xlsx_conversions(documents):
|
||||
for gt_path, doc in documents:
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
||||
|
||||
@ -57,3 +72,15 @@ def test_e2e_xlsx_conversions():
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
), "document document"
|
||||
|
||||
|
||||
def test_page_count():
    """Check that the Excel backend reports 3 pages for the test-01 workbook."""
    # Pick the sample workbook by its file stem among the available XLSX files.
    candidates = [
        candidate for candidate in get_xlsx_paths() if candidate.stem == "test-01"
    ]
    xlsx_path = candidates[0]

    input_doc = InputDocument(
        path_or_stream=xlsx_path,
        format=InputFormat.XLSX,
        filename=xlsx_path.stem,
        backend=MsExcelDocumentBackend,
    )
    excel_backend = MsExcelDocumentBackend(
        in_doc=input_doc, path_or_stream=xlsx_path
    )

    assert excel_backend.page_count() == 3
|
||||
|
Loading…
Reference in New Issue
Block a user