mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
feat(xlsx): create a page for each worksheet in XLSX backend
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
e813f02943
commit
06a0ae8294
@ -1,31 +1,35 @@
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
BoundingBox,
|
||||||
|
CoordOrigin,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ImageRef,
|
ImageRef,
|
||||||
|
ProvenanceItem,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
from openpyxl.worksheet.worksheet import Worksheet
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
from pydantic import BaseModel
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import (
|
||||||
|
DeclarativeDocumentBackend,
|
||||||
|
PaginatedDocumentBackend,
|
||||||
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
from typing import Any, List
|
|
||||||
|
|
||||||
from PIL import Image as PILImage
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
|
|
||||||
class ExcelCell(BaseModel):
|
class ExcelCell(BaseModel):
|
||||||
row: int
|
row: int
|
||||||
@ -38,10 +42,10 @@ class ExcelCell(BaseModel):
|
|||||||
class ExcelTable(BaseModel):
|
class ExcelTable(BaseModel):
|
||||||
num_rows: int
|
num_rows: int
|
||||||
num_cols: int
|
num_cols: int
|
||||||
data: List[ExcelCell]
|
data: list[ExcelCell]
|
||||||
|
|
||||||
|
|
||||||
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||||
@override
|
@override
|
||||||
def __init__(
|
def __init__(
|
||||||
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
||||||
@ -63,12 +67,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elif isinstance(self.path_or_stream, Path):
|
elif isinstance(self.path_or_stream, Path):
|
||||||
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
||||||
|
|
||||||
self.valid = True
|
self.valid = self.workbook is not None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.valid = False
|
self.valid = False
|
||||||
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
@override
|
@override
|
||||||
@ -81,6 +85,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def page_count(self) -> int:
    """Return the number of pages exposed by this backend.

    Returns:
        The number of sheet names in the loaded workbook, or 0 when the
        backend is not valid or no workbook is loaded.
    """
    # Explicit None check, consistent with the `self.workbook is not None`
    # guards used elsewhere in this backend, rather than relying on the
    # truthiness of the workbook object.
    if not self.is_valid() or self.workbook is None:
        return 0
    return len(self.workbook.sheetnames)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@override
|
@override
|
||||||
def supported_formats(cls) -> set[InputFormat]:
|
def supported_formats(cls) -> set[InputFormat]:
|
||||||
@ -117,6 +127,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Access the sheet by name
|
# Access the sheet by name
|
||||||
sheet = self.workbook[sheet_name]
|
sheet = self.workbook[sheet_name]
|
||||||
|
idx = self.workbook.index(sheet)
|
||||||
|
# TODO: check concept of Size as number of rows and cols
|
||||||
|
doc.add_page(page_no=idx + 1, size=Size())
|
||||||
|
|
||||||
self.parents[0] = doc.add_group(
|
self.parents[0] = doc.add_group(
|
||||||
parent=None,
|
parent=None,
|
||||||
@ -142,6 +155,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self, doc: DoclingDocument, sheet: Worksheet
|
self, doc: DoclingDocument, sheet: Worksheet
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
|
||||||
|
if self.workbook is not None:
|
||||||
tables = self._find_data_tables(sheet)
|
tables = self._find_data_tables(sheet)
|
||||||
|
|
||||||
for excel_table in tables:
|
for excel_table in tables:
|
||||||
@ -169,11 +183,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
table_data.table_cells.append(cell)
|
table_data.table_cells.append(cell)
|
||||||
|
|
||||||
doc.add_table(data=table_data, parent=self.parents[0])
|
page_no = self.workbook.index(sheet) + 1
|
||||||
|
doc.add_table(
|
||||||
|
data=table_data,
|
||||||
|
parent=self.parents[0],
|
||||||
|
prov=ProvenanceItem(
|
||||||
|
page_no=page_no,
|
||||||
|
charspan=(0, 0),
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
(0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
|
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
||||||
"""
|
"""
|
||||||
Find all compact rectangular data tables in a sheet.
|
Find all compact rectangular data tables in a sheet.
|
||||||
"""
|
"""
|
||||||
@ -327,16 +352,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self, doc: DoclingDocument, sheet: Worksheet
|
self, doc: DoclingDocument, sheet: Worksheet
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
|
||||||
|
if self.workbook is not None:
|
||||||
# Iterate over byte images in the sheet
|
# Iterate over byte images in the sheet
|
||||||
for idx, image in enumerate(sheet._images): # type: ignore
|
for image in sheet._images: # type: ignore[attr-defined]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pil_image = PILImage.open(image.ref)
|
pil_image = PILImage.open(image.ref)
|
||||||
|
page_no = self.workbook.index(sheet) + 1
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[0],
|
parent=self.parents[0],
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
|
prov=ProvenanceItem(
|
||||||
|
page_no=page_no,
|
||||||
|
charspan=(0, 0),
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
(0, 0, 0, 0), origin=CoordOrigin.TOPLEFT
|
||||||
|
),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
except:
|
except:
|
||||||
_log.error("could not extract the image from excel sheets")
|
_log.error("could not extract the image from excel sheets")
|
||||||
|
@ -97,7 +97,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "picture",
|
"label": "picture",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 3,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -122,7 +137,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -661,7 +691,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -1564,7 +1609,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -1955,7 +2015,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -2346,7 +2421,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 3,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -2813,7 +2903,22 @@
|
|||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 3,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -3275,5 +3380,27 @@
|
|||||||
],
|
],
|
||||||
"key_value_items": [],
|
"key_value_items": [],
|
||||||
"form_items": [],
|
"form_items": [],
|
||||||
"pages": {}
|
"pages": {
|
||||||
|
"1": {
|
||||||
|
"size": {
|
||||||
|
"width": 0.0,
|
||||||
|
"height": 0.0
|
||||||
|
},
|
||||||
|
"page_no": 1
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"size": {
|
||||||
|
"width": 0.0,
|
||||||
|
"height": 0.0
|
||||||
|
},
|
||||||
|
"page_no": 2
|
||||||
|
},
|
||||||
|
"3": {
|
||||||
|
"size": {
|
||||||
|
"width": 0.0,
|
||||||
|
"height": 0.0
|
||||||
|
},
|
||||||
|
"page_no": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,13 +1,18 @@
|
|||||||
import os
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
from .test_data_gen_flag import GEN_TEST_DATA
|
from .test_data_gen_flag import GEN_TEST_DATA
|
||||||
from .verify_utils import verify_document, verify_export
|
from .verify_utils import verify_document, verify_export
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
GENERATE = GEN_TEST_DATA
|
GENERATE = GEN_TEST_DATA
|
||||||
|
|
||||||
|
|
||||||
@ -28,13 +33,15 @@ def get_converter():
|
|||||||
return converter
|
return converter
|
||||||
|
|
||||||
|
|
||||||
def test_e2e_xlsx_conversions():
|
@pytest.fixture(scope="module")
|
||||||
|
def documents() -> list[tuple[Path, DoclingDocument]]:
|
||||||
|
documents: list[dict[Path, DoclingDocument]] = []
|
||||||
|
|
||||||
xlsx_paths = get_xlsx_paths()
|
xlsx_paths = get_xlsx_paths()
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
for xlsx_path in xlsx_paths:
|
for xlsx_path in xlsx_paths:
|
||||||
print(f"converting {xlsx_path}")
|
_log.debug(f"converting {xlsx_path}")
|
||||||
|
|
||||||
gt_path = (
|
gt_path = (
|
||||||
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
||||||
@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions():
|
|||||||
|
|
||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
|
assert doc, f"Failed to convert document from file {gt_path}"
|
||||||
|
documents.append((gt_path, doc))
|
||||||
|
|
||||||
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
def test_e2e_xlsx_conversions(documents):
|
||||||
|
for gt_path, doc in documents:
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
||||||
|
|
||||||
@ -57,3 +72,15 @@ def test_e2e_xlsx_conversions():
|
|||||||
assert verify_document(
|
assert verify_document(
|
||||||
doc, str(gt_path) + ".json", GENERATE
|
doc, str(gt_path) + ".json", GENERATE
|
||||||
), "document document"
|
), "document document"
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_count():
    """Check that the Excel backend reports 3 pages for the ``test-01`` sample."""
    # Select the sample workbook by file stem; indexing raises if it is absent.
    xlsx_file = [
        candidate for candidate in get_xlsx_paths() if candidate.stem == "test-01"
    ][0]
    input_doc = InputDocument(
        path_or_stream=xlsx_file,
        format=InputFormat.XLSX,
        filename=xlsx_file.stem,
        backend=MsExcelDocumentBackend,
    )
    excel_backend = MsExcelDocumentBackend(
        in_doc=input_doc, path_or_stream=xlsx_file
    )
    n_pages = excel_backend.page_count()
    assert n_pages == 3
|
||||||
|
Loading…
Reference in New Issue
Block a user