docling(xlsx): add bounding boxes and page size information in cell units

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-04-09 17:40:40 +02:00
parent 81f38e960a
commit d001097376
3 changed files with 122 additions and 53 deletions

View File

@ -1,11 +1,12 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Any, Union from typing import Any, Union, cast
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox, BoundingBox,
CoordOrigin, CoordOrigin,
DocItem,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupLabel, GroupLabel,
@ -16,9 +17,11 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from openpyxl import load_workbook from openpyxl import load_workbook
from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image as PILImage from PIL import Image as PILImage
from pydantic import BaseModel from pydantic import BaseModel, NonNegativeInt, PositiveInt
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import (
@ -50,21 +53,37 @@ class ExcelCell(BaseModel):
class ExcelTable(BaseModel): class ExcelTable(BaseModel):
"""Represents an Excel table. """Represents an Excel table on a worksheet.
Attributes: Attributes:
anchor: The column and row indices of the upper-left cell of the table
(0-based index).
num_rows: The number of rows in the table. num_rows: The number of rows in the table.
num_cols: The number of columns in the table. num_cols: The number of columns in the table.
data: The data in the table, represented as a list of ExcelCell objects. data: The data in the table, represented as a list of ExcelCell objects.
""" """
anchor: tuple[NonNegativeInt, NonNegativeInt]
num_rows: int num_rows: int
num_cols: int num_cols: int
data: list[ExcelCell] data: list[ExcelCell]
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
"""Backend for parsing Excel workbooks.""" """Backend for parsing Excel workbooks.
The backend converts an Excel workbook into a DoclingDocument object.
Each worksheet is converted into a separate page.
The following elements are parsed:
- Cell contents, parsed as tables. If two groups of cells are disconnected
between each other, they will be parsed as two different tables.
- Images, parsed as PictureItem objects.
The DoclingDocument tables and pictures have their provenance information, including
the position in their original Excel worksheet. The position is represented by a
bounding box object with the cell indices as units (0-based index). The size of this
bounding box is the number of columns and rows that the table or picture spans.
"""
@override @override
def __init__( def __init__(
@ -106,7 +125,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
@override @override
def is_valid(self) -> bool: def is_valid(self) -> bool:
_log.info(f"valid: {self.valid}") _log.debug(f"valid: {self.valid}")
return self.valid return self.valid
@classmethod @classmethod
@ -170,19 +189,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
for sheet_name in self.workbook.sheetnames: for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}") _log.info(f"Processing sheet: {sheet_name}")
# Access the sheet by name
sheet = self.workbook[sheet_name] sheet = self.workbook[sheet_name]
idx = self.workbook.index(sheet) page_no = self.workbook.index(sheet) + 1
# TODO: check concept of Size as number of rows and cols # do not rely on sheet.max_column, sheet.max_row if there are images
doc.add_page(page_no=idx + 1, size=Size()) page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
self.parents[0] = doc.add_group( self.parents[0] = doc.add_group(
parent=None, parent=None,
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}", name=f"sheet: {sheet_name}",
) )
doc = self._convert_sheet(doc, sheet) doc = self._convert_sheet(doc, sheet)
width, height = self._find_page_size(doc, page_no)
page.size = Size(width=width, height=height)
else: else:
_log.error("Workbook is not initialized.") _log.error("Workbook is not initialized.")
@ -222,6 +241,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
tables = self._find_data_tables(sheet) tables = self._find_data_tables(sheet)
for excel_table in tables: for excel_table in tables:
origin_col = excel_table.anchor[0]
origin_row = excel_table.anchor[1]
num_rows = excel_table.num_rows num_rows = excel_table.num_rows
num_cols = excel_table.num_cols num_cols = excel_table.num_cols
@ -254,7 +275,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
page_no=page_no, page_no=page_no,
charspan=(0, 0), charspan=(0, 0),
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.BOTTOMLEFT (
origin_col,
origin_row,
origin_col + num_cols,
origin_row + num_rows,
),
origin=CoordOrigin.TOPLEFT,
), ),
), ),
) )
@ -322,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
row_span = 1 row_span = 1
col_span = 1 col_span = 1
# _log.info(sheet.merged_cells.ranges)
for merged_range in sheet.merged_cells.ranges: for merged_range in sheet.merged_cells.ranges:
if ( if (
@ -346,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
col_span=col_span, col_span=col_span,
) )
) )
# _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
# Mark all cells in the span as visited # Mark all cells in the span as visited
for span_row in range(ri, ri + row_span): for span_row in range(ri, ri + row_span):
@ -355,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
return ( return (
ExcelTable( ExcelTable(
anchor=(start_col, start_row),
num_rows=max_row + 1 - start_row, num_rows=max_row + 1 - start_row,
num_cols=max_col + 1 - start_col, num_cols=max_col + 1 - start_col,
data=data, data=data,
@ -446,13 +472,21 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
Returns: Returns:
The updated DoclingDocument. The updated DoclingDocument.
""" """
if self.workbook is not None: if self.workbook is not None:
# Iterate over byte images in the sheet # Iterate over byte images in the sheet
for image in sheet._images: # type: ignore[attr-defined] for item in sheet._images: # type: ignore[attr-defined]
try: try:
pil_image = PILImage.open(image.ref) image: Image = cast(Image, item)
pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
page_no = self.workbook.index(sheet) + 1 page_no = self.workbook.index(sheet) + 1
anchor = (0, 0, 0, 0)
if isinstance(image.anchor, TwoCellAnchor):
anchor = (
image.anchor._from.col,
image.anchor._from.row,
image.anchor.to.col + 1,
image.anchor.to.row + 1,
)
doc.add_picture( doc.add_picture(
parent=self.parents[0], parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72), image=ImageRef.from_pil(image=pil_image, dpi=72),
@ -461,7 +495,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
page_no=page_no, page_no=page_no,
charspan=(0, 0), charspan=(0, 0),
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
(0, 0, 0, 0), origin=CoordOrigin.TOPLEFT anchor, origin=CoordOrigin.TOPLEFT
), ),
), ),
) )
@ -469,3 +503,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
_log.error("could not extract the image from excel sheets") _log.error("could not extract the image from excel sheets")
return doc return doc
@staticmethod
def _find_page_size(
    doc: DoclingDocument, page_no: PositiveInt
) -> tuple[float, float]:
    """Compute the size of a page from the bounding boxes of its items.

    Every DocItem yielded for the page contributes the bounding box of each
    of its provenance entries. The width is the distance between the
    smallest left edge and the largest right edge; the height is the distance
    between the smallest top edge and the largest bottom edge.

    Args:
        doc: The document whose items are inspected.
        page_no: The number of the page to measure.

    Returns:
        A (width, height) tuple, or (0.0, 0.0) when no item on the page
        carries provenance information.
    """
    # NOTE(review): edges are compared numerically, which presumes that all
    # bounding boxes on the page share the same coordinate origin - confirm.
    boxes = [
        provenance.bbox
        for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no)
        if isinstance(item, DocItem)
        for provenance in item.prov
    ]
    if not boxes:
        # Explicit empty-page result instead of relying on a -1 sentinel,
        # which could collide with a real coordinate value.
        return (0.0, 0.0)

    left = min(box.l for box in boxes)
    top = min(box.t for box in boxes)
    right = max(box.r for box in boxes)
    bottom = max(box.b for box in boxes)

    return (right - left, bottom - top)

View File

@ -101,10 +101,10 @@
{ {
"page_no": 3, "page_no": 3,
"bbox": { "bbox": {
"l": 0.0, "l": 8.0,
"t": 0.0, "t": 18.0,
"r": 0.0, "r": 13.0,
"b": 0.0, "b": 36.0,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
@ -143,9 +143,9 @@
"bbox": { "bbox": {
"l": 0.0, "l": 0.0,
"t": 0.0, "t": 0.0,
"r": 0.0, "r": 3.0,
"b": 0.0, "b": 7.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -697,9 +697,9 @@
"bbox": { "bbox": {
"l": 0.0, "l": 0.0,
"t": 0.0, "t": 0.0,
"r": 0.0, "r": 4.0,
"b": 0.0, "b": 9.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -1613,11 +1613,11 @@
{ {
"page_no": 2, "page_no": 2,
"bbox": { "bbox": {
"l": 0.0, "l": 6.0,
"t": 0.0, "t": 4.0,
"r": 0.0, "r": 9.0,
"b": 0.0, "b": 9.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -2019,11 +2019,11 @@
{ {
"page_no": 2, "page_no": 2,
"bbox": { "bbox": {
"l": 0.0, "l": 2.0,
"t": 0.0, "t": 13.0,
"r": 0.0, "r": 5.0,
"b": 0.0, "b": 18.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -2427,9 +2427,9 @@
"bbox": { "bbox": {
"l": 0.0, "l": 0.0,
"t": 0.0, "t": 0.0,
"r": 0.0, "r": 3.0,
"b": 0.0, "b": 7.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -2907,11 +2907,11 @@
{ {
"page_no": 3, "page_no": 3,
"bbox": { "bbox": {
"l": 0.0, "l": 4.0,
"t": 0.0, "t": 6.0,
"r": 0.0, "r": 7.0,
"b": 0.0, "b": 13.0,
"coord_origin": "BOTTOMLEFT" "coord_origin": "TOPLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
@ -3383,22 +3383,22 @@
"pages": { "pages": {
"1": { "1": {
"size": { "size": {
"width": 0.0, "width": 3.0,
"height": 0.0 "height": 7.0
}, },
"page_no": 1 "page_no": 1
}, },
"2": { "2": {
"size": { "size": {
"width": 0.0, "width": 9.0,
"height": 0.0 "height": 18.0
}, },
"page_no": 2 "page_no": 2
}, },
"3": { "3": {
"size": { "size": {
"width": 0.0, "width": 13.0,
"height": 0.0 "height": 36.0
}, },
"page_no": 3 "page_no": 3
} }

View File

@ -57,7 +57,7 @@ def documents() -> list[tuple[Path, DoclingDocument]]:
return documents return documents
def test_e2e_xlsx_conversions(documents): def test_e2e_xlsx_conversions(documents) -> None:
for gt_path, doc in documents: for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown() pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
@ -74,7 +74,13 @@ def test_e2e_xlsx_conversions(documents):
), "document document" ), "document document"
def test_page_count(): def test_pages(documents) -> None:
"""Test the page count and page size of converted documents.
Args:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0] path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=path, path_or_stream=path,
@ -84,3 +90,12 @@ def test_page_count():
) )
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
assert backend.page_count() == 3 assert backend.page_count() == 3
# number of pages from the converted document
doc = [item for path, item in documents if path.stem == "test-01"][0]
assert len(doc.pages) == 3
# page sizes as number of cells
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)