diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index aeaed4f1..2bcc34d7 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -8,6 +8,7 @@ from docling_core.types.doc import ( ContentLayer, CoordOrigin, DocItem, + DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, @@ -31,6 +32,7 @@ from docling.backend.abstract_backend import ( DeclarativeDocumentBackend, PaginatedDocumentBackend, ) +from docling.datamodel.backend_options import MsExcelBackendOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -116,18 +118,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken @override def __init__( - self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + options: MsExcelBackendOptions = MsExcelBackendOptions(), ) -> None: """Initialize the MsExcelDocumentBackend object. Parameters: in_doc: The input document object. path_or_stream: The path or stream to the Excel file. + options: Backend options for Excel parsing. Raises: RuntimeError: An error occurred parsing the file. 
""" - super().__init__(in_doc, path_or_stream) + super().__init__(in_doc, path_or_stream, options) # Initialise the parents for the hierarchy self.max_levels = 10 @@ -277,51 +283,83 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken content_layer = self._get_sheet_content_layer(sheet) tables = self._find_data_tables(sheet) + treat_singleton_as_text = ( + isinstance(self.options, MsExcelBackendOptions) + and self.options.treat_singleton_as_text + ) + for excel_table in tables: origin_col = excel_table.anchor[0] origin_row = excel_table.anchor[1] num_rows = excel_table.num_rows num_cols = excel_table.num_cols - table_data = TableData( - num_rows=num_rows, - num_cols=num_cols, - table_cells=[], - ) - - for excel_cell in excel_table.data: - cell = TableCell( - text=excel_cell.text, - row_span=excel_cell.row_span, - col_span=excel_cell.col_span, - start_row_offset_idx=excel_cell.row, - end_row_offset_idx=excel_cell.row + excel_cell.row_span, - start_col_offset_idx=excel_cell.col, - end_col_offset_idx=excel_cell.col + excel_cell.col_span, - column_header=excel_cell.row == 0, - row_header=False, - ) - table_data.table_cells.append(cell) - - page_no = self.workbook.index(sheet) + 1 - doc.add_table( - data=table_data, - parent=self.parents[0], - prov=ProvenanceItem( - page_no=page_no, - charspan=(0, 0), - bbox=BoundingBox.from_tuple( - ( - origin_col, - origin_row, - origin_col + num_cols, - origin_row + num_rows, + if ( + treat_singleton_as_text + and num_rows == 1 + and num_cols == 1 + and excel_table.data + ): + page_no = self.workbook.index(sheet) + 1 + doc.add_text( + text=excel_table.data[0].text, + label=DocItemLabel.TEXT, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + ( + origin_col, + origin_row, + origin_col + num_cols, + origin_row + num_rows, + ), + origin=CoordOrigin.TOPLEFT, ), - origin=CoordOrigin.TOPLEFT, ), - ), - content_layer=content_layer, - ) + 
content_layer=content_layer, + ) + else: + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], + ) + + for excel_cell in excel_table.data: + cell = TableCell( + text=excel_cell.text, + row_span=excel_cell.row_span, + col_span=excel_cell.col_span, + start_row_offset_idx=excel_cell.row, + end_row_offset_idx=excel_cell.row + excel_cell.row_span, + start_col_offset_idx=excel_cell.col, + end_col_offset_idx=excel_cell.col + excel_cell.col_span, + column_header=excel_cell.row == 0, + row_header=False, + ) + table_data.table_cells.append(cell) + + page_no = self.workbook.index(sheet) + 1 + doc.add_table( + data=table_data, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + ( + origin_col, + origin_row, + origin_col + num_cols, + origin_row + num_rows, + ), + origin=CoordOrigin.TOPLEFT, + ), + ), + content_layer=content_layer, + ) return doc diff --git a/docling/datamodel/backend_options.py b/docling/datamodel/backend_options.py index 594be689..ea5cf70c 100644 --- a/docling/datamodel/backend_options.py +++ b/docling/datamodel/backend_options.py @@ -71,12 +71,26 @@ class PdfBackendOptions(BaseBackendOptions): password: Optional[SecretStr] = None +class MsExcelBackendOptions(BaseBackendOptions): + """Options specific to the MS Excel backend.""" + + kind: Literal["xlsx"] = Field("xlsx", exclude=True, repr=False) + treat_singleton_as_text: bool = Field( + False, + description=( + "Whether to treat singleton cells (1x1 tables with empty neighboring " + "cells) as TextItem instead of TableItem." 
+ ), + ) + + BackendOptions = Annotated[ Union[ DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions, PdfBackendOptions, + MsExcelBackendOptions, ], Field(discriminator="kind"), ] diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt new file mode 100644 index 00000000..30e0db58 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt @@ -0,0 +1,4 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Duck Observations + item-2 at level 2: table with [1x1] + item-3 at level 2: table with [7x2] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json new file mode 100644 index 00000000..04441a9f --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json @@ -0,0 +1,578 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", + "name": "xlsx_05_table_with_title", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 18126553641942797758, + "filename": "xlsx_05_table_with_title.xlsx", + "uri": null + }, + "furniture": { + "self_ref": "#/furniture", + "parent": null, + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "parent": null, + "children": [ + { + "cref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "cref": "#/body" + }, + "children": [ + { + "cref": "#/tables/0" + }, + { + "cref": "#/tables/1" + } + ], + "content_layer": "body", + "name": "sheet: Duck Observations", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": 
"#/tables/0", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1.0, + "t": 1.0, + "r": 2.0, + "b": 2.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Number of freshwater ducks per year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 1, + "num_cols": 1, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Number of freshwater ducks per year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/1", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1.0, + "t": 3.0, + "r": 3.0, + "b": 10.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + 
"column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + 
"row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 7, + "num_cols": 2, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater 
Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + 
"text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 2.0, + "height": 7.0 + }, + "image": null, + "page_no": 1 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md new file mode 100644 index 00000000..346a85ab --- /dev/null +++ 
b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md @@ -0,0 +1,11 @@ +| Number of freshwater ducks per year | +|---------------------------------------| + +| Year | Freshwater Ducks | +|--------|--------------------| +| 2019 | 120 | +| 2020 | 135 | +| 2021 | 150 | +| 2022 | 170 | +| 2023 | 160 | +| 2024 | 180 | \ No newline at end of file diff --git a/tests/data/xlsx/xlsx_05_table_with_title.xlsx b/tests/data/xlsx/xlsx_05_table_with_title.xlsx new file mode 100644 index 00000000..b7a04de7 Binary files /dev/null and b/tests/data/xlsx/xlsx_05_table_with_title.xlsx differ diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 6084a4b6..31cb59b4 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -6,9 +6,10 @@ import pytest from openpyxl import load_workbook from docling.backend.msexcel_backend import MsExcelDocumentBackend +from docling.datamodel.backend_options import MsExcelBackendOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument -from docling.document_converter import DocumentConverter +from docling.document_converter import DocumentConverter, ExcelFormatOption from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export @@ -227,6 +228,49 @@ def test_inflated_rows_handling(documents) -> None: ) +def test_table_with_title(): + """Test that singleton cells are treated as TextItem when the option is set. + + When the treat_singleton_as_text option is enabled, any 1x1 table that has cell + data is converted to a TextItem instead of a TableItem. This test verifies that + xlsx_05_table_with_title.xlsx is correctly parsed with this option. 
+ """ + path = next( + item for item in get_excel_paths() if item.stem == "xlsx_05_table_with_title" + ) + + # Create converter with treat_singleton_as_text=True + options = MsExcelBackendOptions(treat_singleton_as_text=True) + format_options = {InputFormat.XLSX: ExcelFormatOption(backend_options=options)} + converter = DocumentConverter( + allowed_formats=[InputFormat.XLSX], format_options=format_options + ) + + conv_result: ConversionResult = converter.convert(path) + doc: DoclingDocument = conv_result.document + + # With treat_singleton_as_text=True, the singleton title cell should be a TextItem + texts = list(doc.texts) + tables = list(doc.tables) + + assert len(texts) == 1, f"Should have 1 text item (the title), got {len(texts)}" + assert len(tables) == 1, f"Should have 1 table, got {len(tables)}" + + # Verify the text item contains the title + assert texts[0].text == "Number of freshwater ducks per year", ( + f"Text should be 'Number of freshwater ducks per year', got '{texts[0].text}'" + ) + + # Verify table dimensions + table = tables[0] + assert table.data.num_rows == 7, ( + f"Table should have 7 rows, got {table.data.num_rows}" + ) + assert table.data.num_cols == 2, ( + f"Table should have 2 columns, got {table.data.num_cols}" + ) + + def test_bytesio_stream(): """Test that Excel files can be loaded from BytesIO streams.