From 54cd6d7406ee9431d45e8bcd1b0ae53389cfb56c Mon Sep 17 00:00:00 2001 From: glypt <63284048+glypt@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:25:32 +0100 Subject: [PATCH] fix: do not consider singleton cells in xlsx as TableItems but rather TextItems (#2589) fix: do not handle 1x1 cell as a tableitem but as a textitem Signed-off-by: glypt <8trash-can8@protonmail.ch> --- docling/backend/msexcel_backend.py | 116 ++-- docling/datamodel/backend_options.py | 14 + .../xlsx_05_table_with_title.xlsx.itxt | 4 + .../xlsx_05_table_with_title.xlsx.json | 578 ++++++++++++++++++ .../xlsx_05_table_with_title.xlsx.md | 11 + tests/data/xlsx/xlsx_05_table_with_title.xlsx | Bin 0 -> 6335 bytes tests/test_backend_msexcel.py | 46 +- 7 files changed, 729 insertions(+), 40 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt create mode 100644 tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json create mode 100644 tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md create mode 100644 tests/data/xlsx/xlsx_05_table_with_title.xlsx diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index aeaed4f1..2bcc34d7 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -8,6 +8,7 @@ from docling_core.types.doc import ( ContentLayer, CoordOrigin, DocItem, + DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, @@ -31,6 +32,7 @@ from docling.backend.abstract_backend import ( DeclarativeDocumentBackend, PaginatedDocumentBackend, ) +from docling.datamodel.backend_options import MsExcelBackendOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -116,18 +118,22 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken @override def __init__( - self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + options: MsExcelBackendOptions = MsExcelBackendOptions(), ) -> None: """Initialize the MsExcelDocumentBackend object. Parameters: in_doc: The input document object. path_or_stream: The path or stream to the Excel file. + options: Backend options for Excel parsing. Raises: RuntimeError: An error occurred parsing the file. """ - super().__init__(in_doc, path_or_stream) + super().__init__(in_doc, path_or_stream, options) # Initialise the parents for the hierarchy self.max_levels = 10 @@ -277,51 +283,83 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken content_layer = self._get_sheet_content_layer(sheet) tables = self._find_data_tables(sheet) + treat_singleton_as_text = ( + isinstance(self.options, MsExcelBackendOptions) + and self.options.treat_singleton_as_text + ) + for excel_table in tables: origin_col = excel_table.anchor[0] origin_row = excel_table.anchor[1] num_rows = excel_table.num_rows num_cols = excel_table.num_cols - table_data = TableData( - num_rows=num_rows, - num_cols=num_cols, - table_cells=[], - ) - - for excel_cell in excel_table.data: - cell = TableCell( - text=excel_cell.text, - row_span=excel_cell.row_span, - col_span=excel_cell.col_span, - start_row_offset_idx=excel_cell.row, - end_row_offset_idx=excel_cell.row + excel_cell.row_span, - start_col_offset_idx=excel_cell.col, - end_col_offset_idx=excel_cell.col + excel_cell.col_span, - column_header=excel_cell.row == 0, - row_header=False, - ) - table_data.table_cells.append(cell) - - page_no = self.workbook.index(sheet) + 1 - doc.add_table( - data=table_data, - parent=self.parents[0], - prov=ProvenanceItem( - page_no=page_no, - charspan=(0, 0), - bbox=BoundingBox.from_tuple( - ( - origin_col, - origin_row, - origin_col + num_cols, - origin_row + num_rows, + if ( + treat_singleton_as_text + and num_rows == 1 + and num_cols == 1 + and excel_table.data + ): + page_no = self.workbook.index(sheet) + 1 + doc.add_text( + text=excel_table.data[0].text, + label=DocItemLabel.TEXT, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + ( + origin_col, + origin_row, + origin_col + num_cols, + origin_row + num_rows, + ), + origin=CoordOrigin.TOPLEFT, ), - origin=CoordOrigin.TOPLEFT, ), - ), - content_layer=content_layer, - ) + content_layer=content_layer, + ) + else: + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], + ) + + for excel_cell in excel_table.data: + cell = TableCell( + text=excel_cell.text, + row_span=excel_cell.row_span, + col_span=excel_cell.col_span, + start_row_offset_idx=excel_cell.row, + end_row_offset_idx=excel_cell.row + excel_cell.row_span, + start_col_offset_idx=excel_cell.col, + end_col_offset_idx=excel_cell.col + excel_cell.col_span, + column_header=excel_cell.row == 0, + row_header=False, + ) + table_data.table_cells.append(cell) + + page_no = self.workbook.index(sheet) + 1 + doc.add_table( + data=table_data, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + ( + origin_col, + origin_row, + origin_col + num_cols, + origin_row + num_rows, + ), + origin=CoordOrigin.TOPLEFT, + ), + ), + content_layer=content_layer, + ) return doc diff --git a/docling/datamodel/backend_options.py b/docling/datamodel/backend_options.py index 594be689..ea5cf70c 100644 --- a/docling/datamodel/backend_options.py +++ b/docling/datamodel/backend_options.py @@ -71,12 +71,26 @@ class PdfBackendOptions(BaseBackendOptions): password: Optional[SecretStr] = None +class MsExcelBackendOptions(BaseBackendOptions): + """Options specific to the MS Excel backend.""" + + kind: Literal["xlsx"] = Field("xlsx", exclude=True, repr=False) + treat_singleton_as_text: bool = Field( + False, + description=( + "Whether to treat singleton cells (1x1 tables with empty neighboring " + "cells) as TextItem instead of TableItem." + ), + ) + + BackendOptions = Annotated[ Union[ DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions, PdfBackendOptions, + MsExcelBackendOptions, ], Field(discriminator="kind"), ] diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt new file mode 100644 index 00000000..30e0db58 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt @@ -0,0 +1,4 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Duck Observations + item-2 at level 2: table with [1x1] + item-3 at level 2: table with [7x2] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json new file mode 100644 index 00000000..04441a9f --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json @@ -0,0 +1,578 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", + "name": "xlsx_05_table_with_title", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 18126553641942797758, + "filename": "xlsx_05_table_with_title.xlsx", + "uri": null + }, + "furniture": { + "self_ref": "#/furniture", + "parent": null, + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "parent": null, + "children": [ + { + "cref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "cref": "#/body" + }, + "children": [ + { + "cref": "#/tables/0" + }, + { + "cref": "#/tables/1" + } + ], + "content_layer": "body", + "name": "sheet: Duck Observations", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1.0, + "t": 1.0, + "r": 2.0, + "b": 2.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Number of freshwater ducks per year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 1, + "num_cols": 1, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Number of freshwater ducks per year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/1", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1.0, + "t": 3.0, + "r": 3.0, + "b": 10.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 7, + "num_cols": 2, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 2.0, + "height": 7.0 + }, + "image": null, + "page_no": 1 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md new file mode 100644 index 00000000..346a85ab --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md @@ -0,0 +1,11 @@ +| Number of freshwater ducks per year | +|---------------------------------------| + +| Year | Freshwater Ducks | +|--------|--------------------| +| 2019 | 120 | +| 2020 | 135 | +| 2021 | 150 | +| 2022 | 170 | +| 2023 | 160 | +| 2024 | 180 | \ No newline at end of file diff --git a/tests/data/xlsx/xlsx_05_table_with_title.xlsx b/tests/data/xlsx/xlsx_05_table_with_title.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b7a04de7bc5451d34bca51b7775199d9c659b1cf GIT binary patch literal 6335 zcmaJ_1z1!4+urB~fssl|*AP%z1c@;~Qo6y>IR!=wQ%V{n1SCX3L_nmw8zd#9q!}gB zAm8@B|Lc>#-}k=HuEjY!&+nYy{q(H~#m1oq5D*Xmf_McpdMYUeX5FI1a+$62Ae z-F1R&{H(S<2iP;q-ZA&Jb}prjIvq_L2?BV>M1~{C4>g-*`Q=c3 zgxty8SnM$NpXPu0iMW9*J56rZ4BNRoaYjY0ty0HFDgIFbL2 z(^Z6C=uCU0gDL!X^udTq;GBamKg{GhXR3X=#w*T2d48qOv2>8*VU5fBNU*)+mI5a7bFEps1!wxzu;BE{xN^hnZr^k~8@pY}1>@t*r6;hxKba<=5sud#G(!ebLrl zRP+4s?0&}lWdv$UNv5>#a9}@DvszZ`DSJ(wMArG9;!<%NaG+x!vQvv)Ds+Qm-&>e1 z*m}}>np-vhni*ec=>Y8>H%ZcU;O1uiZ1RN;L39YpdFQ-8TeqwxO&xqy&e-)1C;Hi42bf7Q#j0+3#8E zyOI38@$Ce68F%+^vheoll-d2H=M(JDQ&Ur8cDsYJ9(-;v>iaG{`;>Q}?Rzs7>OuZ< zYSK$MMJ9uH&D~ufvM&AmIPt}lOyS;7d(cYC)V#_rSAlQLvdVb6r*GpJUM#y04veD|6ZF^4Y-I83KFSzNjKy*@YtP zOjXHR$rqw<z(zU_`AnsL$Uz z)F%-i@@W-6S0T=JdF8G^&|Ww$H~?A@arDOFl}4mj_wd>p**U*+X|dcyid5&#h3$gv zVN~|yYQ8T;&3T=h*gNQO#+RE8DTrCg!a#F%{L~Su!R;4M2WBBdOF^04^h@_g&gFBn znC*TrTeuotVqtQStwL%}}OLNFJtj;8C#?+7vKEze_ z(zWMor;Z`>i+UGk+j9SQ`^5g_8C6s{D;ui`kz_k)?3f=x>!9O~NYZhj;3kl1uNGd$ z9~^PlAkHDTf#Ckk(+0K4>DXopf;hq}@}Bxy&t-YzT4E_@8_;}ev7_#N1GdOT#`^j} z0IzW9B&vy1gOUbyH5J7VbA}voa==Mext^gnD3GVXZ`a7!*cq9if}mR#iBmQ~1lpRk z?)SuvxgRwrQb*~NF-_FH;wCjmw_j9txQ1v`Wv5 z?e>6*n4g@}t!^E^D8 zhxfejBndd8Y_ur?Of}q9tMaRmvfaL=c)777qd_HBhAvr3$(bNwC7Ru`m9L{3)&9Xh zg_qBJ1PjvP`vMFI1+0FU~K${E8fOOZ22RyJ!rehJUSLA&seDiZi50&G-}47yWz zaY|K2FTG=JwwkRYwmbdYu#NLA7@?+A+$`_6!oqpWl zJ{^4nj*eiF6#71rM|dK&DTHK4>o0H7Z6&2!e4L3o>h52^d;9z2n4N9#oJ1#nEz@`X zb?a5ryF%r^#s_}Zm*EELSKMXZz1$2Og62c5FK>wq_@;pe4s84*kZbU?tgG8r{#x9?R z&;%sL>zsNgzL2it^_y3R+}majI%;-K#fG_Am^d(kJR$2x$H9Bhzr_LF@9x9X&ej(0 zDfHLZUlpWYe>!^lHl__BK*VtL93i(@_TJc+7K9*10RFigJ#5Krdhr!rN1IZ4g30z$ z220JZiwkxuujq*+U-|iUr+TqyYc;=>-3V<^_U5PT@zS|G8`)^>I!f%2ZdP-yUz37m=)B#%h3nR`^`~rLjPS9BQk7F5LPyVcCE$okqd9N6w)36@BkS6XRHttGMewvKx%X7Iq(P}uUfIU)-0Qw)_ zqsmaf(1ML_#niS2jBus&&{`qL%7OkNhNGK>XE{#$>=G1sUy{`sLr`yO=8L@BZG*)! z)$_y=V8W5r`e8PqkTlB22T`1a3#29;ze1yzu@bDSG*_HPxnys*%PNT3I^co8Lc#BV zG=e4k!NtzPZZ2WUUK~t%Z8F}5Ktu{MVIGjUXarf}bq}_MD?pBlRcpvTR1@=?SWMR6 ztMk%dtdAJC$LXSPw!IH$DqX7{M%>9e*x~&!$2I_6WMz=r5Dd{=y56EVH#UJa@RM}A z(A7;btJgn(KD5SPd9G>fPF1$6x4wU? zKJIlpKM&`+d#f=0ECk={W3paMbLoJhOY!5LD(8UlA;o(Om#}wZ8Pcq!+}s5hMw3ZN z&OtASM3F9@^*P!Zs$#p6kmTnhdl~BY;}a)H#IJvVdGm`TyY0^i;`Od*NS`rdvJbn> z{DwFV0HBW^Z~l4qq5e-5wzKrGwb6ll*t@{~D#kBjg&$xGlOVkU6qhnqxGClJ;>mPx zq*_>zNSF(4v=f%Pz|w0= z?7>9UaRycRW849vey3vtZtxZS(uG9@t|PJzguL6JK)tn|kG9TSf0kJdgRyuSufJsO zN2N%S)8Rek!RbKqBmKVZr`6Bkmmz0(7%1eo>vy`*8&w4U8x+LoEB>C9|DKQIn>AX& zB+qtyBY2O*zR3|}=hv#K)Ut{)HQjV!{2ZR!7V_ipJ{2X&i?Y^e_kh4(R&Qq2T=-|f zoKUlHJ;QLUdQNNqw{5+@+nKCPFu1QeyT6Lsb{6P{h`DN{QRaE?3Ka$>WSVH&HX4Khbj|u_kGY1v!p7BF)5F!x zQ^?xY!xq!KA+>v5TF_*|5g*$e6<07DHIn6Xn2r`7&|xgS^yPfKc=Fcn@tGtUp(d-| z4@)nYbnW-Yk}^q1FTfig4o+?&`K933pzmy1uMZryW8&VDUW=km3L=yYQ5r$4H7jId(jmG`JSyvgyBr3JPOx+u z^oqEgamAUg(l!h+qhb^p(sj2-ks0q6TkJm#tE(W-4Bnl9Dz8KPd5LrSTLk@kgX>>? z=m;^oqvca@;mGr&)f@C8o?D~E7@85ZSv232-uRN#bVSxUfBawt|DCD!j*@i$+@Up83V3z z-Y=pMEu4r=rFW*`x5pf%mE-o+#EK?}RN;W<$v=brjGUVgKtUp$Wz|t1R~kC0*=7%$lzvdYEkkocLO9KmDufXO!B=|heNv-V$;1!m;`9R-g);jYeq;WhPE_{1SFgdM@A2(bcDK4dZ z6gyi1^mOe~kM#Ftf4Uw0oLN?5MCL-Fh+^`ZL$wee4?uvFj7z}v;*B_@xqIG%DfIO+ z9)InV2@`hQux?-^KZ~QpTWW_lUkBz%UrJaDvMai3XS!>PMh~>4q85P(lDcDBlSrx) zHb04zy$!JoD4EUN&he}(tFLoz0Bo8Tv#*75K+sF{Q}3bQ{~X#~@DNw#{5A?=NC?S-&!>%6ya) z&Y~w~U`M(~;QBr0_#8E8K=`w4t}$n34Ir7v>Jh6IfRCF#!CGy;_~d=0Tl`FgkCuZ< z%ID8Z{1z!4+~2}IHP5h{kTk~I!$L56w)E@fkWHaOJ_A!1LKP5St9hxT!X#lIt zl5ZMBDx7^t<$Q3kpyMe+WHxh0|7=vMtsRe)r;l&w9gYV&B8sdM1g)z?tf`e`DZ3y! zx&8h$9?pYn7eUrITb@Ic3JRY0ho0+^{+e07ofl~L7<;$eux8?Hq%G~z0H1e7P8$Dt zj#IMuw~1kT|6Bf<_x*F7s}vW8 z3H>eH=sTm=xypi-1N_+<{kg)`vWe*-ev1$q&nq3ppNd!0Hm0in zmWSwE{X4t-r^f!Nel<~Ha{F%~r@W&6pG^O!_SJOo- None: ) +def test_table_with_title(): + """Test that singleton cells with non-numeric content are treated as TextItem. + + When treat_singleton_as_text option is enabled, 1x1 tables containing non-numeric + text should be converted to TextItem instead of TableItem. This test verifies that + xlsx_05_table_with_title.xlsx is correctly parsed with this option. + """ + path = next( + item for item in get_excel_paths() if item.stem == "xlsx_05_table_with_title" + ) + + # Create converter with treat_singleton_as_text=True + options = MsExcelBackendOptions(treat_singleton_as_text=True) + format_options = {InputFormat.XLSX: ExcelFormatOption(backend_options=options)} + converter = DocumentConverter( + allowed_formats=[InputFormat.XLSX], format_options=format_options + ) + + conv_result: ConversionResult = converter.convert(path) + doc: DoclingDocument = conv_result.document + + # With treat_singleton_as_text=True, the singleton title cell should be a TextItem + texts = list(doc.texts) + tables = list(doc.tables) + + assert len(texts) == 1, f"Should have 1 text item (the title), got {len(texts)}" + assert len(tables) == 1, f"Should have 1 table, got {len(tables)}" + + # Verify the text item contains the title + assert texts[0].text == "Number of freshwater ducks per year", ( + f"Text should be 'Number of freshwater ducks per year', got '{texts[0].text}'" + ) + + # Verify table dimensions + table = tables[0] + assert table.data.num_rows == 7, ( + f"Table should have 7 rows, got {table.data.num_rows}" + ) + assert table.data.num_cols == 2, ( + f"Table should have 2 columns, got {table.data.num_cols}" + ) + + def test_bytesio_stream(): """Test that Excel files can be loaded from BytesIO streams.