From 34b7353cd32ab2e6ce9337f0fe73efd7b1a34ef7 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sat, 16 Nov 2024 08:34:09 +0100 Subject: [PATCH] added the unit tests Signed-off-by: Peter Staar --- .../groundtruth/docling_v2/test-01.xlsx.itxt | 7 + .../groundtruth/docling_v2/test-01.xlsx.json | 2289 +++++++++++++++++ .../groundtruth/docling_v2/test-01.xlsx.md | 33 + tests/data/xlsx/test-01.xlsx | Bin 0 -> 20214 bytes tests/test_msexcel.py | 77 + 5 files changed, 2406 insertions(+) create mode 100644 tests/data/groundtruth/docling_v2/test-01.xlsx.itxt create mode 100644 tests/data/groundtruth/docling_v2/test-01.xlsx.json create mode 100644 tests/data/groundtruth/docling_v2/test-01.xlsx.md create mode 100644 tests/data/xlsx/test-01.xlsx create mode 100644 tests/test_msexcel.py diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.itxt b/tests/data/groundtruth/docling_v2/test-01.xlsx.itxt new file mode 100644 index 00000000..72db0426 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.itxt @@ -0,0 +1,7 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Sheet1 + item-2 at level 2: table with [7x3] + item-3 at level 1: section: group sheet: Sheet2 + item-4 at level 2: table with [9x4] + item-5 at level 2: table with [5x3] + item-6 at level 2: table with [5x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json new file mode 100644 index 00000000..941525bc --- /dev/null +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -0,0 +1,2289 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "test-01", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 6822153538473622425, + "filename": "test-01.xlsx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "name": "sheet: Sheet1", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/tables/3" + } + ], + "name": "sheet: Sheet2", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "first ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "second ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "third", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "-3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "-6", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 7, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "first ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "second ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "third", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "-3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "-6", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "col-4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "15", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "20", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "18", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "24", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "14", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "21", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "28", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "24", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "32", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 9, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "col-4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "15", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "20", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "18", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "24", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "14", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "21", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "28", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "24", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "32", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "col-1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "col-2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "col-3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.md b/tests/data/groundtruth/docling_v2/test-01.xlsx.md new file mode 100644 index 00000000..17afcfd4 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.md @@ -0,0 +1,33 @@ +| first | second | third | +|----------|-----------|---------| +| 1 | 5 | 9 | +| 2 | 4 | 6 | +| 3 | 3 | 3 | +| 4 | 2 | 0 | +| 5 | 1 | -3 | +| 6 | 0 | -6 | + +| col-1 | col-2 | col-3 | col-4 | +|---------|---------|---------|---------| +| 1 | 2 | 3 | 4 | +| 2 | 4 | 6 | 8 | +| 3 | 6 | 9 | 12 | +| 4 | 8 | 12 | 16 | +| 5 | 10 | 15 | 20 | +| 6 | 12 | 18 | 24 | +| 7 | 14 | 21 | 28 | +| 8 | 16 | 24 | 32 | + +| col-1 | col-2 | col-3 | +|---------|---------|---------| +| 1 | 2 | 3 | +| 2 | 4 | 6 | +| 3 | 6 | 9 | +| 4 | 8 | 12 | + +| col-1 | col-2 | col-3 | +|---------|---------|---------| +| 1 | 2 | 3 | +| 2 | 4 | 6 | +| 3 | 6 | 9 | +| 4 | 8 | 12 | \ No newline at end of file diff --git a/tests/data/xlsx/test-01.xlsx b/tests/data/xlsx/test-01.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5a87d4f646dd7333b7d6724c304efd86f48ec060 GIT binary patch literal 20214 zcmeHv1#?_UvbC6*nVFfHnZc4QW@ct)3oVN+i`*k~361Jw)zCQ#Q23fL#d^Ra?vwncg;* zIjxnXhad)N4ZCXvJ4m)O5;p-PNB1o*W5}8$W>hvQBZ46-RoP7R;(X9^}}ZNEEB|YCcP!RI5Ko z!BpOr?c^rPgt0NAF!3xq+1nP&+O85S6AV9*O@!%{_weOr#MSc4l71{N)v1?`l!BU% z%C11jc%Z@ufFLPT`%x;z)Uq?+XM4s=4wwm=+;v?LKBa*h{_hY@=Y9jGzSKlABmIn} zteZ_S`~mfh=-q)Hu$fHPCqO<14TlOkUUNupSJNAJV1>cz@%?^0aHc!CtC4uJUH61 zthCf{Gq`|&@TC@6%e?!12PD=iT_60LH>eam8{JRNcWmEP^z988K;b{^yiS#Y`1<{c z%)1#wzuUR4gRzw(J>4I#|7YL-i;enkqL;?Y%J(q9gq%w}hYsG&t;He;NxKS4v=FNJ z_(`t9*G1=$V6SzM;~=PD1p&92JmgQn3!@ZXsvZrM5$522d ze&PDN%lmI5VOoD(Nd*P~z=H$;fPWtuH!FHqTL()6TU*ON9AT;IxosL3%FAo+9X~<4 z^2)PRfK)z$K!p;ws`u6C$#UgC|e8m(MW4J;D@?8kWtP>w- zs>==Pmi;Bvlt8an0@zRe)Qk%{RaN+0qCbZh1BRi}H+h^1%H|9QM22cgG*b!QCb{*i zb$^9_1L@I&C-0h{Qbe7|HS5#N0AyK?^-QY5&@5OxmTTrd%*>pSa8^Pilt3yd*xIIw^D?BHy5A zoo28+oJ*&h&L5s_>RRu>(tOuSwnd4yYY^DWYr7jum{sJ& zmID-fbL$=kaNIeQ0Xw{HM|%%QlDD%;?q{+}ul8b1Y=@(6E#B>C1|CqpLK)=Z8wI!& zFKP)2hvq{&^Dj4Nujq$vX6-ysN29mCeRzE6VL0)g3%JR19a+lvW{u9oTH)3?FhEZI zbQ)NApl^Z9yLO%C%A*dNY#z1Nv&ti0o?IfjeAy<=U0-2I2|57Qr%qbhB+Nwl^TKQ^$P4DvMFpL@YYz z6jV(1Rynsw&lBPD87u=%{1j8*?Nkh&i%cc&6rxoLoz@(GR})Sz0{Vp5hhqtxScc5S z`9)_d%d>u$?dW)pa2{=~ml>pId~~Y~fvz8Ec^YUc#d*0)#4JMrWyH$?n4S-lm1#13Qo%Lnsh08)Dd0_m)o-!VkDX5I*(FnBf1?|9@sgrD^bO8n!2vu{P5SwXX@~g z$7|i~#kP-ZA9}gYpKRZ#et^w>IqBWJzU}RN7%17B3$OYzboolh%j)KW+E+8PV;b3a zGQ#x0gX8a$I;q`tv$%o+%a-EvZmj=p#!^N3`j7ACEB^iz3jhM}-HiWqcYinLf4M%u z_t5rzGWg$pwZ@NG^fJH%od>rEPPN&`zbXuKc~S^An1r?NF7)X-z=)J|Bw;Y!J3t79+9Dl;JT01 z>jN|NU~9d>q8%O6?t?vffeAen4xNzoiMBex{88=m8Zzt~_QfdQ!(4Dwr7WqsJ<0mNEWY2er2&=zr3j1h_b% zmG=mV4jcdg<6R~GasZBI#>P&L^gnNme|UoQ1ntNj0+f(*^nL!YyP}8GT3Ff8c&nu* zt)!R6`u*HEi_yXtgQd52a>`Sf>wFjrJk+m_I^No?>9s`Qm2#=&SyLprAYA$uOV-ej z#|K==irYNr7UeSFpuB-O^dklIkvUb|3qWoGLF1^kH|-Ocj@h7NYrWT#TXKe_`*;N| zBOh)37j_8CKH!94&U|#a78*PyhM8ee5-{%75k8V#AbdY5(-!Pt5`miIjLoQ+RCG~u7VkzC; zo_Oup7XmqNv|A7k{{B-0zq-=3mto&|;tow_7v2Iw&fRiJx{ zS{IeJaTk~OE0*yP3+&DjzBZjjI6Y4x5n5_Md3I~MQvJuM*9Y3IL%v?4%ANk?waSYK zb0HEA1p+Wbgik$yQVPXcswm&`HpC7Qh~w9i@pq_wc{T4_NU_hdSdL;!y(4`=JEUM? zH>lrwa60@~>+#f?6S>?-D<)7m{K@(Z$Qbh<-ktZ`+xzeL_Li6+4Hro8?xYPCwVt!` z!6cz{dOyUMTPa6b+v*x&#f3*!2eOmc`q(+nc$|g0BRsqNl1_ZuC1H%wXS$tDB5^;k z!dN*jSTLELS&chULFgK|`QF!F*Sq*c#3YB__5D6_$eRY<&;p;j9-|J^${YpJycCia zNRng{s>nYxCa-?I!;(H@*7Tf<$e)P>R1;rXBPfy6JY3UT0ySUIvjoC?FSn2k5@oK! z?@=2f*Ey)xMTvzM5~Tupa<+lAk_~abU*jc9uTWqrfI5pZlj=tIPtE-)Ts5VHjQou( zjnu>_wc4PhEOnJC(Vwh=ZZfw$5 zU7(%Am8HWk2k6G=rn&h^i+{|4|2>uR1L#B@y-%ZiF#n@LF#Tx|S}|!12*DfFH~a)0 z$d1ryS+S}}D6I6Vb0cRpjKY-$h8EoHsxRl<2&64aI@zBEu)v1B04F$)hh+{ODgN6!SwFS&;6M&I8a&%aW zm{zd#;dfD*xIB#?XgY^c1f!hs2!t27)i2dAS1M(nD^cEp{%@CQL6TWk_M0(*&ze#NH5+4wNUP4h}Z)7JGkVO-r-=x|Ao zD~~qf%yWTm9(ykuPxJD<`NKn>@^fue{*K${ptm6}HAtSV_0OhR<3mKFY+FH-4^|M9 zk$yQZ3Z9=ncL)Hv`VkRcJbEQ*o<9a-TY-kv_5|P!H6nw5@K5A?z`MbG<1V~rm6}ZJ zA9yToz+-ypaU-p%oW4|5DHl|HcBLv@(rNzscn{lKZ@(xz<%yqJ%7w1SnU5BVDN1n# z_RBbu)@%Y9V1jk?wgJU}uhmh^Ng&){dShMYyWb{CrrB9Y#^4h^^+Mn(`xfuSf;1}ZLvML^1` zlw~G~!YwlU$o`@xayOzq1e9wODK3T*A(aizL6+U!Y`5iqW_F62D}iWoW90f&^PiigRUfK54IqP59$t5 zx3bj^0R&|Z0htL(2MvSs45e`BoFaR40uE;-{}2 z;yUjiyVGV2c~W7#{b;Cva2edqf~Y6^eIxn0^Q9bvzfGd*CGzUsRsA*Bu?$K=`7yu< zOqB%TbOIs@YTnWDh~!n#!o^|yc!Zl=^W8|No7^SI-3f?BtNVb88<30ONd5KA8qq{; zkC!_BisUO+P_PE`aF?6CD;JQ&*H%$X@91+{+oYQa5iXv8oF8uWY6TnLdw@vv$MWG% zhv{TyY;8>c^Yy2%JkcDB!EQk903Uo47}cI1J@nM)oGf6!J@eT5ti3BCnHZuoZYQAB zLX83lbucUK4`eC6r`xeIIAeOon5z+Q45O3g0&pdn1`?}vY zFu)$;c$ZL;{0*9jWKVJd`s^N$XeesJy!ymLo@_^IMObOe=o6;A#DW&`cW;ODotU$2 zVAQx|f>`-oz%!3HrSF1uRHh73d$^hLAU;}lh=!mI1{xXAUXUJ!YOvlN8qtihJc*ka zvwWH~hf!JKYTwsZDDW8@WfNzCnaRd$Ees?vPM&!tA)1na$tFQfwBh$tu0|l6l<*8b zP~Cq!e%PNVS#Qzm18+i5_B4`qFueLQvIlf^H91O|G;^B!N@2ASSmpz)XMxUcar{=s z2i#eM$z$H1SITV0^5T+gD4lgO{jFsN%7U#ux1+v?^N{69e{I@UAm#P(jQJCHj($%! zwnB*Q$)=~z;`uC1E1>cHK3qHBQC)XjFnd;heFTBS zxfi=9TbfVs5Ok83I-MeCFy?#`;~=uC3!VU)ZAiO&>>I6auQ!ngZ|)a;==7Oy(702^ zxlF1_tAuh%#!dLIuLEx{H?D^rZ?7xxy01ggUyPqid|zMZ8a7@MJw8|jQ|i1vl&I_a z-dxiA-oN7Oa@F2}WF~DPVLZ$JiM5IZ|5XsO&;`rNy9>qn_Yz|>D zqnFEDA@iZe8h`A3DltV{sF5f|aUOkUh@jnMJSd$Ao&kWr`A7;auQoKz+w|eHeGFr} z-576Y_}=CeRq(ZxHWfHxm02cJIz=l9q@u)I{{*Bx&zfnXrVDRAa~G*D#Wkj=w@e0E z%489$MDQeGbQirPx}aU1aoSEn;Y7-DQRr^BSa)^jsGyHfvT^^DTb|vC5@#19HTE+^ z$)M<^6|IT!ssz=K>v)DwVva<4())3K2eh0ogdx5APEU+l18hnZ9%s|;L7eo&q0e3j zTO7^~`I0LAK2#HgB8*&n!=U{8w@_1eo8aju&UeaEG_DiH@lVXwxdIPIJh}}O(Gm!%_y^i!I4~mSh0;FT~+mqbpIl`+J3m!mPZGz8KIt`o@+6!#%Y*ki*fk6*>AtQxX$QbGRl$(lve%D+Fk1!? z>5(O86RS6aB96QShpnqeFAh^SRH9pNfl)-2dXVQ29Y=gcrUB`AcDMKnuXfXj+9BOm zLtlZTQ{r&DthENf&474(2v)xQ+!n}30Pe)>d1VQ6YGHWG4wLOg#X+{goUd%#kZ)o+Bfn$Ulq~TB>W&XEmi!V& zZLvy-P5q_dn62?afn}18)W#j5nFg_pok^&Jr+{4v?tB0X zt%^=h_JD?)@`Xavs4QY|*2j-el#Vz3Ez6web2Cp-L&JywWqsQ6t}F{}F$G`gpzGLw z)QWa-K7omI(myqj?gXF=dRldGL)W~v>Nq|t%wv=3AQSVDKf1z;ek9q z0;1P!_2&A@w#T28EZk_9tJ2B0XjiCh)5mG67FlIZ_;akd%@Gl<}VWl+v&cOK#=NmonjUB7DOn zF^&zT;U%8HG+1NlfmLg3h31j3m@v(j-QShYE6-a`+kGnLlc;Y8NDk7nm%p6}~t)Ltp3Eqj(iNVUj2#;nz+ z1s-rK~uw50tzW~D_LR~^$gLZ`WYWVa_lM}Qi>M)=sq6$ z%mo)stU7(IERoHL($4}r+AMt8%*tRkRXBe9i7aL}Z%S?949G6L+)}5y!KuJV%FJB| zX!nj#6cTc$W?Ab>E*Fu}YNXAJL*BA1T{Lf<0^9&daU)=Ws|8il;*ICpmfWRx-XjG6IStj$`xe2hX{+1*2E5x( zqpkogNC)s>Lw6zF5baQ;&!QRe?0aL%R>I3?RWITl!&&zBx$?mGwe|KpDN)0(fpur= z-8TXB%NUtkx(!$KGkw;G@8L5aq}Uf4$owHMM3S7e2LnZ!9*))r-ee&BWWcq_VP!XM zW@iFL~fxo@+QXKby@$N z`=b1%(`Nb(#zx9c4(2wdKULc+wkraN@qJUIyIpuloBJr6D8u8!BKagZg|!pZjFS~p z_0z4(4Z&dLfY-y7FM{K}WLvZTR&r7BUN&Ixu49;!`Iggj=D9!>2UPh)5bZNjPnXXq zOiiJO^r{d!(@CAlhQ34VAh6O}x}&7r(FeNJ&;UBSP>Q03lKiE^T_XQWcaoxY*21`L z=iUnP#Q;{EGIx$!9uD2Lx}d66gcHuBVs_2aDc>sJ>?l5Ew+YSf29UO(^9Mfd=>a8_ zjhO*o=?jk|PjLT9e08PWmeqHY_Iuy$_-nDt$U)!r56S8O`sdF8Ud`I}o!h_inZI>i z=I)I^BrkLd6_W%NDb1-a-vaX(B9U~)*{6x9zCBq1o6jqimCe)7w8vg)pEd2AT@dG+ zJ{5zoC$fW<7>4zufz*=GP#-zN_1ph{x<% zY6eGQjXRkd3?$Xk9I!4itT#K17ta%wrZ*`oxA`+U2e7p7Kfo;c8*2A^-n22M{$mu$#hH}y6#3?@kJq(*>bu2w`>-kKcU{`6UYcqD?q8yvJ6A=J~ zJZ`WETiE{KfYf?qB0^_d&OuR)v5YXwL>)2$yAMPF!L*Ffv~x=Xr-kKj3j!_pfDvUh z5W@R7izR2+ zm=jv#z9@|C%&d-E$t#pds3Ehquz$!CvbzTMs8c@ za9K>1m0dU9jvCA!^1)W^w=TP&R8qHhe^}gde!1?8J3%-xq&WXB$ZH%=2P$z)(xxta zBZ-SR>&wmD1=K+7eUV2Rf#}dnGOfn0br}J@LgWlEF&*Q=4@ta$S|&%alLap8*?BigZ&~M^S9^%b?iNU>Uz(?#%S!BK_$>iltw!yE;Kr`mk@4&p* zK;D8GA9xbkRx)`sFl9qP`&~mmpVC7_UA1&dv|%D7M`q_|G;_QK`H@8C1*Q|Q1WZCk zJ*?hEk=zLPK*a~wq|-m^8N8v#n)>p$miaB(k`idT%?`|*p#P(j6DWG{vv^O0$*2GK zgkbnSA-pgB{=EG~XEkhBB#?Y`_1=KLFvN!9YVbG_UkYSzcj;w>Bbx5(Z)h4xZ#NH_ zq|&Yo>%Bgy+z3k6VyxH$S>Zegl08wlEZjJ&&mDg~XWCy?P|WASZ+R^`;9`k!kKZ4u zdU^`In(uU$lDrIJ$egKa`Ih%|wZ#8+!G+gHO|+{jXmA7&*NW3;=t^qLXwfj@Cd#ao zc^R-j*|14OjHJ_7odMwh-OD@=$5XMOCh}3(&kN8$Dz_Gs$H#);+Bb<@t=Ac-URwoi`2i;Fj_ zY-~*N^v=LzS{=II#kv;Ug%ZLdaeAwm`69)U$%+R3ReBvEOW2-=@%k>J!j7G%0`|}< zhunviniN-DCUk&L&HDWD$2}kIGoUm-tu3BgX-v9P0k`^iPQpcf)UWX|RT+lO@epoD z-?UkobDcvPhct9V(j~Ta0aF)lc;ToThe^+yg}^tW!c>KKy{DdbbG}WD+UHsN2qkV8 zE@U@M#3wIGNiRW39}duVrCkg30Ox}iwed-N7~@a~YT56!6O@jxX}pY}Z;_V2JUpvh z_!1-4-%x4si$r%zj3)`P+A#HMvj=5mUL(pq{kO=w%G66_m{YgP%qC6MVxI*=c5r3k z8szV+$sflnW)qG-1|oH$FyPdIvU2JN@EUd>_D zkFQ^fG~6`f#m*dSo1UXVui-53v&Gh`2-wxZXD#i1T19aJ^wB$cj)Vmx9FQH{_AN)BcNR;s!c61zN69o3+AyGh8w6MC9--y3Cb9`g;RPKqE< z2rxwuUMU9Uk;o&(5g8pcnGln%@k=LWX7zR0HJ<~RO}6+J2^zg8N~u^Q7&s@BTbAcN z%j@L5@$GDiNASQeUvs8iDy}w4F|Krj&~k`>b0mt1s;G)-=N`I$PedVGP_Jj9kSU9i zF>Z5{7-TlXb&sc3DLc3WlwU3wEE1cD4bjvKp280Slr~1(+@Yahno` z3G@iZ*c2V2-wX}kmS$9~0#;x#qoJy2OHrqo`uSY?D^+=Xae?Jic}ASMEfECejTT(ZDiCISuS08(HJla^SM z%)pPbz>g-Ko0*?%Qy_t=PC~`0(VI;xDYFl|X^2Gyxk#OgJMW*e(Nmwk>p z=Ow}FuuoBA^V#7lvwL0^suPGRK9?Sl${XeQ(tj2enCl90Sjg~Ww-hve>@}*}Zc|jT zU@qfky_Rd8!9sQ;Nbp#eKN+;gXAw5%up(onC*d3}Bg4?qwsw3b#`3X5FqVHSIsa9& z$Bw~THZ;TBa{A`6Y<;(NAqFggB3<@P2~N07qe-U)+Nrf4e;z@Iv&OA95FtMglQ7$q zNMx*6mq6$+Qu?LPO9aBMT|7tt|6U|041qA)HVPqn&=N5l5r9Uhu%?gjx&dP9WSu(9 z4WaEtEV&8+k>w~kI4lc{h#rRMxdIw-IPFw0$lN?6Y>An0yWLA?nUsjf33$>qYlqa} z9L&8{gl8X#Q8+Bw+c31fQJM0||1Chrqy71|)3`!Mas!=HtQU8?c zvYUa(4Nr^&-BGPrVJ>1rXOc8%_*K{vQ035H>)b~Hb(x6BmpBEoJ$Xe9@>E>~$EqV2 zD2MLuP-fwZGe=z&M?4OQQdOGR_4aHRJn$2~5L&;N%?h$dk3T;|v$mssvG_KD4T~b- zwuu1)XttxY5g0mUsfuy@wL=Yyuk=}c?P;e-AJ|>J$QfPJ(Br7H9VzkK&Ng}+8OHO> zC>vujZn~XHp%tLL+d_B^k^inpdyo-`mQX&x6_Rc|@(=g}5U`jLNFF+g0wDDpSRdCV z?9mJ_$=iMK@nyYO47tOBTA2N8xO|WpDbVF_j=3#Go%B)zow83n3FB;?ye_gq*h+<7 zu#*mP`wJ+~`Qe_F6;FiG6^v`yytPsT#Uilxj&!M9q%uGsN>| za-H}hljGsIY`;`|?}aDl)Z#@he)g5Bg^$U6)DQz@MN_y_l9Ms9Ki@QZEadFagSzfe z!I)E@$yZrdN?lP_1PT>!q$-yC0RBm{F2?I60qsd&UQ@gy43vu-lwQ2!kf-SNF!-w; zzV~9`L*JvSvU|q?Xa9j3QJ1V`#hJ+3yeepfN3kxq@1#ND&co(XsEwLA+R+NW`2(FP z8JQ8>f}|#za$#OG9r=D~B}s7qrZD84jV+fK@pi1+0Rs$D+tqoBq9-{6;hi4&Bp4GPeTt()QH zK!JnSPDytO@!<(8FvX@ScToaFxVs22bmwX`ciui~OnW^jNKU(AlNReT;3=z@9QVi+ zo)j zU$Kd`?KTI(D|M2u0I<{ddR6731&h3dL+iSRX>Yc0;CAPebnB#PU0=h4;MRIO5Gc== z5b~}I9%dgCo}DxSU2w5iNZCWuiT@+{ZzgJL%|SbxFI;-B|R1ly`(7A-61H{AS&ys*XVf2%*bNuKs3X`Wvarb!Wq{}=Pn!tUx)VVWT_F(b5g8Vd z0}%mj@6mLb(alL~n`{Isz{xXcQjM%Veg#!TA%1NEaN+L2@NMrLZ0!2s;N%`2N5!9# z^c4ZIQ-yLW$H~rCFMas~xhpiVh&hvk)QtpY(;RTw4@st~6#nHH0Ip>+ZqZuuAALL! zrcFVfmIo-=a+Peu>I+&?Y_Y7)ZooDJ8Dk{lr#t{9uFxM=)RE%1 z=Ja5mdW=b=g3Oujhxg_7H`%PWDwqQcG1>r%{7&O2BtV~l&v;*2{L7{9*0)FCB(Q7M z0joi@V22?!1f7IDP~}N;F9DSS!HcI)f*;+|aWNy_p8HUuQNleP*Pa!s6ONY=E~Q-N zCF>N zJ@Ed}Hy$zQ$}dn9_b2TMmLfZ~zso3XJQvnev@XJPhibGG&n&Z}HH0*ED*9yZd4j#j z%Vh;B&0#sXLzn?|rS(b=QnobUaluTnWhdc=KeQp9J06~-M^nNINJ~;L;khU@!d9d{ zVLWd7%G}DnS?ycldU(t$k3w;|mTO3tdN~o9*s3ztKVzf1cEJG3Tm8dQ2 zlS)Q~vrHok`GRDnR|5U21{Hoff4YizZmGE1`LqgSv3wi#or`=)zA4vpR;>?3SxElF z=jN5KRxwUT&2V{m4W|MeONnL{%m=03wyuks-Z_5Dr!(zG-`Z4mPOSadmAluWh9GOr z@xn3-7ND=C_$3lh(@Vmvf!c}9#CRXvhP~!lvrnxnjgg-hu5p=63W*uw;qv}PdomRYT=+16u!Jav3XA}cwe0P zBh%(zRLj=N*1_@Te3qLaXw%JrAXl4s|cu3#oi zh}6aXblmaaOF}=TE|I!QhEpd2C$@6GX*nNKS;BF*2nEc?3We()PbkWio~uIPwzH|V zL`z@Yy15tt%^Uopw}~I@(%p|3pYFe@?63meb0{$+Wh0=Ggn_u6x`kM$7j9vZWNK9x zR4h--S1Wy+WeFD69g=<#s>zr=CRY0pR*x|?EEh?=7O0GXvXo|tuHu^_Z!r!#Op`g@ z^OhPCrZw@@H8m_ONAjk7%ZAS_)KU2D!eGyBOsdoF#(5pr zKc=R7D5j$gyl=F!5&u5@{~WfTNf;-ZTDB`3Xg>IQZ-HtM4Di5x&5Q9?vv?#9+9(Nf zfn?0^Ph!%wWssu9D4!Q`M}5CKN({T;n5NdZ2&~BBA%%Y-zT9c@I$)sp(NQ05%B!XX z#c$~hiY^ia$h1IIAY4a}flw=Xjlz zCl-sh=_FK`iKZ$mD#Lk06H;eys?vD|leiMJ8njK`?PQ{DYXa^vgQ(>0u6aEqJ{VL{ zIC7!7YblnGQjr*o00NB_m!`>S;`K6l5mZq*Qessp6f_ljh<#|FL_mY_SkaaR$PJ*t zf_Z6olzykCYj|u--Cy$k9nKr5XDCERpf!zz95_C;U8wSL8AeFKWixf5mM(9r@NT-iIp|a?Szp($4ME3dl!0t+wdS0!C{3H3>{z%YShCXfTHVJ$! zRKyk=3jyNqUAwK{n3GOM?B6Hp5VX1lvkg6awBIJg;E*mskC&4)I*5xXUkaWWXG7H|a+l#su1JcHzB z(R#f>L}`JLC)7khX6TGm`3K%6-cT6p5mZCDu7y<*J?Sgu;Wln*{sVyU6JM!mxw`7n z#_8goo`so9bt_=P&bAbP(>N>zpbTKol-6X=LGHIwk(ui)o}og-QW_Id!$;85SQ8nn z>p8AT!w*_*h{5KO;Cx}s$JQkck-_Gln)mU-PI5cDaJ>M=LbxrIXF%`$tDcnmc=Mw; zq~N$H7;)VlC*#;PDY^8nO=(oH=ppsKjdd0|z4KCwOGCP3^p~(mB@Z~55L1EXg$y1Z ziZTcwxwyD7US_Kk_C|qL7!Vx&6*fPXe62$RK}Y8)3Kbm%C0=CXG+=Kv`fHnwEDuGU ztd|^CJc>oNxKYd;BIokpTB(maBZ*AKg{&mUE0$!lDY?D(57B0VtXGPnZqkdO-`53E zza69wlB$Z4OT*c0hy<`=6x@8N{h&0{$T{Cw|LBEvZ@xX?gF^lyxDhimD7Sn|Zhko6 z&mRT8#c=yFd%fZOVxCB%xO|u3#&MlPGGES)U)KD}5%tW1cOA%p*E`gSZxP75itcB` zmd_3^M=2cPM=u>_!y4pGJb?hEku183)HDQwd>4SXhym(2E^a(!JspKsoi(4K(Hrd_ z*x)6O&SA*dq^Vfk?fbQr4>POoNbPU}f`bW1u>>Zd3{;2P(#mmDxj%A0XC?fWmg7lH z%PSfWEJvBVuk}#gCPU^x0mGFbAm}>*W~1CEMB&Trob{Al&LEezbYb?ACcx&*n0sQF z8?bs48t0FappW-Y^LZ zVME7njASuqg*_y%7-Wy5E?6zXbv+~;esi08@^^d`XwmsdLe8H$(@C`rDMSi4hs;S4 zzVO#jNWIV;dj^_6Pai+^jB@*82Ct8hq=)p7V2NnlF5uO@8X2r@>?B?k?(ID#wb!@b z$!{_|4nXHZSN}h z1p(LR8+baPhY$IDWht;l7A^6Sx{^jk+k%hi-qfu$)ztF&6=JO8`uIVV&S(5v(Jm-g zX-_vCqqQxB{MSkO9RC~hJML@>2(FIkDEs4d!KVopNe>l;1mxr@5wjo~{WUHvrpOAZ z*N{@$dQV_zT%2?3*i!o7F{^#Cgm%v6Peu=phmUWJvI7!O>#r%fh9(Xy;9hD}lW++r z=LbGtMaK!~&Gt&%>W+`iL|k>`Eb*n+1ZNG|DDw}Jpk z1jc&4Xkn2iwEimGG1@p(IBCPp6omAo2hXcge{iOF+3_UEe@zM4LH50OI9g*b-HkY42;tQ-C40KDoUMuq%y#dC`GKLQ$)g}x|Va71Z*w*{ayT5z1 zx~|2IbOeYpkub78%BZ^CcBLI~Hx%0oNb)M{$d6xF)%=FD6c4uKO+~3rHKrn>g!g#9 zLAm_xjA8{{7o5j2=Ul-v#xAF=I&xojpegQ*tKjI9(nx4=wcm0a?N=-Ju`fEZf;bx; zVVwsXBm1fWuRxB(oqC*uR3VYo-Ytn_2_oOD2>rH|#DtVOQPK(6x3?u<`*r?k3Np zZv74ZSYXUFH@Y4}VRuN(a6qg=%w-AR&P(*f3`*$?uq3{TE)r9G> zJOd@IP4ldoT)}AmN&F%>;iGX!e-XbKtESYl6P?B{Af8na`po+Mz1#Bovk#965|Y9| zN198?4)G&OzWk}7ratN}Gpm|#9G}M+Q(VeM)ChQ>fg4W8y0gBU^kc7 zEyzo>X}g9;*H5H^cu#@AE!sLa6&0}sr&-*Guwa7;d*lh1$fLl?RHPpCxIAVU1#D_V zFCT#wkD}6*c$LI#t_6s_&=PGAV)lvsJ~0i+A)YdNF~WCi(CKe@xI$fs1<0CI)^hcw z;t)E2yc(Hn!O-TjsiU7j2$i`4f<*ps{K1N$WAD;K$vg!>fIn9}_o9aVQ1<{cepf#= zy__%&O}DFOKC0h76DSv`o}7k|bq(LHpk*UH-5L3N!|9R*-}fxt?&g3u40^&|4+~6r zkaTe%Dqt|Ll0$0DZ$2((l&@FppGo35_ym3FaUni!p3y0Qb)367EQ62meBQcs0z^D$ zjnOP06RO(m*a55OQqk|+b+L^)@*Ff+k^h)>Xbuk=eZyI_R~OGJL>a#i;Wd~#hBbrp z=%zXAzY5b)h*dW7gI)BPDM^7B;j)m=*6RS+f351Q-dcZvqco5-J)jpc#R#|`uAkIjddz~ zjb1!OIDb=~~My_DnQnI!kIEd}IyMj`zE~F$}!F!aa zLYsB@rUo8-%8PJ14Ixrv?>~Ee=XU>{QvUyAem`fl|6_jtuFyxVDCTKHvRHqlE2JfptwqTxtS57ghnukX$!t!-pDdAhXz9KgeHD2(IW%%oikn>;uuY=^v8?5oRs~yLT%~LT9NE@(PDw1Iy$YX4`pzVX z`<;4UHZ@2hjA6Z_d`eK#GRcg-^DNbxYbP`0vpEgtH0g;K4_4Pfox@P@CuiHR1T|9% zq?>$=_pQ-hfeX1E|43eaAsxiT8m_%u0hMAT=zTZ%b`aOWGdqwl^3L%8 zVG#VVSN*5opTxWm0{;8`*J8<^L-1>N`sVK7%$x{{u&o6WoTx5< zLwv7=O*872L3p~&N3V($k74e4^X7Nt@y;vciz8L2*K5W@F_AYMtO<@8IqpHGD58s@ zQxb!whDr~4YT3aSQ`eG6QV?O4LW7nr+taYTfF>iyMIAk5LL^sT8LXij*IkQxoV<%k z^<~yuWSn!a?L4>uII2!EY1c2=kq5F|6Bc z$Ve_n1$k*$c@BLNqt=SVFIjK1zyDdQ@Ov#g;;;wd%>QpVepy3lY}FsdZ6S9;8(n6$ z=|&WJ>5jtR>?}RK0k$Z@IaipHIO%M4O)1Bws&YtP&!48Y+@N)|DI<5xvMeBmDCcuZ!yzgMp-a!VTJF(t(q> z_BF_B(bFaGte;A+Y{6i-^SPqIF3nE_GZs|FTu(NwOty+xk^=Y59b9r;)-sDv1DaZ^ z4xTmCXAC1O8u`BLF~D{Ma8SSOlF*vz1}zegBAVq+o4byFXxtKY zuDHuH6QEib0+iADP1M~p!BN=VTHo3dcwX90k-%KMiE^^bQ<%}^86cvGgu@BJwN7<3 zv|!D!=N^0oczoLF6RI0ovS}k|zX?G$v(gVhb3W@iK#&0if1p!bGRQ7|9eA}Serzt| zU6tvtGK=4FTu0Gb(ZTPghPoD6WT$1CF$np)Y)cMg8!Tv?J*nXeM?ni#fv)Ok7ZAwA zu_uzpE(+JzD-Fggx|>{)dV9V%+-Snh+Jxi55WDw!%V2N_b$) zP;?M4*lo*rE$N6}LZ9XA=?K>lGo3-a0HSd8x?vsu=VK-6lVvVGzpEbGyQ@d|yXxuN z+5Mm9y<6*FS9*e=?aF(l)j9AjKFk_6r-mYoe`&Ua)^%kzV7!Z|T)yGAdXlixxRV9- zdc}n$IA9mz2fR!^df9n>l4EVcq{xp(hf(xp+$h`bzP=aQM!lO-X01I!>9M_a zuaBbEIq(=uM8=feii8&Xl}Yh3(X^99Lv)m%V`%Lxb_5@btHEsB+%2c7KJGny?j_El zu4;u)&cZ-mzH2DEIyDm(w=~uG3fh^mjQ=!%*Vf>%g?%DjfN%f0;KHN!7=%3sAUl`6D2-lSm z8l=#fHiw&M;Z4UI&V*W7*n!jV^Y}u)K)Yl50=Knl?rvN0L()DZ2adJ|5x$*?>N5~KmS64d(Zg(pGu*BNBO;C^cM;~!at(? zbkx72{9ae~3x)qZQ}?|uzgL<44)A-v`Y!+qoSy){ve$oa`g>aNucpsVL+N%(h! z-!p4}Aw0c@Z|}YTcL@9U;PrQu-?I^ap}@ZLfOnLCZJ7O$qWE*a?01CUa|V7P6jA>j z;a3L1?+Cw>wOA5x8Gc^^`-Sk0^X~{h#rQ{r z|5*$NzP}^Mtk&fL>t$fd2|t ofA7nG1{r^E&M)@2=6{783euqOtN{Q3?fq{a6aWB9@{f1_55Fhxn*aa+ literal 0 HcmV?d00001 diff --git a/tests/test_msexcel.py b/tests/test_msexcel.py new file mode 100644 index 00000000..15122313 --- /dev/null +++ b/tests/test_msexcel.py @@ -0,0 +1,77 @@ +import json +import os +from pathlib import Path + +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + SectionHeaderItem, +) +from docling.document_converter import DocumentConverter + +GENERATE = True + + +def get_xlsx_paths(): + + # Define the directory you want to search + directory = Path("./tests/data/xlsx/") + + # List all PDF files in the directory and its subdirectories + pdf_files = sorted(directory.rglob("*.xlsx")) + return pdf_files + + +def get_converter(): + + converter = DocumentConverter(allowed_formats=[InputFormat.XLSX]) + + return converter + + +def verify_export(pred_text: str, gtfile: str): + + if not os.path.exists(gtfile) or GENERATE: + with open(gtfile, "w") as fw: + fw.write(pred_text) + + return True + + else: + with open(gtfile, "r") as fr: + true_text = fr.read() + + assert pred_text == true_text, "pred_itxt==true_itxt" + return pred_text == true_text + + +def test_e2e_xlsx_conversions(): + + xlsx_paths = get_xlsx_paths() + converter = get_converter() + + for xlsx_path in xlsx_paths: + # print(f"converting {xlsx_path}") + + gt_path = ( + xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name + ) + + conv_result: ConversionResult = converter.convert(xlsx_path) + + doc: DoclingDocument = conv_result.document + + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export( + pred_itxt, str(gt_path) + ".itxt" + ), "export to indented-text" + + pred_json: str = json.dumps(doc.export_to_dict(), indent=2) + assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"