diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 02f8c86d..18b0418c 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -2,7 +2,7 @@ import logging import re from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union import docx from docling_core.types.doc import ( @@ -14,6 +14,8 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docx.oxml.table import CT_Tc +from docx.table import Table, _Cell from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError @@ -449,30 +451,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return def handle_tables(self, element, docx_obj, doc): - - # Function to check if a cell has a colspan (gridSpan) - def get_colspan(cell): - grid_span = cell._element.xpath("@w:gridSpan") - if grid_span: - return int(grid_span[0]) # Return the number of columns spanned - return 1 # Default is 1 (no colspan) - - # Function to check if a cell has a rowspan (vMerge) - def get_rowspan(cell): - v_merge = cell._element.xpath("@w:vMerge") - if v_merge: - return v_merge[ - 0 - ] # 'restart' indicates the beginning of a rowspan, others are continuation - return 1 - - table = docx.table.Table(element, docx_obj) - + table: Table = Table(element, docx_obj) num_rows = len(table.rows) - num_cols = 0 - for row in table.rows: - # Calculate the max number of columns - num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) + num_cols = len(table.columns) + _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") if num_rows == 1 and num_cols == 1: cell_element = table.rows[0].cells[0] @@ -481,52 +463,47 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.walk_linear(cell_element._element, docx_obj, doc) return - # Initialize the table grid - table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] - - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) - + data = TableData(num_rows=num_rows, num_cols=num_cols) + cell_set: set[CT_Tc] = set() for row_idx, row in enumerate(table.rows): + _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") col_idx = 0 - for c, cell in enumerate(row.cells): - row_span = get_rowspan(cell) - col_span = get_colspan(cell) + while col_idx < num_cols: + cell: _Cell = row.cells[col_idx] + _log.debug( + f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" + ) + if cell is None or cell._tc in cell_set: + _log.debug(f" skipped since repeated content") + col_idx += cell.grid_span + continue + else: + cell_set.add(cell._tc) - cell_text = cell.text - # In case cell doesn't return text via docx library: - if len(cell_text) == 0: - cell_xml = cell._element + spanned_idx = row_idx + spanned_tc: Optional[CT_Tc] = cell._tc + while spanned_tc == cell._tc: + spanned_idx += 1 + spanned_tc = ( + table.rows[spanned_idx].cells[col_idx]._tc + if spanned_idx < num_rows + else None + ) + _log.debug(f" spanned before row {spanned_idx}") - texts = [""] - for elem in cell_xml.iter(): - if elem.tag.endswith("t"): # tags that contain text - if elem.text: - texts.append(elem.text) - # Join the collected text - cell_text = " ".join(texts).strip() - - # Find the next available column in the grid - while table_grid[row_idx][col_idx] is not None: - col_idx += 1 - - # Fill the grid with the cell value, considering rowspan and colspan - for i in range(row_span if row_span == "restart" else 1): - for j in range(col_span): - table_grid[row_idx + i][col_idx + j] = "" - - cell = TableCell( - text=cell_text, - row_span=row_span, - col_span=col_span, - start_row_offset_idx=row_idx, - end_row_offset_idx=row_idx + row_span, + table_cell = TableCell( + text=cell.text, + row_span=spanned_idx - row_idx, + col_span=cell.grid_span, + start_row_offset_idx=row.grid_cols_before + row_idx, + end_row_offset_idx=row.grid_cols_before + spanned_idx, start_col_offset_idx=col_idx, - end_col_offset_idx=col_idx + col_span, + end_col_offset_idx=col_idx + cell.grid_span, col_header=False, row_header=False, ) - - data.table_cells.append(cell) + data.table_cells.append(table_cell) + col_idx += cell.grid_span level = self.get_level() doc.add_table(data=data, parent=self.parents[level - 1]) diff --git a/tests/data/docx/word_tables.docx b/tests/data/docx/word_tables.docx new file mode 100644 index 00000000..1513796e Binary files /dev/null and b/tests/data/docx/word_tables.docx differ diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html new file mode 100644 index 00000000..30f6e8d3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.itxt b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt new file mode 100644 index 00000000..dd42eb0a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt @@ -0,0 +1,19 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Test with tables + item-3 at level 3: paragraph: A uniform table + item-4 at level 3: table with [3x3] + item-5 at level 3: paragraph: + item-6 at level 3: paragraph: A non-uniform table with horizontal spans + item-7 at level 3: table with [3x3] + item-8 at level 3: paragraph: + item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns + item-10 at level 3: table with [3x4] + item-11 at level 3: paragraph: + item-12 at level 3: paragraph: A non-uniform table with vertical spans + item-13 at level 3: table with [5x3] + item-14 at level 3: paragraph: + item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells + item-16 at level 3: table with [9x5] + item-17 at level 3: paragraph: + item-18 at level 3: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json new file mode 100644 index 00000000..957a83c8 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -0,0 +1,2356 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "word_tables", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 8379738677198259833, + "filename": "word_tables.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "header-0", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test with tables", + "text": "Test with tables", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A uniform table", + "text": "A uniform table" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans", + "text": "A non-uniform table with horizontal spans" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans in inner columns", + "text": "A non-uniform table with horizontal spans in inner columns" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with vertical spans", + "text": "A non-uniform table with vertical spans" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with all kinds of spans and empty cells", + "text": "A non-uniform table with all kinds of spans and empty cells" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 9, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.md b/tests/data/groundtruth/docling_v2/word_tables.docx.md new file mode 100644 index 00000000..90123c3e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.md @@ -0,0 +1,44 @@ +## Test with tables + +A uniform table + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|--------------|--------------| +| Cell 1.0 | Cell 1.1 | Cell 1.2 | +| Cell 2.0 | Cell 2.1 | Cell 2.2 | + +A non-uniform table with horizontal spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|---------------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | + +A non-uniform table with horizontal spans in inner columns + +| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 | +|--------------|---------------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 | + +A non-uniform table with vertical spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | + +A non-uniform table with all kinds of spans and empty cells + +| Header 0.0 | Header 0.1 | Header 0.2 | | | +|--------------|---------------------|--------------|----|---------------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 | +| | | | | Merged Cell 4.4 5.4 | +| | | | | | +| | | | | | +| | | | | Cell 8.4 | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9edcb3e6..86bd837d 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str): with open(gtfile, "r") as fr: true_text = fr.read() - assert pred_text == true_text, "pred_itxt==true_itxt" return pred_text == true_text @@ -101,3 +100,7 @@ def test_e2e_docx_conversions(): pred_json: str = json.dumps(doc.export_to_dict(), indent=2) assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export(pred_html, str(gt_path) + ".html"), "export to html" diff --git a/word_tables.html b/word_tables.html new file mode 100644 index 00000000..30f6e8d3 --- /dev/null +++ b/word_tables.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file