docling/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
Clément Doumouro bba05d1c37 fix(layout,table): orientation-aware layout and table detection
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
2025-07-09 17:03:58 +02:00

619 lines
17 KiB
JSON
Vendored

{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"name": "ocr_test",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 3906211175708501508,
"filename": "ocr_test.pdf"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 103.33,
"t": 519.86,
"r": 560.95,
"b": 234.07,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 245.02,
"t": 106.57,
"r": 307.59,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Column 0",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 358.65,
"t": 106.57,
"r": 421.22,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Column 1",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 472.27,
"t": 106.57,
"r": 534.84,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "Column 2",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 123.52,
"t": 174.07,
"r": 200.67,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "this is row 0",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"bbox": {
"l": 241.65,
"t": 174.07,
"r": 310.71,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "some cells",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 347.4,
"t": 174.07,
"r": 431.1,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "have content",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 491.4,
"t": 174.07,
"r": 515.79,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "and",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 130.27,
"t": 242.7,
"r": 194.46,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "and row 1",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"bbox": {
"l": 373.27,
"t": 242.7,
"r": 406.59,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "other",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 486.9,
"t": 242.7,
"r": 518.61,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "have",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 116.77,
"t": 315.82,
"r": 207.76,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "and last row 2",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"bbox": {
"l": 251.77,
"t": 315.82,
"r": 299.73,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "nothing",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 484.65,
"t": 315.82,
"r": 522.85,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "inside",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 4,
"num_cols": 4,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 245.02,
"t": 106.57,
"r": 307.59,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Column 0",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 358.65,
"t": 106.57,
"r": 421.22,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Column 1",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 472.27,
"t": 106.57,
"r": 534.84,
"b": 120.29,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "Column 2",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 123.52,
"t": 174.07,
"r": 200.67,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "this is row 0",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"bbox": {
"l": 241.65,
"t": 174.07,
"r": 310.71,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "some cells",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 347.4,
"t": 174.07,
"r": 431.1,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "have content",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 491.4,
"t": 174.07,
"r": 515.79,
"b": 187.79,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "and",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 130.27,
"t": 242.7,
"r": 194.46,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "and row 1",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 373.27,
"t": 242.7,
"r": 406.59,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "other",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 486.9,
"t": 242.7,
"r": 518.61,
"b": 256.41,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "have",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 116.77,
"t": 315.82,
"r": 207.76,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "and last row 2",
"column_header": false,
"row_header": true,
"row_section": false
},
{
"bbox": {
"l": 251.77,
"t": 315.82,
"r": 299.73,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "nothing",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 484.65,
"t": 315.82,
"r": 522.85,
"b": 329.54,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "inside",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 842.0,
"height": 595.0
},
"page_no": 1
}
}
}