docling/tests/data/groundtruth/docling_v2/example_03.html.json
Peter W. J. Staar e25d557c06
refactor: add the contentlayer to html-backend (#1040)
* added the contentlayer to html-backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the handle_image function

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted code of html backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* test(html): add more info if a test case fails

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* refactor(html): put parsed item in body if doc has no header

In case an HTML does not have any header tag, all parsed items are placed in
DoclingDocument's body content layer.
HTML paragraphs ('p' tags) are parsed as text items with paragraph label.
Update test ground truth accoring to the changes above.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: set TextItem label to 'text' instead of 'paragraph'

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-03-02 10:37:53 -05:00

646 lines
16 KiB
JSON

{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"name": "example_03",
"origin": {
"mimetype": "text/html",
"binary_hash": 17768514429310008971,
"filename": "example_03.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/3"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/5"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/3"
},
"children": [
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/texts/9"
},
"children": [
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/13"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Example Document",
"text": "Example Document"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Introduction",
"text": "Introduction",
"level": 2
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Background",
"text": "Background",
"level": 2
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/texts/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First item in unordered list",
"text": "First item in unordered list",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested item 1",
"text": "Nested item 1",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested item 2",
"text": "Nested item 2",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second item in unordered list",
"text": "Second item in unordered list",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First item in ordered list",
"text": "First item in ordered list",
"enumerated": true,
"marker": "1"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested ordered item 1",
"text": "Nested ordered item 1",
"enumerated": true,
"marker": "1."
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested ordered item 2",
"text": "Nested ordered item 2",
"enumerated": true,
"marker": "2."
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second item in ordered list",
"text": "Second item in ordered list",
"enumerated": true,
"marker": "2."
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Data Table",
"text": "Data Table",
"level": 2
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/13"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Header 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Header 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Header 3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 4,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Header 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Header 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Header 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
}
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}