mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
* feat: Switch default layout model to DOCLING_LAYOUT_HERON. Update the unit test data. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Use default layout model in model_downloader default args Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use default layout model in model_downloader default args Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update docling-models tag for TableFormer Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT (from linux CPU) Signed-off-by: Ubuntu <ubuntu@ip-172-31-30-253.eu-central-1.compute.internal> * fix: Ensure that the visualisations happen on copies of the page image Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Pinpoint docling-ibm-models to the fix branch for the ReadingOrderPredictor Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Update uv.lock Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Update tests GT to match the Heron layout model and the improved reading order model in Linux Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Introduce the verify_doctags optional parameter in conversion tests to control if a doctags comparison should take place. Skip doctags comparisons for certain tests. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Generate tests GT on Mac Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Remove the pinning of the docling-ibm-models and use the release 3.9.1 Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Ubuntu <ubuntu@ip-172-31-30-253.eu-central-1.compute.internal> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-30-253.eu-central-1.compute.internal>
394 lines
14 KiB
JSON
Vendored
394 lines
14 KiB
JSON
Vendored
{
|
|
"schema_name": "DoclingDocument",
|
|
"version": "1.6.0",
|
|
"name": "picture_classification",
|
|
"origin": {
|
|
"mimetype": "application/pdf",
|
|
"binary_hash": 6445357065749877499,
|
|
"filename": "picture_classification.pdf"
|
|
},
|
|
"furniture": {
|
|
"self_ref": "#/furniture",
|
|
"children": [],
|
|
"content_layer": "furniture",
|
|
"name": "_root_",
|
|
"label": "unspecified"
|
|
},
|
|
"body": {
|
|
"self_ref": "#/body",
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/0"
|
|
},
|
|
{
|
|
"$ref": "#/texts/1"
|
|
},
|
|
{
|
|
"$ref": "#/pictures/0"
|
|
},
|
|
{
|
|
"$ref": "#/texts/3"
|
|
},
|
|
{
|
|
"$ref": "#/texts/4"
|
|
},
|
|
{
|
|
"$ref": "#/texts/5"
|
|
},
|
|
{
|
|
"$ref": "#/pictures/1"
|
|
},
|
|
{
|
|
"$ref": "#/texts/7"
|
|
},
|
|
{
|
|
"$ref": "#/texts/8"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"name": "_root_",
|
|
"label": "unspecified"
|
|
},
|
|
"groups": [],
|
|
"texts": [
|
|
{
|
|
"self_ref": "#/texts/0",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "section_header",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 133.77,
|
|
"t": 667.08,
|
|
"r": 252.35,
|
|
"b": 654.48,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
15
|
|
]
|
|
}
|
|
],
|
|
"orig": "Figures Example",
|
|
"text": "Figures Example",
|
|
"level": 1
|
|
},
|
|
{
|
|
"self_ref": "#/texts/1",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "text",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 133.77,
|
|
"t": 642.22,
|
|
"r": 477.48,
|
|
"b": 502.0,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
887
|
|
]
|
|
}
|
|
],
|
|
"orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.",
|
|
"text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/2",
|
|
"parent": {
|
|
"$ref": "#/pictures/0"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "caption",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 226.89,
|
|
"t": 262.75,
|
|
"r": 384.36,
|
|
"b": 254.05,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
35
|
|
]
|
|
}
|
|
],
|
|
"orig": "Figure 1: This is an example image.",
|
|
"text": "Figure 1: This is an example image."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/3",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "text",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 133.77,
|
|
"t": 238.85,
|
|
"r": 477.48,
|
|
"b": 122.54,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
747
|
|
]
|
|
}
|
|
],
|
|
"orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.",
|
|
"text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/4",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "furniture",
|
|
"label": "page_footer",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 303.13,
|
|
"t": 96.17,
|
|
"r": 308.11,
|
|
"b": 87.46,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
1
|
|
]
|
|
}
|
|
],
|
|
"orig": "1",
|
|
"text": "1"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/5",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "text",
|
|
"prov": [
|
|
{
|
|
"page_no": 2,
|
|
"bbox": {
|
|
"l": 133.77,
|
|
"t": 664.04,
|
|
"r": 477.48,
|
|
"b": 523.83,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
887
|
|
]
|
|
}
|
|
],
|
|
"orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.",
|
|
"text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/6",
|
|
"parent": {
|
|
"$ref": "#/pictures/1"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "caption",
|
|
"prov": [
|
|
{
|
|
"page_no": 2,
|
|
"bbox": {
|
|
"l": 226.89,
|
|
"t": 268.68,
|
|
"r": 384.36,
|
|
"b": 259.97,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
35
|
|
]
|
|
}
|
|
],
|
|
"orig": "Figure 2: This is an example image.",
|
|
"text": "Figure 2: This is an example image."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/7",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "text",
|
|
"prov": [
|
|
{
|
|
"page_no": 2,
|
|
"bbox": {
|
|
"l": 133.77,
|
|
"t": 245.61,
|
|
"r": 477.48,
|
|
"b": 117.35,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
804
|
|
]
|
|
}
|
|
],
|
|
"orig": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.",
|
|
"text": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum."
|
|
},
|
|
{
|
|
"self_ref": "#/texts/8",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "furniture",
|
|
"label": "page_footer",
|
|
"prov": [
|
|
{
|
|
"page_no": 2,
|
|
"bbox": {
|
|
"l": 303.13,
|
|
"t": 96.17,
|
|
"r": 308.11,
|
|
"b": 87.46,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
1
|
|
]
|
|
}
|
|
],
|
|
"orig": "2",
|
|
"text": "2"
|
|
}
|
|
],
|
|
"pictures": [
|
|
{
|
|
"self_ref": "#/pictures/0",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/2"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"label": "picture",
|
|
"prov": [
|
|
{
|
|
"page_no": 1,
|
|
"bbox": {
|
|
"l": 134.71,
|
|
"t": 487.7,
|
|
"r": 475.67,
|
|
"b": 282.37,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
0
|
|
]
|
|
}
|
|
],
|
|
"captions": [
|
|
{
|
|
"$ref": "#/texts/2"
|
|
}
|
|
],
|
|
"references": [],
|
|
"footnotes": [],
|
|
"annotations": []
|
|
},
|
|
{
|
|
"self_ref": "#/pictures/1",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/6"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"label": "picture",
|
|
"prov": [
|
|
{
|
|
"page_no": 2,
|
|
"bbox": {
|
|
"l": 218.88,
|
|
"t": 514.02,
|
|
"r": 392.18,
|
|
"b": 283.26,
|
|
"coord_origin": "BOTTOMLEFT"
|
|
},
|
|
"charspan": [
|
|
0,
|
|
0
|
|
]
|
|
}
|
|
],
|
|
"captions": [
|
|
{
|
|
"$ref": "#/texts/6"
|
|
}
|
|
],
|
|
"references": [],
|
|
"footnotes": [],
|
|
"annotations": []
|
|
}
|
|
],
|
|
"tables": [],
|
|
"key_value_items": [],
|
|
"form_items": [],
|
|
"pages": {
|
|
"1": {
|
|
"size": {
|
|
"width": 612.0,
|
|
"height": 792.0
|
|
},
|
|
"page_no": 1
|
|
},
|
|
"2": {
|
|
"size": {
|
|
"width": 612.0,
|
|
"height": 792.0
|
|
},
|
|
"page_no": 2
|
|
}
|
|
}
|
|
} |