mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 05:08:14 +00:00
feat(ocr): auto-detect rotated pages in Tesseract (#1167)
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding bow rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid to swallow tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
@@ -1,2 +1,2 @@
|
||||
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
</doctag>
|
||||
@@ -42,10 +42,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 69.6796630536824,
|
||||
"l": 70.90211866351085,
|
||||
"t": 764.9216921155637,
|
||||
"r": 504.8720051760782,
|
||||
"b": 689.0124221922704,
|
||||
"r": 504.8720079864275,
|
||||
"b": 689.216658542347,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
|
||||
@@ -40,14 +40,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"r_x0": 70.90211866351085,
|
||||
"r_y0": 124.83139551297342,
|
||||
"r_x1": 504.8720079864275,
|
||||
"r_y1": 124.83139551297342,
|
||||
"r_x2": 504.8720079864275,
|
||||
"r_y2": 102.66666671251768,
|
||||
"r_x3": 70.90211866351085,
|
||||
"r_y3": 102.66666671251768,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
@@ -65,14 +65,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"r_x0": 73.10852522817731,
|
||||
"r_y0": 152.70503335218433,
|
||||
"r_x1": 153.04479435252625,
|
||||
"r_y1": 152.70503335218433,
|
||||
"r_x2": 153.04479435252625,
|
||||
"r_y2": 130.00136157890958,
|
||||
"r_x3": 73.10852522817731,
|
||||
"r_y3": 130.00136157890958,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
@@ -90,13 +90,13 @@
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 69.6796630536824,
|
||||
"l": 70.90211866351085,
|
||||
"t": 76.99999977896756,
|
||||
"r": 504.8720051760782,
|
||||
"b": 152.90926970226084,
|
||||
"r": 504.8720079864275,
|
||||
"b": 152.70503335218433,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9715732336044312,
|
||||
"confidence": 0.9715733528137207,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@@ -132,14 +132,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"r_x0": 70.90211866351085,
|
||||
"r_y0": 124.83139551297342,
|
||||
"r_x1": 504.8720079864275,
|
||||
"r_y1": 124.83139551297342,
|
||||
"r_x2": 504.8720079864275,
|
||||
"r_y2": 102.66666671251768,
|
||||
"r_x3": 70.90211866351085,
|
||||
"r_y3": 102.66666671251768,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
@@ -157,14 +157,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"r_x0": 73.10852522817731,
|
||||
"r_y0": 152.70503335218433,
|
||||
"r_x1": 153.04479435252625,
|
||||
"r_y1": 152.70503335218433,
|
||||
"r_x2": 153.04479435252625,
|
||||
"r_y2": 130.00136157890958,
|
||||
"r_x3": 73.10852522817731,
|
||||
"r_y3": 130.00136157890958,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
@@ -195,13 +195,13 @@
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 69.6796630536824,
|
||||
"l": 70.90211866351085,
|
||||
"t": 76.99999977896756,
|
||||
"r": 504.8720051760782,
|
||||
"b": 152.90926970226084,
|
||||
"r": 504.8720079864275,
|
||||
"b": 152.70503335218433,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9715732336044312,
|
||||
"confidence": 0.9715733528137207,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@@ -237,14 +237,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"r_x0": 70.90211866351085,
|
||||
"r_y0": 124.83139551297342,
|
||||
"r_x1": 504.8720079864275,
|
||||
"r_y1": 124.83139551297342,
|
||||
"r_x2": 504.8720079864275,
|
||||
"r_y2": 102.66666671251768,
|
||||
"r_x3": 70.90211866351085,
|
||||
"r_y3": 102.66666671251768,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
@@ -262,14 +262,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"r_x0": 73.10852522817731,
|
||||
"r_y0": 152.70503335218433,
|
||||
"r_x1": 153.04479435252625,
|
||||
"r_y1": 152.70503335218433,
|
||||
"r_x2": 153.04479435252625,
|
||||
"r_y2": 130.00136157890958,
|
||||
"r_x3": 73.10852522817731,
|
||||
"r_y3": 130.00136157890958,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
@@ -293,13 +293,13 @@
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 69.6796630536824,
|
||||
"l": 70.90211866351085,
|
||||
"t": 76.99999977896756,
|
||||
"r": 504.8720051760782,
|
||||
"b": 152.90926970226084,
|
||||
"r": 504.8720079864275,
|
||||
"b": 152.70503335218433,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9715732336044312,
|
||||
"confidence": 0.9715733528137207,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@@ -335,14 +335,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"r_x0": 70.90211866351085,
|
||||
"r_y0": 124.83139551297342,
|
||||
"r_x1": 504.8720079864275,
|
||||
"r_y1": 124.83139551297342,
|
||||
"r_x2": 504.8720079864275,
|
||||
"r_y2": 102.66666671251768,
|
||||
"r_x3": 70.90211866351085,
|
||||
"r_y3": 102.66666671251768,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
@@ -360,14 +360,14 @@
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"r_x0": 73.10852522817731,
|
||||
"r_y0": 152.70503335218433,
|
||||
"r_x1": 153.04479435252625,
|
||||
"r_y1": 152.70503335218433,
|
||||
"r_x2": 153.04479435252625,
|
||||
"r_y2": 130.00136157890958,
|
||||
"r_x3": 73.10852522817731,
|
||||
"r_y3": 130.00136157890958,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
<doctag><text><loc_371><loc_410><loc_438><loc_422>package</text>
|
||||
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||
</doctag>
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "ocr_test_rotated_180",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 2530576989861832966,
|
||||
"filename": "ocr_test_rotated_180.pdf",
|
||||
"uri": null
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"parent": null,
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"parent": null,
|
||||
"children": [
|
||||
{
|
||||
"cref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"cref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 441.304584329099,
|
||||
"t": 151.67751306395223,
|
||||
"r": 521.9863114205704,
|
||||
"b": 132.09610360960653,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 89.12133215549848,
|
||||
"t": 124.86176457554109,
|
||||
"r": 523.3501733013318,
|
||||
"b": 77.02339849621205,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"image": null,
|
||||
"page_no": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
package
|
||||
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
||||
@@ -0,0 +1,445 @@
|
||||
[
|
||||
{
|
||||
"page_no": 0,
|
||||
"size": {
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 90.46133071208328,
|
||||
"r_y0": 764.8982933983192,
|
||||
"r_x1": 520.7638616365624,
|
||||
"r_y1": 764.8982933983192,
|
||||
"r_x2": 520.7638616365624,
|
||||
"r_y2": 744.0929853742306,
|
||||
"r_x3": 90.46133071208328,
|
||||
"r_y3": 744.0929853742306,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.12133215549848,
|
||||
"r_y0": 741.5247710689902,
|
||||
"r_x1": 523.3501733013318,
|
||||
"r_y1": 741.5247710689902,
|
||||
"r_x2": 523.3501733013318,
|
||||
"r_y2": 717.0599273189902,
|
||||
"r_x3": 89.12133215549848,
|
||||
"r_y3": 717.0599273189902,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.304584329099,
|
||||
"r_y0": 709.8255882849247,
|
||||
"r_x1": 521.9863114205704,
|
||||
"r_y1": 709.8255882849247,
|
||||
"r_x2": 521.9863114205704,
|
||||
"r_y2": 690.244178830579,
|
||||
"r_x3": 441.304584329099,
|
||||
"r_y3": 690.244178830579,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
{
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 89.12133215549848,
|
||||
"t": 717.0599273189902,
|
||||
"r": 523.3501733013318,
|
||||
"b": 764.8982933983192,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.7318570613861084,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 90.46133071208328,
|
||||
"r_y0": 764.8982933983192,
|
||||
"r_x1": 520.7638616365624,
|
||||
"r_y1": 764.8982933983192,
|
||||
"r_x2": 520.7638616365624,
|
||||
"r_y2": 744.0929853742306,
|
||||
"r_x3": 90.46133071208328,
|
||||
"r_y3": 744.0929853742306,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.12133215549848,
|
||||
"r_y0": 741.5247710689902,
|
||||
"r_x1": 523.3501733013318,
|
||||
"r_y1": 741.5247710689902,
|
||||
"r_x2": 523.3501733013318,
|
||||
"r_y2": 717.0599273189902,
|
||||
"r_x3": 89.12133215549848,
|
||||
"r_y3": 717.0599273189902,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 441.304584329099,
|
||||
"t": 690.244178830579,
|
||||
"r": 521.9863114205704,
|
||||
"b": 709.8255882849247,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5982133150100708,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.304584329099,
|
||||
"r_y0": 709.8255882849247,
|
||||
"r_x1": 521.9863114205704,
|
||||
"r_y1": 709.8255882849247,
|
||||
"r_x2": 521.9863114205704,
|
||||
"r_y2": 690.244178830579,
|
||||
"r_x3": 441.304584329099,
|
||||
"r_y3": 690.244178830579,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"tablestructure": {
|
||||
"table_map": {}
|
||||
},
|
||||
"figures_classification": null,
|
||||
"equations_prediction": null,
|
||||
"vlm_response": null
|
||||
},
|
||||
"assembled": {
|
||||
"elements": [
|
||||
{
|
||||
"label": "text",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 89.12133215549848,
|
||||
"t": 717.0599273189902,
|
||||
"r": 523.3501733013318,
|
||||
"b": 764.8982933983192,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.7318570613861084,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 90.46133071208328,
|
||||
"r_y0": 764.8982933983192,
|
||||
"r_x1": 520.7638616365624,
|
||||
"r_y1": 764.8982933983192,
|
||||
"r_x2": 520.7638616365624,
|
||||
"r_y2": 744.0929853742306,
|
||||
"r_x3": 90.46133071208328,
|
||||
"r_y3": 744.0929853742306,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.12133215549848,
|
||||
"r_y0": 741.5247710689902,
|
||||
"r_x1": 523.3501733013318,
|
||||
"r_y1": 741.5247710689902,
|
||||
"r_x2": 523.3501733013318,
|
||||
"r_y2": 717.0599273189902,
|
||||
"r_x3": 89.12133215549848,
|
||||
"r_y3": 717.0599273189902,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"label": "text",
|
||||
"id": 2,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 2,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 441.304584329099,
|
||||
"t": 690.244178830579,
|
||||
"r": 521.9863114205704,
|
||||
"b": 709.8255882849247,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5982133150100708,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.304584329099,
|
||||
"r_y0": 709.8255882849247,
|
||||
"r_x1": 521.9863114205704,
|
||||
"r_y1": 709.8255882849247,
|
||||
"r_x2": 521.9863114205704,
|
||||
"r_y2": 690.244178830579,
|
||||
"r_x3": 441.304584329099,
|
||||
"r_y3": 690.244178830579,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"body": [
|
||||
{
|
||||
"label": "text",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 89.12133215549848,
|
||||
"t": 717.0599273189902,
|
||||
"r": 523.3501733013318,
|
||||
"b": 764.8982933983192,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.7318570613861084,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 90.46133071208328,
|
||||
"r_y0": 764.8982933983192,
|
||||
"r_x1": 520.7638616365624,
|
||||
"r_y1": 764.8982933983192,
|
||||
"r_x2": 520.7638616365624,
|
||||
"r_y2": 744.0929853742306,
|
||||
"r_x3": 90.46133071208328,
|
||||
"r_y3": 744.0929853742306,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.12133215549848,
|
||||
"r_y0": 741.5247710689902,
|
||||
"r_x1": 523.3501733013318,
|
||||
"r_y1": 741.5247710689902,
|
||||
"r_x2": 523.3501733013318,
|
||||
"r_y2": 717.0599273189902,
|
||||
"r_x3": 89.12133215549848,
|
||||
"r_y3": 717.0599273189902,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"label": "text",
|
||||
"id": 2,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 2,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 441.304584329099,
|
||||
"t": 690.244178830579,
|
||||
"r": 521.9863114205704,
|
||||
"b": 709.8255882849247,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5982133150100708,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.304584329099,
|
||||
"r_y0": 709.8255882849247,
|
||||
"r_x1": 521.9863114205704,
|
||||
"r_y1": 709.8255882849247,
|
||||
"r_x2": 521.9863114205704,
|
||||
"r_y2": 690.244178830579,
|
||||
"r_x3": 441.304584329099,
|
||||
"r_y3": 690.244178830579,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"headers": []
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -0,0 +1,3 @@
|
||||
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_411><loc_61><loc_422><loc_128>package</text>
|
||||
</doctag>
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "ocr_test_rotated_270",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 10890858393843077593,
|
||||
"filename": "ocr_test_rotated_270.pdf",
|
||||
"uri": null
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"parent": null,
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"parent": null,
|
||||
"children": [
|
||||
{
|
||||
"cref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"cref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "page_header",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 717.1685859527342,
|
||||
"t": 524.2990548540179,
|
||||
"r": 764.8982839673505,
|
||||
"b": 90.32916553110118,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 691.4680194659409,
|
||||
"t": 523.0765988200898,
|
||||
"r": 709.8255850278712,
|
||||
"b": 442.3948768148814,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"image": null,
|
||||
"page_no": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
package
|
||||
@@ -0,0 +1,446 @@
|
||||
[
|
||||
{
|
||||
"page_no": 0,
|
||||
"size": {
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.1685859527342,
|
||||
"r_y0": 504.8720063438988,
|
||||
"r_x1": 737.9738558298501,
|
||||
"r_y1": 504.8720063438988,
|
||||
"r_x2": 737.9738558298501,
|
||||
"r_y2": 70.90211702098213,
|
||||
"r_x3": 717.1685859527342,
|
||||
"r_y3": 70.90211702098213,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 691.4680194659409,
|
||||
"r_y0": 152.80629506011857,
|
||||
"r_x1": 709.8255850278712,
|
||||
"r_y1": 152.80629506011857,
|
||||
"r_x2": 709.8255850278712,
|
||||
"r_y2": 72.12457305491027,
|
||||
"r_x3": 691.4680194659409,
|
||||
"r_y3": 72.12457305491027,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
{
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 717.1685859527342,
|
||||
"t": 70.90211702098213,
|
||||
"r": 764.8982839673505,
|
||||
"b": 504.8720063438988,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6915205121040344,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.1685859527342,
|
||||
"r_y0": 504.8720063438988,
|
||||
"r_x1": 737.9738558298501,
|
||||
"r_y1": 504.8720063438988,
|
||||
"r_x2": 737.9738558298501,
|
||||
"r_y2": 70.90211702098213,
|
||||
"r_x3": 717.1685859527342,
|
||||
"r_y3": 70.90211702098213,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 691.4680194659409,
|
||||
"t": 72.12457305491027,
|
||||
"r": 709.8255850278712,
|
||||
"b": 152.80629506011857,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 1.0,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 691.4680194659409,
|
||||
"r_y0": 152.80629506011857,
|
||||
"r_x1": 709.8255850278712,
|
||||
"r_y1": 152.80629506011857,
|
||||
"r_x2": 709.8255850278712,
|
||||
"r_y2": 72.12457305491027,
|
||||
"r_x3": 691.4680194659409,
|
||||
"r_y3": 72.12457305491027,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"tablestructure": {
|
||||
"table_map": {}
|
||||
},
|
||||
"figures_classification": null,
|
||||
"equations_prediction": null,
|
||||
"vlm_response": null
|
||||
},
|
||||
"assembled": {
|
||||
"elements": [
|
||||
{
|
||||
"label": "page_header",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 717.1685859527342,
|
||||
"t": 70.90211702098213,
|
||||
"r": 764.8982839673505,
|
||||
"b": 504.8720063438988,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6915205121040344,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.1685859527342,
|
||||
"r_y0": 504.8720063438988,
|
||||
"r_x1": 737.9738558298501,
|
||||
"r_y1": 504.8720063438988,
|
||||
"r_x2": 737.9738558298501,
|
||||
"r_y2": 70.90211702098213,
|
||||
"r_x3": 717.1685859527342,
|
||||
"r_y3": 70.90211702098213,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"label": "text",
|
||||
"id": 8,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 8,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 691.4680194659409,
|
||||
"t": 72.12457305491027,
|
||||
"r": 709.8255850278712,
|
||||
"b": 152.80629506011857,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 1.0,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 691.4680194659409,
|
||||
"r_y0": 152.80629506011857,
|
||||
"r_x1": 709.8255850278712,
|
||||
"r_y1": 152.80629506011857,
|
||||
"r_x2": 709.8255850278712,
|
||||
"r_y2": 72.12457305491027,
|
||||
"r_x3": 691.4680194659409,
|
||||
"r_y3": 72.12457305491027,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"body": [
|
||||
{
|
||||
"label": "text",
|
||||
"id": 8,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 8,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 691.4680194659409,
|
||||
"t": 72.12457305491027,
|
||||
"r": 709.8255850278712,
|
||||
"b": 152.80629506011857,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 1.0,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 691.4680194659409,
|
||||
"r_y0": 152.80629506011857,
|
||||
"r_x1": 709.8255850278712,
|
||||
"r_y1": 152.80629506011857,
|
||||
"r_x2": 709.8255850278712,
|
||||
"r_y2": 72.12457305491027,
|
||||
"r_x3": 691.4680194659409,
|
||||
"r_y3": 72.12457305491027,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"headers": [
|
||||
{
|
||||
"label": "page_header",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 717.1685859527342,
|
||||
"t": 70.90211702098213,
|
||||
"r": 764.8982839673505,
|
||||
"b": 504.8720063438988,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6915205121040344,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.1685859527342,
|
||||
"r_y0": 504.8720063438988,
|
||||
"r_x1": 737.9738558298501,
|
||||
"r_y1": 504.8720063438988,
|
||||
"r_x2": 737.9738558298501,
|
||||
"r_y2": 70.90211702098213,
|
||||
"r_x3": 717.1685859527342,
|
||||
"r_y3": 70.90211702098213,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -0,0 +1,3 @@
|
||||
<doctag><page_header><loc_46><loc_75><loc_74><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_78><loc_370><loc_90><loc_438>package</text>
|
||||
</doctag>
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "ocr_test_rotated_90",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 6989291015361162334,
|
||||
"filename": "ocr_test_rotated_90.pdf",
|
||||
"uri": null
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"parent": null,
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"parent": null,
|
||||
"children": [
|
||||
{
|
||||
"cref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"cref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "page_header",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 77.10171546422428,
|
||||
"t": 506.07735421856773,
|
||||
"r": 124.91101654503161,
|
||||
"b": 71.88562244773436,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 131.21306574279092,
|
||||
"t": 154.19400205373182,
|
||||
"r": 152.19606490864376,
|
||||
"b": 74.12495603322407,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"image": null,
|
||||
"page_no": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
package
|
||||
@@ -0,0 +1,446 @@
|
||||
[
|
||||
{
|
||||
"page_no": 0,
|
||||
"size": {
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171546422428,
|
||||
"r_y0": 520.7638577050515,
|
||||
"r_x1": 96.6831586150625,
|
||||
"r_y1": 520.7638577050515,
|
||||
"r_x2": 96.6831586150625,
|
||||
"r_y2": 89.23887398109309,
|
||||
"r_x3": 77.10171546422428,
|
||||
"r_y3": 89.23887398109309,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.55299576256091,
|
||||
"r_y0": 523.3155494272656,
|
||||
"r_x1": 124.91101654503161,
|
||||
"r_y1": 523.3155494272656,
|
||||
"r_x2": 124.91101654503161,
|
||||
"r_y2": 89.12381765643227,
|
||||
"r_x3": 100.55299576256091,
|
||||
"r_y3": 89.12381765643227,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
{
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 77.10171546422428,
|
||||
"t": 89.12381765643227,
|
||||
"r": 124.91101654503161,
|
||||
"b": 523.3155494272656,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6016772389411926,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171546422428,
|
||||
"r_y0": 520.7638577050515,
|
||||
"r_x1": 96.6831586150625,
|
||||
"r_y1": 520.7638577050515,
|
||||
"r_x2": 96.6831586150625,
|
||||
"r_y2": 89.23887398109309,
|
||||
"r_x3": 77.10171546422428,
|
||||
"r_y3": 89.23887398109309,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.55299576256091,
|
||||
"r_y0": 523.3155494272656,
|
||||
"r_x1": 124.91101654503161,
|
||||
"r_y1": 523.3155494272656,
|
||||
"r_x2": 124.91101654503161,
|
||||
"r_y2": 89.12381765643227,
|
||||
"r_x3": 100.55299576256091,
|
||||
"r_y3": 89.12381765643227,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 131.21306574279092,
|
||||
"t": 441.0071698212682,
|
||||
"r": 152.19606490864376,
|
||||
"b": 521.0762158417759,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5234212875366211,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"tablestructure": {
|
||||
"table_map": {}
|
||||
},
|
||||
"figures_classification": null,
|
||||
"equations_prediction": null,
|
||||
"vlm_response": null
|
||||
},
|
||||
"assembled": {
|
||||
"elements": [
|
||||
{
|
||||
"label": "page_header",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 77.10171546422428,
|
||||
"t": 89.12381765643227,
|
||||
"r": 124.91101654503161,
|
||||
"b": 523.3155494272656,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6016772389411926,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171546422428,
|
||||
"r_y0": 520.7638577050515,
|
||||
"r_x1": 96.6831586150625,
|
||||
"r_y1": 520.7638577050515,
|
||||
"r_x2": 96.6831586150625,
|
||||
"r_y2": 89.23887398109309,
|
||||
"r_x3": 77.10171546422428,
|
||||
"r_y3": 89.23887398109309,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.55299576256091,
|
||||
"r_y0": 523.3155494272656,
|
||||
"r_x1": 124.91101654503161,
|
||||
"r_y1": 523.3155494272656,
|
||||
"r_x2": 124.91101654503161,
|
||||
"r_y2": 89.12381765643227,
|
||||
"r_x3": 100.55299576256091,
|
||||
"r_y3": 89.12381765643227,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"label": "text",
|
||||
"id": 1,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 1,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 131.21306574279092,
|
||||
"t": 441.0071698212682,
|
||||
"r": 152.19606490864376,
|
||||
"b": 521.0762158417759,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5234212875366211,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"body": [
|
||||
{
|
||||
"label": "text",
|
||||
"id": 1,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 1,
|
||||
"label": "text",
|
||||
"bbox": {
|
||||
"l": 131.21306574279092,
|
||||
"t": 441.0071698212682,
|
||||
"r": 152.19606490864376,
|
||||
"b": 521.0762158417759,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.5234212875366211,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "package"
|
||||
}
|
||||
],
|
||||
"headers": [
|
||||
{
|
||||
"label": "page_header",
|
||||
"id": 0,
|
||||
"page_no": 0,
|
||||
"cluster": {
|
||||
"id": 0,
|
||||
"label": "page_header",
|
||||
"bbox": {
|
||||
"l": 77.10171546422428,
|
||||
"t": 89.12381765643227,
|
||||
"r": 124.91101654503161,
|
||||
"b": 523.3155494272656,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.6016772389411926,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171546422428,
|
||||
"r_y0": 520.7638577050515,
|
||||
"r_x1": 96.6831586150625,
|
||||
"r_y1": 520.7638577050515,
|
||||
"r_x2": 96.6831586150625,
|
||||
"r_y2": 89.23887398109309,
|
||||
"r_x3": 77.10171546422428,
|
||||
"r_y3": 89.23887398109309,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.55299576256091,
|
||||
"r_y0": 523.3155494272656,
|
||||
"r_x1": 124.91101654503161,
|
||||
"r_y1": 523.3155494272656,
|
||||
"r_x2": 124.91101654503161,
|
||||
"r_y2": 89.12381765643227,
|
||||
"r_x3": 100.55299576256091,
|
||||
"r_y3": 89.12381765643227,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"children": []
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user