feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding box rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid swallowing Tesseract errors that cause orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@@ -42,10 +42,10 @@
{
"page_no": 1,
"bbox": {
"l": 69.6796630536824,
"l": 70.90211866351085,
"t": 764.9216921155637,
"r": 504.8720051760782,
"b": 689.0124221922704,
"r": 504.8720079864275,
"b": 689.216658542347,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

View File

@@ -40,14 +40,14 @@
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"r_x0": 70.90211866351085,
"r_y0": 124.83139551297342,
"r_x1": 504.8720079864275,
"r_y1": 124.83139551297342,
"r_x2": 504.8720079864275,
"r_y2": 102.66666671251768,
"r_x3": 70.90211866351085,
"r_y3": 102.66666671251768,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -65,14 +65,14 @@
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"r_x0": 73.10852522817731,
"r_y0": 152.70503335218433,
"r_x1": 153.04479435252625,
"r_y1": 152.70503335218433,
"r_x2": 153.04479435252625,
"r_y2": 130.00136157890958,
"r_x3": 73.10852522817731,
"r_y3": 130.00136157890958,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -90,13 +90,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 69.6796630536824,
"l": 70.90211866351085,
"t": 76.99999977896756,
"r": 504.8720051760782,
"b": 152.90926970226084,
"r": 504.8720079864275,
"b": 152.70503335218433,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9715732336044312,
"confidence": 0.9715733528137207,
"cells": [
{
"index": 0,
@@ -132,14 +132,14 @@
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"r_x0": 70.90211866351085,
"r_y0": 124.83139551297342,
"r_x1": 504.8720079864275,
"r_y1": 124.83139551297342,
"r_x2": 504.8720079864275,
"r_y2": 102.66666671251768,
"r_x3": 70.90211866351085,
"r_y3": 102.66666671251768,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -157,14 +157,14 @@
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"r_x0": 73.10852522817731,
"r_y0": 152.70503335218433,
"r_x1": 153.04479435252625,
"r_y1": 152.70503335218433,
"r_x2": 153.04479435252625,
"r_y2": 130.00136157890958,
"r_x3": 73.10852522817731,
"r_y3": 130.00136157890958,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -195,13 +195,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 69.6796630536824,
"l": 70.90211866351085,
"t": 76.99999977896756,
"r": 504.8720051760782,
"b": 152.90926970226084,
"r": 504.8720079864275,
"b": 152.70503335218433,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9715732336044312,
"confidence": 0.9715733528137207,
"cells": [
{
"index": 0,
@@ -237,14 +237,14 @@
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"r_x0": 70.90211866351085,
"r_y0": 124.83139551297342,
"r_x1": 504.8720079864275,
"r_y1": 124.83139551297342,
"r_x2": 504.8720079864275,
"r_y2": 102.66666671251768,
"r_x3": 70.90211866351085,
"r_y3": 102.66666671251768,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -262,14 +262,14 @@
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"r_x0": 73.10852522817731,
"r_y0": 152.70503335218433,
"r_x1": 153.04479435252625,
"r_y1": 152.70503335218433,
"r_x2": 153.04479435252625,
"r_y2": 130.00136157890958,
"r_x3": 73.10852522817731,
"r_y3": 130.00136157890958,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -293,13 +293,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 69.6796630536824,
"l": 70.90211866351085,
"t": 76.99999977896756,
"r": 504.8720051760782,
"b": 152.90926970226084,
"r": 504.8720079864275,
"b": 152.70503335218433,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9715732336044312,
"confidence": 0.9715733528137207,
"cells": [
{
"index": 0,
@@ -335,14 +335,14 @@
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"r_x0": 70.90211866351085,
"r_y0": 124.83139551297342,
"r_x1": 504.8720079864275,
"r_y1": 124.83139551297342,
"r_x2": 504.8720079864275,
"r_y2": 102.66666671251768,
"r_x3": 70.90211866351085,
"r_y3": 102.66666671251768,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -360,14 +360,14 @@
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"r_x0": 73.10852522817731,
"r_y0": 152.70503335218433,
"r_x1": 153.04479435252625,
"r_y1": 152.70503335218433,
"r_x2": 153.04479435252625,
"r_y2": 130.00136157890958,
"r_x3": 73.10852522817731,
"r_y3": 130.00136157890958,
"coord_origin": "TOPLEFT"
},
"text": "package",

View File

@@ -0,0 +1,3 @@
<doctag><text><loc_371><loc_410><loc_438><loc_422>package</text>
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag>

View File

@@ -0,0 +1,109 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "ocr_test_rotated_180",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 2530576989861832966,
"filename": "ocr_test_rotated_180.pdf",
"uri": null
},
"furniture": {
"self_ref": "#/furniture",
"parent": null,
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"parent": null,
"children": [
{
"cref": "#/texts/0"
},
{
"cref": "#/texts/1"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 441.304584329099,
"t": 151.67751306395223,
"r": 521.9863114205704,
"b": 132.09610360960653,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
]
}
],
"orig": "package",
"text": "package",
"formatting": null,
"hyperlink": null
},
{
"self_ref": "#/texts/1",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 89.12133215549848,
"t": 124.86176457554109,
"r": 523.3501733013318,
"b": 77.02339849621205,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"formatting": null,
"hyperlink": null
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 595.201171875,
"height": 841.9216918945312
},
"image": null,
"page_no": 1
}
}
}

View File

@@ -0,0 +1,3 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained

View File

@@ -0,0 +1,445 @@
[
{
"page_no": 0,
"size": {
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 90.46133071208328,
"r_y0": 764.8982933983192,
"r_x1": 520.7638616365624,
"r_y1": 764.8982933983192,
"r_x2": 520.7638616365624,
"r_y2": 744.0929853742306,
"r_x3": 90.46133071208328,
"r_y3": 744.0929853742306,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.12133215549848,
"r_y0": 741.5247710689902,
"r_x1": 523.3501733013318,
"r_y1": 741.5247710689902,
"r_x2": 523.3501733013318,
"r_y2": 717.0599273189902,
"r_x3": 89.12133215549848,
"r_y3": 717.0599273189902,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.304584329099,
"r_y0": 709.8255882849247,
"r_x1": 521.9863114205704,
"r_y1": 709.8255882849247,
"r_x2": 521.9863114205704,
"r_y2": 690.244178830579,
"r_x3": 441.304584329099,
"r_y3": 690.244178830579,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
"predictions": {
"layout": {
"clusters": [
{
"id": 0,
"label": "text",
"bbox": {
"l": 89.12133215549848,
"t": 717.0599273189902,
"r": 523.3501733013318,
"b": 764.8982933983192,
"coord_origin": "TOPLEFT"
},
"confidence": 0.7318570613861084,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 90.46133071208328,
"r_y0": 764.8982933983192,
"r_x1": 520.7638616365624,
"r_y1": 764.8982933983192,
"r_x2": 520.7638616365624,
"r_y2": 744.0929853742306,
"r_x3": 90.46133071208328,
"r_y3": 744.0929853742306,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.12133215549848,
"r_y0": 741.5247710689902,
"r_x1": 523.3501733013318,
"r_y1": 741.5247710689902,
"r_x2": 523.3501733013318,
"r_y2": 717.0599273189902,
"r_x3": 89.12133215549848,
"r_y3": 717.0599273189902,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
{
"id": 2,
"label": "text",
"bbox": {
"l": 441.304584329099,
"t": 690.244178830579,
"r": 521.9863114205704,
"b": 709.8255882849247,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5982133150100708,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.304584329099,
"r_y0": 709.8255882849247,
"r_x1": 521.9863114205704,
"r_y1": 709.8255882849247,
"r_x2": 521.9863114205704,
"r_y2": 690.244178830579,
"r_x3": 441.304584329099,
"r_y3": 690.244178830579,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
}
]
},
"tablestructure": {
"table_map": {}
},
"figures_classification": null,
"equations_prediction": null,
"vlm_response": null
},
"assembled": {
"elements": [
{
"label": "text",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "text",
"bbox": {
"l": 89.12133215549848,
"t": 717.0599273189902,
"r": 523.3501733013318,
"b": 764.8982933983192,
"coord_origin": "TOPLEFT"
},
"confidence": 0.7318570613861084,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 90.46133071208328,
"r_y0": 764.8982933983192,
"r_x1": 520.7638616365624,
"r_y1": 764.8982933983192,
"r_x2": 520.7638616365624,
"r_y2": 744.0929853742306,
"r_x3": 90.46133071208328,
"r_y3": 744.0929853742306,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.12133215549848,
"r_y0": 741.5247710689902,
"r_x1": 523.3501733013318,
"r_y1": 741.5247710689902,
"r_x2": 523.3501733013318,
"r_y2": 717.0599273189902,
"r_x3": 89.12133215549848,
"r_y3": 717.0599273189902,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"label": "text",
"id": 2,
"page_no": 0,
"cluster": {
"id": 2,
"label": "text",
"bbox": {
"l": 441.304584329099,
"t": 690.244178830579,
"r": 521.9863114205704,
"b": 709.8255882849247,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5982133150100708,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.304584329099,
"r_y0": 709.8255882849247,
"r_x1": 521.9863114205704,
"r_y1": 709.8255882849247,
"r_x2": 521.9863114205704,
"r_y2": 690.244178830579,
"r_x3": 441.304584329099,
"r_y3": 690.244178830579,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"body": [
{
"label": "text",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "text",
"bbox": {
"l": 89.12133215549848,
"t": 717.0599273189902,
"r": 523.3501733013318,
"b": 764.8982933983192,
"coord_origin": "TOPLEFT"
},
"confidence": 0.7318570613861084,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 90.46133071208328,
"r_y0": 764.8982933983192,
"r_x1": 520.7638616365624,
"r_y1": 764.8982933983192,
"r_x2": 520.7638616365624,
"r_y2": 744.0929853742306,
"r_x3": 90.46133071208328,
"r_y3": 744.0929853742306,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.12133215549848,
"r_y0": 741.5247710689902,
"r_x1": 523.3501733013318,
"r_y1": 741.5247710689902,
"r_x2": 523.3501733013318,
"r_y2": 717.0599273189902,
"r_x3": 89.12133215549848,
"r_y3": 717.0599273189902,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"label": "text",
"id": 2,
"page_no": 0,
"cluster": {
"id": 2,
"label": "text",
"bbox": {
"l": 441.304584329099,
"t": 690.244178830579,
"r": 521.9863114205704,
"b": 709.8255882849247,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5982133150100708,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.304584329099,
"r_y0": 709.8255882849247,
"r_x1": 521.9863114205704,
"r_y1": 709.8255882849247,
"r_x2": 521.9863114205704,
"r_y2": 690.244178830579,
"r_x3": 441.304584329099,
"r_y3": 690.244178830579,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"headers": []
}
}
]

View File

@@ -0,0 +1,3 @@
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_411><loc_61><loc_422><loc_128>package</text>
</doctag>

View File

@@ -0,0 +1,109 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "ocr_test_rotated_270",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 10890858393843077593,
"filename": "ocr_test_rotated_270.pdf",
"uri": null
},
"furniture": {
"self_ref": "#/furniture",
"parent": null,
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"parent": null,
"children": [
{
"cref": "#/texts/0"
},
{
"cref": "#/texts/1"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 717.1685859527342,
"t": 524.2990548540179,
"r": 764.8982839673505,
"b": 90.32916553110118,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"formatting": null,
"hyperlink": null
},
{
"self_ref": "#/texts/1",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 691.4680194659409,
"t": 523.0765988200898,
"r": 709.8255850278712,
"b": 442.3948768148814,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
]
}
],
"orig": "package",
"text": "package",
"formatting": null,
"hyperlink": null
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 841.9216918945312,
"height": 595.201171875
},
"image": null,
"page_no": 1
}
}
}

View File

@@ -0,0 +1 @@
package

View File

@@ -0,0 +1,446 @@
[
{
"page_no": 0,
"size": {
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.1685859527342,
"r_y0": 504.8720063438988,
"r_x1": 737.9738558298501,
"r_y1": 504.8720063438988,
"r_x2": 737.9738558298501,
"r_y2": 70.90211702098213,
"r_x3": 717.1685859527342,
"r_y3": 70.90211702098213,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 691.4680194659409,
"r_y0": 152.80629506011857,
"r_x1": 709.8255850278712,
"r_y1": 152.80629506011857,
"r_x2": 709.8255850278712,
"r_y2": 72.12457305491027,
"r_x3": 691.4680194659409,
"r_y3": 72.12457305491027,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
"predictions": {
"layout": {
"clusters": [
{
"id": 0,
"label": "page_header",
"bbox": {
"l": 717.1685859527342,
"t": 70.90211702098213,
"r": 764.8982839673505,
"b": 504.8720063438988,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6915205121040344,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.1685859527342,
"r_y0": 504.8720063438988,
"r_x1": 737.9738558298501,
"r_y1": 504.8720063438988,
"r_x2": 737.9738558298501,
"r_y2": 70.90211702098213,
"r_x3": 717.1685859527342,
"r_y3": 70.90211702098213,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
{
"id": 8,
"label": "text",
"bbox": {
"l": 691.4680194659409,
"t": 72.12457305491027,
"r": 709.8255850278712,
"b": 152.80629506011857,
"coord_origin": "TOPLEFT"
},
"confidence": 1.0,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 691.4680194659409,
"r_y0": 152.80629506011857,
"r_x1": 709.8255850278712,
"r_y1": 152.80629506011857,
"r_x2": 709.8255850278712,
"r_y2": 72.12457305491027,
"r_x3": 691.4680194659409,
"r_y3": 72.12457305491027,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
}
]
},
"tablestructure": {
"table_map": {}
},
"figures_classification": null,
"equations_prediction": null,
"vlm_response": null
},
"assembled": {
"elements": [
{
"label": "page_header",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "page_header",
"bbox": {
"l": 717.1685859527342,
"t": 70.90211702098213,
"r": 764.8982839673505,
"b": 504.8720063438988,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6915205121040344,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.1685859527342,
"r_y0": 504.8720063438988,
"r_x1": 737.9738558298501,
"r_y1": 504.8720063438988,
"r_x2": 737.9738558298501,
"r_y2": 70.90211702098213,
"r_x3": 717.1685859527342,
"r_y3": 70.90211702098213,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"label": "text",
"id": 8,
"page_no": 0,
"cluster": {
"id": 8,
"label": "text",
"bbox": {
"l": 691.4680194659409,
"t": 72.12457305491027,
"r": 709.8255850278712,
"b": 152.80629506011857,
"coord_origin": "TOPLEFT"
},
"confidence": 1.0,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 691.4680194659409,
"r_y0": 152.80629506011857,
"r_x1": 709.8255850278712,
"r_y1": 152.80629506011857,
"r_x2": 709.8255850278712,
"r_y2": 72.12457305491027,
"r_x3": 691.4680194659409,
"r_y3": 72.12457305491027,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"body": [
{
"label": "text",
"id": 8,
"page_no": 0,
"cluster": {
"id": 8,
"label": "text",
"bbox": {
"l": 691.4680194659409,
"t": 72.12457305491027,
"r": 709.8255850278712,
"b": 152.80629506011857,
"coord_origin": "TOPLEFT"
},
"confidence": 1.0,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 691.4680194659409,
"r_y0": 152.80629506011857,
"r_x1": 709.8255850278712,
"r_y1": 152.80629506011857,
"r_x2": 709.8255850278712,
"r_y2": 72.12457305491027,
"r_x3": 691.4680194659409,
"r_y3": 72.12457305491027,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"headers": [
{
"label": "page_header",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "page_header",
"bbox": {
"l": 717.1685859527342,
"t": 70.90211702098213,
"r": 764.8982839673505,
"b": 504.8720063438988,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6915205121040344,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.1685859527342,
"r_y0": 504.8720063438988,
"r_x1": 737.9738558298501,
"r_y1": 504.8720063438988,
"r_x2": 737.9738558298501,
"r_y2": 70.90211702098213,
"r_x3": 717.1685859527342,
"r_y3": 70.90211702098213,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
}
]
}
}
]

View File

@@ -0,0 +1,3 @@
<doctag><page_header><loc_46><loc_75><loc_74><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_370><loc_90><loc_438>package</text>
</doctag>

View File

@@ -0,0 +1,109 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "ocr_test_rotated_90",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 6989291015361162334,
"filename": "ocr_test_rotated_90.pdf",
"uri": null
},
"furniture": {
"self_ref": "#/furniture",
"parent": null,
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"parent": null,
"children": [
{
"cref": "#/texts/0"
},
{
"cref": "#/texts/1"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 77.10171546422428,
"t": 506.07735421856773,
"r": 124.91101654503161,
"b": 71.88562244773436,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"formatting": null,
"hyperlink": null
},
{
"self_ref": "#/texts/1",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 131.21306574279092,
"t": 154.19400205373182,
"r": 152.19606490864376,
"b": 74.12495603322407,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
]
}
],
"orig": "package",
"text": "package",
"formatting": null,
"hyperlink": null
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 841.9216918945312,
"height": 595.201171875
},
"image": null,
"page_no": 1
}
}
}

View File

@@ -0,0 +1 @@
package

View File

@@ -0,0 +1,446 @@
[
{
"page_no": 0,
"size": {
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171546422428,
"r_y0": 520.7638577050515,
"r_x1": 96.6831586150625,
"r_y1": 520.7638577050515,
"r_x2": 96.6831586150625,
"r_y2": 89.23887398109309,
"r_x3": 77.10171546422428,
"r_y3": 89.23887398109309,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.55299576256091,
"r_y0": 523.3155494272656,
"r_x1": 124.91101654503161,
"r_y1": 523.3155494272656,
"r_x2": 124.91101654503161,
"r_y2": 89.12381765643227,
"r_x3": 100.55299576256091,
"r_y3": 89.12381765643227,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
"predictions": {
"layout": {
"clusters": [
{
"id": 0,
"label": "page_header",
"bbox": {
"l": 77.10171546422428,
"t": 89.12381765643227,
"r": 124.91101654503161,
"b": 523.3155494272656,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6016772389411926,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171546422428,
"r_y0": 520.7638577050515,
"r_x1": 96.6831586150625,
"r_y1": 520.7638577050515,
"r_x2": 96.6831586150625,
"r_y2": 89.23887398109309,
"r_x3": 77.10171546422428,
"r_y3": 89.23887398109309,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.55299576256091,
"r_y0": 523.3155494272656,
"r_x1": 124.91101654503161,
"r_y1": 523.3155494272656,
"r_x2": 124.91101654503161,
"r_y2": 89.12381765643227,
"r_x3": 100.55299576256091,
"r_y3": 89.12381765643227,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
{
"id": 1,
"label": "text",
"bbox": {
"l": 131.21306574279092,
"t": 441.0071698212682,
"r": 152.19606490864376,
"b": 521.0762158417759,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5234212875366211,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
}
]
},
"tablestructure": {
"table_map": {}
},
"figures_classification": null,
"equations_prediction": null,
"vlm_response": null
},
"assembled": {
"elements": [
{
"label": "page_header",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "page_header",
"bbox": {
"l": 77.10171546422428,
"t": 89.12381765643227,
"r": 124.91101654503161,
"b": 523.3155494272656,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6016772389411926,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171546422428,
"r_y0": 520.7638577050515,
"r_x1": 96.6831586150625,
"r_y1": 520.7638577050515,
"r_x2": 96.6831586150625,
"r_y2": 89.23887398109309,
"r_x3": 77.10171546422428,
"r_y3": 89.23887398109309,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.55299576256091,
"r_y0": 523.3155494272656,
"r_x1": 124.91101654503161,
"r_y1": 523.3155494272656,
"r_x2": 124.91101654503161,
"r_y2": 89.12381765643227,
"r_x3": 100.55299576256091,
"r_y3": 89.12381765643227,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"label": "text",
"id": 1,
"page_no": 0,
"cluster": {
"id": 1,
"label": "text",
"bbox": {
"l": 131.21306574279092,
"t": 441.0071698212682,
"r": 152.19606490864376,
"b": 521.0762158417759,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5234212875366211,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"body": [
{
"label": "text",
"id": 1,
"page_no": 0,
"cluster": {
"id": 1,
"label": "text",
"bbox": {
"l": 131.21306574279092,
"t": 441.0071698212682,
"r": 152.19606490864376,
"b": 521.0762158417759,
"coord_origin": "TOPLEFT"
},
"confidence": 0.5234212875366211,
"cells": [
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "package"
}
],
"headers": [
{
"label": "page_header",
"id": 0,
"page_no": 0,
"cluster": {
"id": 0,
"label": "page_header",
"bbox": {
"l": 77.10171546422428,
"t": 89.12381765643227,
"r": 124.91101654503161,
"b": 523.3155494272656,
"coord_origin": "TOPLEFT"
},
"confidence": 0.6016772389411926,
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171546422428,
"r_y0": 520.7638577050515,
"r_x1": 96.6831586150625,
"r_y1": 520.7638577050515,
"r_x2": 96.6831586150625,
"r_y2": 89.23887398109309,
"r_x3": 77.10171546422428,
"r_y3": 89.23887398109309,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.55299576256091,
"r_y0": 523.3155494272656,
"r_x1": 124.91101654503161,
"r_y1": 523.3155494272656,
"r_x2": 124.91101654503161,
"r_y2": 89.12381765643227,
"r_x3": 100.55299576256091,
"r_y3": 89.12381765643227,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"children": []
},
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
}
]
}
}
]