diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt index 3322c749..029be08d 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt @@ -1,5 +1,4 @@ -package -JSON and Markdown in an easy self contained -Docling bundles PDF document conversion to +package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json index 38d07835..982320c3 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json @@ -1 +1 @@ -{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 104.58835856119788, 523.0, 123.25502522786462], "page": 1, "span": [0, 43], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 521.3333333333334, 96.58835856119788], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file +{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [441.304584329099, 132.09610360960653, 521.9863114205704, 151.67751306395223], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [89.12133215549848, 77.02339849621205, 523.3501733013318, 124.86176457554109], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md index 120ab1cc..f5d50b5c 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md @@ -1,5 +1,3 @@ package -JSON and Markdown in an easy self contained - -Docling bundles PDF document conversion to \ No newline at end of file +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json index 34ff80da..a57c3401 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt index 8350737b..d5c2972a 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt @@ -1,3 +1,3 @@ -package +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json index 6b843dca..42e30bf7 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json @@ -1 +1 @@ -{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_270.pdf", "filename-prov": null, "document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [691.6666666666666, 444.53450520833337, 710.3333333333334, 521.5345052083334], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file +{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_270.pdf", "filename-prov": null, "document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [691.4680194659409, 442.3948768148814, 709.8255850278712, 523.0765988200898], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json index c4416b3b..5f76e79a 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt index c1068b56..0b7a3a14 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt @@ -1,4 +1,3 @@ -Docling bundles PDF document conversion to -JSON and Markdown in an easy self contained package +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json index b8076e9e..cd086df8 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json @@ -1 +1 @@ -{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_90.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [78.0, 73.86783854166663, 96.66666666666667, 503.201171875], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [104.66666666666667, 72.201171875, 123.33333333333333, 503.201171875], "page": 1, "span": [0, 51], "__ref_s3_data": null}, {"bbox": [104.66666666666667, 72.201171875, 123.33333333333333, 503.201171875], "page": 1, "span": [0, 51], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file +{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_90.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md index 8d77a437..597acc76 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md @@ -1,3 +1 @@ -Docling bundles PDF document conversion to - -JSON and Markdown in an easy self contained package \ No newline at end of file +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json index 71c1e2bd..89e716e1 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt index 820b0726..da0deb0b 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt @@ -1,4 +1,3 @@ -package -JSON and Markdown in an easy self contained -Docling bundles PDF document conversion to +package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json index 84c4508d..1cb0a4f6 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json @@ -1 +1 @@ -{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 104.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}], "orig": "JSON and Markdown in an easy self contained", "text": "JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 96.58835856119788, "r": 521.3333333333334, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}} \ No newline at end of file +{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 441.304584329099, "t": 151.67751306395223, "r": 521.9863114205704, "b": 132.09610360960653, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 89.12133215549848, "t": 124.86176457554109, "r": 523.3501733013318, "b": 77.02339849621205, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md index 120ab1cc..f5d50b5c 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md @@ -1,5 +1,3 @@ package -JSON and Markdown in an easy self contained - -Docling bundles PDF document conversion to \ No newline at end of file +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json index 34ff80da..a57c3401 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 92.0, "t": 745.3333333333334, "r": 521.3333333333334, "b": 764.0, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.3333333333334, "r_y0": 745.3333333333334, "r_x1": 92.0, "r_y1": 745.3333333333334, "r_x2": 92.0, "r_y2": 764.0, "r_x3": 521.3333333333334, "r_y3": 764.0, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 92.0, "t": 718.6666666666666, "r": 523.0, "b": 737.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 523.0, "r_y0": 718.6666666666666, "r_x1": 92.0, "r_y1": 718.6666666666666, "r_x2": 92.0, "r_y2": 737.3333333333334, "r_x3": 523.0, "r_y3": 737.3333333333334, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 444.6666666666667, "t": 691.6666666666666, "r": 521.6666666666666, "b": 710.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 90.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 521.6666666666666, "r_y0": 691.6666666666666, "r_x1": 444.6666666666667, "r_y1": 691.6666666666666, "r_x2": 444.6666666666667, "r_y2": 710.3333333333334, "r_x3": 521.6666666666666, "r_y3": 710.3333333333334, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 90.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 89.12133215549848, "t": 717.0599273189902, "r": 523.3501733013318, "b": 764.8982933983192, "coord_origin": "TOPLEFT"}, "confidence": 0.7318570613861084, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 90.46133071208328, "r_y0": 764.8982933983192, "r_x1": 520.7638616365624, "r_y1": 764.8982933983192, "r_x2": 520.7638616365624, "r_y2": 744.0929853742306, "r_x3": 90.46133071208328, "r_y3": 744.0929853742306, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 89.12133215549848, "r_y0": 741.5247710689902, "r_x1": 523.3501733013318, "r_y1": 741.5247710689902, "r_x2": 523.3501733013318, "r_y2": 717.0599273189902, "r_x3": 89.12133215549848, "r_y3": 717.0599273189902, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 441.304584329099, "t": 690.244178830579, "r": 521.9863114205704, "b": 709.8255882849247, "coord_origin": "TOPLEFT"}, "confidence": 0.5982133150100708, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 441.304584329099, "r_y0": 709.8255882849247, "r_x1": 521.9863114205704, "r_y1": 709.8255882849247, "r_x2": 521.9863114205704, "r_y2": 690.244178830579, "r_x3": 441.304584329099, "r_y3": 690.244178830579, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt index 2c343d7b..95999c0c 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt @@ -1,3 +1,3 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained -package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json index 580ed117..9a2e18bb 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json @@ -1 +1 @@ -{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_270", "origin": {"mimetype": "application/pdf", "binary_hash": 10890858393843077593, "filename": "ocr_test_rotated_270.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 718.6666666666666, "t": 522.8678385416666, "r": 764.0, "b": 91.86783854166669, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 691.6666666666666, "t": 521.5345052083334, "r": 710.3333333333334, "b": 444.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}} \ No newline at end of file +{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_270", "origin": {"mimetype": "application/pdf", "binary_hash": 10890858393843077593, "filename": "ocr_test_rotated_270.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 717.1685859527342, "t": 524.2990548540179, "r": 764.8982839673505, "b": 90.32916553110118, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 691.4680194659409, "t": 523.0765988200898, "r": 709.8255850278712, "b": 442.3948768148814, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json index c4416b3b..5f76e79a 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.6666666666666, "t": 73.66666666666667, "r": 710.3333333333334, "b": 150.66666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.6666666666666, "r_y0": 73.66666666666667, "r_x1": 691.6666666666666, "r_y1": 150.66666666666666, "r_x2": 710.3333333333334, "r_y2": 150.66666666666666, "r_x3": 691.6666666666666, "r_y3": 150.66666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 718.6666666666666, "t": 72.33333333333333, "r": 764.0, "b": 503.3333333333333, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 745.3333333333334, "r_y0": 74.0, "r_x1": 745.3333333333334, "r_y1": 503.3333333333333, "r_x2": 764.0, "r_y2": 503.3333333333333, "r_x3": 745.3333333333334, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 718.6666666666666, "r_y0": 72.33333333333333, "r_x1": 718.6666666666666, "r_y1": 503.3333333333333, "r_x2": 737.3333333333334, "r_y2": 503.3333333333333, "r_x3": 718.6666666666666, "r_y3": 503.3333333333333, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 8, "page_no": 0, "cluster": {"id": 8, "label": "text", "bbox": {"l": 691.4680194659409, "t": 72.12457305491027, "r": 709.8255850278712, "b": 152.80629506011857, "coord_origin": "TOPLEFT"}, "confidence": 1.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 691.4680194659409, "r_y0": 152.80629506011857, "r_x1": 709.8255850278712, "r_y1": 152.80629506011857, "r_x2": 709.8255850278712, "r_y2": 72.12457305491027, "r_x3": 691.4680194659409, "r_y3": 72.12457305491027, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 717.1685859527342, "t": 70.90211702098213, "r": 764.8982839673505, "b": 504.8720063438988, "coord_origin": "TOPLEFT"}, "confidence": 0.6915205121040344, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 744.0930045534915, "r_y0": 504.87200373583954, "r_x1": 764.8982839673505, "r_y1": 504.87200373583954, "r_x2": 764.8982839673505, "r_y2": 73.34702001188118, "r_x3": 744.0930045534915, "r_y3": 73.34702001188118, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 717.1685859527342, "r_y0": 504.8720063438988, "r_x1": 737.9738558298501, "r_y1": 504.8720063438988, "r_x2": 737.9738558298501, "r_y2": 70.90211702098213, "r_x3": 717.1685859527342, "r_y3": 70.90211702098213, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt index 3d262655..c99f4b1f 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt @@ -1,3 +1,3 @@ -Docling bundles PDF document conversion to -JSON and Markdown in an easy self contained package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json index 3022d1ca..d4bca1a6 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json @@ -1 +1 @@ -{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_90", "origin": {"mimetype": "application/pdf", "binary_hash": 6989291015361162334, "filename": "ocr_test_rotated_90.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 78.0, "t": 503.201171875, "r": 96.66666666666667, "b": 73.86783854166663, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 104.66666666666667, "t": 503.201171875, "r": 123.33333333333333, "b": 72.201171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}, {"page_no": 1, "bbox": {"l": 104.66666666666667, "t": 503.201171875, "r": 123.33333333333333, "b": 72.201171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [44, 51]}], "orig": "JSON and Markdown in an easy self contained package", "text": "JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}} \ No newline at end of file +{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_90", "origin": {"mimetype": "application/pdf", "binary_hash": 6989291015361162334, "filename": "ocr_test_rotated_90.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 77.10171546422428, "t": 506.07735421856773, "r": 124.91101654503161, "b": 71.88562244773436, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 131.21306574279092, "t": 154.19400205373182, "r": 152.19606490864376, "b": 74.12495603322407, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md index 8d77a437..597acc76 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md @@ -1,3 +1 @@ -Docling bundles PDF document conversion to - -JSON and Markdown in an easy self contained package \ No newline at end of file +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json index 71c1e2bd..89e716e1 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json @@ -1 +1 @@ -[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 78.0, "t": 92.0, "r": 96.66666666666667, "b": 521.3333333333334, "coord_origin": "TOPLEFT"}, "confidence": 94.0, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 96.66666666666667, "r_y0": 521.3333333333334, "r_x1": 96.66666666666667, "r_y1": 92.0, "r_x2": 78.0, "r_y2": 92.0, "r_x3": 78.0, "r_y3": 521.3333333333334, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 94.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to"}, {"label": "text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "text", "bbox": {"l": 104.66666666666667, "t": 92.0, "r": 123.33333333333333, "b": 523.0, "coord_origin": "TOPLEFT"}, "confidence": 92.0, "cells": [{"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 123.33333333333333, "r_y0": 523.0, "r_x1": 123.33333333333333, "r_y1": 92.0, "r_x2": 104.66666666666667, "r_y2": 92.0, "r_x3": 104.66666666666667, "r_y3": 523.0, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 92.0, "from_ocr": true}], "children": []}, "text": "JSON and Markdown in an easy self contained"}, {"label": "text", "id": 3, "page_no": 0, "cluster": {"id": 3, "label": "text", "bbox": {"l": 131.66666666666666, "t": 444.6666666666667, "r": 150.33333333333334, "b": 521.6666666666666, "coord_origin": "TOPLEFT"}, "confidence": 89.0, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 150.33333333333334, "r_y0": 521.6666666666666, "r_x1": 150.33333333333334, "r_y1": 444.6666666666667, "r_x2": 131.66666666666666, "r_y2": 444.6666666666667, "r_x3": 131.66666666666666, "r_y3": 521.6666666666666, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 89.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": []}}] \ No newline at end of file +[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 131.21306574279092, "r_y0": 521.0762158417759, "r_x1": 152.19606490864376, "r_y1": 521.0762158417759, "r_x2": 152.19606490864376, "r_y2": 441.0071698212682, "r_x3": 131.21306574279092, "r_y3": 441.0071698212682, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 77.10171546422428, "r_y0": 520.7638577050515, "r_x1": 96.6831586150625, "r_y1": 520.7638577050515, "r_x2": 96.6831586150625, "r_y2": 89.23887398109309, "r_x3": 77.10171546422428, "r_y3": 89.23887398109309, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 100.55299576256091, "r_y0": 523.3155494272656, "r_x1": 124.91101654503161, "r_y1": 523.3155494272656, "r_x2": 124.91101654503161, "r_y2": 89.12381765643227, "r_x3": 100.55299576256091, "r_y3": 89.12381765643227, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file