docling/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
Peter W. J. Staar c0ba88edf1
feat(cli): add option for html with split-page mode (#1355)
* updated the cli to output html in split-page mode

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* add pin for new docling-core with html split argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* relock with fixed html export in docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update example

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update lock with docling-core fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add again chunking extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-04-14 08:41:50 +02:00

1 line
6.8 KiB
JSON

[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 73.34702132031646, "r_y0": 97.99999977896755, "r_x1": 503.64955224479564, "r_y1": 97.99999977896755, "r_x2": 503.64955224479564, "r_y2": 76.99999977896756, "r_x3": 73.34702132031646, "r_y3": 76.99999977896756, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 69.6796630536824, "r_y0": 124.83139494707741, "r_x1": 504.8720051760782, "r_y1": 124.83139494707741, "r_x2": 504.8720051760782, "r_y2": 104.00000011573796, "r_x3": 69.6796630536824, "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 71.84193505100733, "r_y0": 152.90926970226084, "r_x1": 153.088934155825, "r_y1": 152.90926970226084, "r_x2": 153.088934155825, "r_y2": 129.797125232046, "r_x3": 71.84193505100733, "r_y3": 129.797125232046, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "parsed_page": null, "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 73.34702132031646, "r_y0": 97.99999977896755, "r_x1": 503.64955224479564, "r_y1": 97.99999977896755, "r_x2": 503.64955224479564, "r_y2": 76.99999977896756, "r_x3": 73.34702132031646, "r_y3": 76.99999977896756, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 69.6796630536824, "r_y0": 124.83139494707741, "r_x1": 504.8720051760782, "r_y1": 124.83139494707741, "r_x2": 504.8720051760782, "r_y2": 104.00000011573796, "r_x3": 69.6796630536824, "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 71.84193505100733, "r_y0": 152.90926970226084, "r_x1": 153.088934155825, "r_y1": 152.90926970226084, "r_x2": 153.088934155825, "r_y2": 129.797125232046, "r_x3": 71.84193505100733, "r_y3": 129.797125232046, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 73.34702132031646, "r_y0": 97.99999977896755, "r_x1": 503.64955224479564, "r_y1": 97.99999977896755, "r_x2": 503.64955224479564, "r_y2": 76.99999977896756, "r_x3": 73.34702132031646, "r_y3": 76.99999977896756, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 69.6796630536824, "r_y0": 124.83139494707741, "r_x1": 504.8720051760782, "r_y1": 124.83139494707741, "r_x2": 504.8720051760782, "r_y2": 104.00000011573796, "r_x3": 69.6796630536824, "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 71.84193505100733, "r_y0": 152.90926970226084, "r_x1": 153.088934155825, "r_y1": 152.90926970226084, "r_x2": 153.088934155825, "r_y2": 129.797125232046, "r_x3": 71.84193505100733, "r_y3": 129.797125232046, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"index": 0, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 73.34702132031646, "r_y0": 97.99999977896755, "r_x1": 503.64955224479564, "r_y1": 97.99999977896755, "r_x2": 503.64955224479564, "r_y2": 76.99999977896756, "r_x3": 73.34702132031646, "r_y3": 76.99999977896756, "coord_origin": "TOPLEFT"}, "text": "Docling bundles PDF document conversion to", "orig": "Docling bundles PDF document conversion to", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 1, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 69.6796630536824, "r_y0": 124.83139494707741, "r_x1": 504.8720051760782, "r_y1": 124.83139494707741, "r_x2": 504.8720051760782, "r_y2": 104.00000011573796, "r_x3": 69.6796630536824, "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT"}, "text": "JSON and Markdown in an easy self contained", "orig": "JSON and Markdown in an easy self contained", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}, {"index": 2, "rgba": {"r": 0, "g": 0, "b": 0, "a": 255}, "rect": {"r_x0": 71.84193505100733, "r_y0": 152.90926970226084, "r_x1": 153.088934155825, "r_y1": 152.90926970226084, "r_x2": 153.088934155825, "r_y2": 129.797125232046, "r_x3": 71.84193505100733, "r_y3": 129.797125232046, "coord_origin": "TOPLEFT"}, "text": "package", "orig": "package", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": true}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]