diff --git a/tests/data_scanned/ocr_test.doctags.txt b/tests/data_scanned/ocr_test.doctags.txt
index 742c7b69..7cd53510 100644
--- a/tests/data_scanned/ocr_test.doctags.txt
+++ b/tests/data_scanned/ocr_test.doctags.txt
@@ -1,5 +1,3 @@
 <document>
-<subtitle-level-1><location><page_1><loc_12><loc_89><loc_21><loc_91></location>Docling</subtitle-level-1>
-<paragraph><location><page_1><loc_12><loc_84><loc_84><loc_87></location>Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.</paragraph>
-<paragraph><location><page_1><loc_12><loc_58><loc_87><loc_80></location>Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.</paragraph>
+<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
 </document>
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.json b/tests/data_scanned/ocr_test.json
index ed2c157d..bf0fb86d 100644
--- a/tests/data_scanned/ocr_test.json
+++ b/tests/data_scanned/ocr_test.json
@@ -1 +1 @@
-{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test.pdf", "document-hash": "1e6966b64695f3e77f2931dfd42c79050f4a47cd9c53eb32dc061c98a3129b05", "#-pages": 1, "page-hashes": [{"hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.608642578125, 750.5054931640625, 127.90485382080078, 770.1392211914062], "page": 1, "span": [0, 7]}], "text": "Docling", "type": "subtitle-level-1", "name": "Section-header"}, {"prov": [{"bbox": [71.54174041748047, 703.8960571289062, 498.7333068847656, 733.1880493164062], "page": 1, "span": [0, 95]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.", "type": "paragraph", "name": "Text"}, {"prov": [{"bbox": [71.21173858642578, 484.2960510253906, 519.8010864257812, 674.6280517578125], "page": 1, "span": [0, 409]}], "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9200439453125, "page": 1, "width": 595.2000122070312}], "page-footers": [], "page-headers": []}
\ No newline at end of file
+{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.md b/tests/data_scanned/ocr_test.md
index 6c232b73..42896546 100644
--- a/tests/data_scanned/ocr_test.md
+++ b/tests/data_scanned/ocr_test.md
@@ -1,5 +1 @@
-## Docling
-
-Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.
-
-Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.
\ No newline at end of file
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.pages.json b/tests/data_scanned/ocr_test.pages.json
index 94cdaa78..de3f5f5e 100644
--- a/tests/data_scanned/ocr_test.pages.json
+++ b/tests/data_scanned/ocr_test.pages.json
@@ -1 +1 @@
-[{"page_no": 0, "page_hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "size": {"width": 595.2000122070312, "height": 841.9200439453125}, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}, {"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}, {"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "body": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "headers": []}}]
\ No newline at end of file
+[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf
index 866319cd..b79f3c28 100644
Binary files a/tests/data_scanned/ocr_test.pdf and b/tests/data_scanned/ocr_test.pdf differ
diff --git a/tests/data_scanned/ocr_test.png b/tests/data_scanned/ocr_test.png
deleted file mode 100644
index 54359a70..00000000
Binary files a/tests/data_scanned/ocr_test.png and /dev/null differ