diff --git a/tests/data_scanned/ocr_test.doctags.txt b/tests/data_scanned/ocr_test.doctags.txt index 742c7b69..7cd53510 100644 --- a/tests/data_scanned/ocr_test.doctags.txt +++ b/tests/data_scanned/ocr_test.doctags.txt @@ -1,5 +1,3 @@ -Docling -Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package. -Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI. +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.json b/tests/data_scanned/ocr_test.json index ed2c157d..bf0fb86d 100644 --- a/tests/data_scanned/ocr_test.json +++ b/tests/data_scanned/ocr_test.json @@ -1 +1 @@ -{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test.pdf", "document-hash": "1e6966b64695f3e77f2931dfd42c79050f4a47cd9c53eb32dc061c98a3129b05", "#-pages": 1, "page-hashes": [{"hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.608642578125, 750.5054931640625, 127.90485382080078, 770.1392211914062], "page": 1, "span": [0, 7]}], "text": "Docling", "type": "subtitle-level-1", "name": "Section-header"}, {"prov": [{"bbox": [71.54174041748047, 703.8960571289062, 498.7333068847656, 733.1880493164062], "page": 1, "span": [0, 95]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.", "type": "paragraph", "name": "Text"}, {"prov": [{"bbox": [71.21173858642578, 484.2960510253906, 519.8010864257812, 674.6280517578125], "page": 1, "span": [0, 409]}], "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9200439453125, "page": 1, "width": 595.2000122070312}], "page-footers": [], "page-headers": []} \ No newline at end of file +{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []} \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.md b/tests/data_scanned/ocr_test.md index 6c232b73..42896546 100644 --- a/tests/data_scanned/ocr_test.md +++ b/tests/data_scanned/ocr_test.md @@ -1,5 +1 @@ -## Docling - -Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package. - -Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI. \ No newline at end of file +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pages.json b/tests/data_scanned/ocr_test.pages.json index 94cdaa78..de3f5f5e 100644 --- a/tests/data_scanned/ocr_test.pages.json +++ b/tests/data_scanned/ocr_test.pages.json @@ -1 +1 @@ -[{"page_no": 0, "page_hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "size": {"width": 595.2000122070312, "height": 841.9200439453125}, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}, {"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}, {"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "body": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "headers": []}}] \ No newline at end of file +[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}] \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf index 866319cd..b79f3c28 100644 Binary files a/tests/data_scanned/ocr_test.pdf and b/tests/data_scanned/ocr_test.pdf differ diff --git a/tests/data_scanned/ocr_test.png b/tests/data_scanned/ocr_test.png deleted file mode 100644 index 54359a70..00000000 Binary files a/tests/data_scanned/ocr_test.png and /dev/null differ