docling/tests/data_scanned/ocr_test.pages.json


			
				
					
						
						
						
							
							
							[{"page_no": 0, "page_hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "size": {"width": 595.2000122070312, "height": 841.9200439453125}, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}, {"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}, {"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "body": [{"label": "Section-header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Section-header", "bbox": {"l": 71.608642578125, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}, "confidence": 0.8694888949394226, "cells": [{"id": 0, "text": "Docling ", "bbox": {"l": 72.00000026697958, "t": 71.78082545188033, "r": 127.90485047427754, "b": 91.41455694309684, "coord_origin": "1"}}]}, "text": "Docling"}, {"label": "Text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "Text", "bbox": {"l": 71.54174041748047, "t": 108.73199825838697, "r": 498.7333068847656, "b": 138.02399048316556, "coord_origin": "1"}, "confidence": 0.8374634981155396, "cells": [{"id": 1, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, self-", "bbox": {"l": 72.00000026697958, "t": 108.73199825838697, "r": 498.1307718470936, "b": 123.38397937123091, "coord_origin": "1"}}, {"id": 2, "text": "contained package. ", "bbox": {"l": 72.00000026697958, "t": 123.371959370318, "r": 175.68600065145242, "b": 138.02399048316556, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package."}, {"label": "Text", "id": 2, "page_no": 0, "cluster": {"id": 2, "label": "Text", "bbox": {"l": 71.21173858642578, "t": 167.29200270612273, "r": 519.8010919274483, "b": 357.6239871621727, "coord_origin": "1"}, "confidence": 0.6254582405090332, "cells": [{"id": 3, "text": "Features ", "bbox": {"l": 72.00000026697958, "t": 167.29200270612273, "r": 119.69380044383055, "b": 181.94398381896667, "coord_origin": "1"}}, {"id": 4, "text": "Converts any PDF document to JSON or Markdown format, stable and lightning fast. ", "bbox": {"l": 72.00000026697958, "t": 196.5719749299883, "r": 503.4534918668306, "b": 211.22395604283201, "coord_origin": "1"}}, {"id": 5, "text": "Understands detailed page layout, reading order and recovers table structures. ", "bbox": {"l": 72.00000026697958, "t": 225.85200715385838, "r": 478.9497717759695, "b": 240.50397826670132, "coord_origin": "1"}}, {"id": 6, "text": "Extracts metadata from the document, such as title, authors, references and language. ", "bbox": {"l": 72.00000026697958, "t": 255.13197937772395, "r": 519.8010919274483, "b": 269.7840104905715, "coord_origin": "1"}}, {"id": 7, "text": "Includes OCR support for scanned PDFs. ", "bbox": {"l": 72.00000026697958, "t": 284.41200160159326, "r": 285.15097105735396, "b": 299.0639827144371, "coord_origin": "1"}}, {"id": 8, "text": "Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain ", "bbox": {"l": 72.00000026697958, "t": 313.69197382545883, "r": 486.82465180517005, "b": 328.34395493830255, "coord_origin": "1"}}, {"id": 9, "text": "Provides a simple and convenient CLI. ", "bbox": {"l": 72.00000026697958, "t": 342.97197604932654, "r": 270.3559310024932, "b": 357.6239871621727, "coord_origin": "1"}}]}, "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI."}], "headers": []}}]
						
						
					
				
				
					
						Reference in New Issue
					
					View Git Blame
					Copy Permalink