fix(test): Update test data for OCR

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-08 14:12:22 +02:00
parent 5bd64779d1
commit 072aaf6bb1
6 changed files with 4 additions and 10 deletions

View File

@ -1,5 +1,3 @@
<document>
<subtitle-level-1><location><page_1><loc_12><loc_89><loc_21><loc_91></location>Docling</subtitle-level-1>
<paragraph><location><page_1><loc_12><loc_84><loc_84><loc_87></location>Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.</paragraph>
<paragraph><location><page_1><loc_12><loc_58><loc_87><loc_80></location>Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.</paragraph>
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
</document>

View File

@ -1 +1 @@
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test.pdf", "document-hash": "1e6966b64695f3e77f2931dfd42c79050f4a47cd9c53eb32dc061c98a3129b05", "#-pages": 1, "page-hashes": [{"hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.608642578125, 750.5054931640625, 127.90485382080078, 770.1392211914062], "page": 1, "span": [0, 7]}], "text": "Docling", "type": "subtitle-level-1", "name": "Section-header"}, {"prov": [{"bbox": [71.54174041748047, 703.8960571289062, 498.7333068847656, 733.1880493164062], "page": 1, "span": [0, 95]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.", "type": "paragraph", "name": "Text"}, {"prov": [{"bbox": [71.21173858642578, 484.2960510253906, 519.8010864257812, 674.6280517578125], "page": 1, "span": [0, 409]}], "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9200439453125, "page": 1, "width": 595.2000122070312}], "page-footers": [], "page-headers": []}
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}

View File

@ -1,5 +1 @@
## Docling
Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.
Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 82 KiB