mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix(test): Update test data for OCR
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
5bd64779d1
commit
072aaf6bb1
@ -1,5 +1,3 @@
|
|||||||
<document>
|
<document>
|
||||||
<subtitle-level-1><location><page_1><loc_12><loc_89><loc_21><loc_91></location>Docling</subtitle-level-1>
|
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||||
<paragraph><location><page_1><loc_12><loc_84><loc_84><loc_87></location>Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.</paragraph>
|
|
||||||
<paragraph><location><page_1><loc_12><loc_58><loc_87><loc_80></location>Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.</paragraph>
|
|
||||||
</document>
|
</document>
|
@ -1 +1 @@
|
|||||||
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test.pdf", "document-hash": "1e6966b64695f3e77f2931dfd42c79050f4a47cd9c53eb32dc061c98a3129b05", "#-pages": 1, "page-hashes": [{"hash": "5b246e5b7c627e174ffcbbe2a41131c2f19e4c2b02314f6bc9ca65c11f9b8d76", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.608642578125, 750.5054931640625, 127.90485382080078, 770.1392211914062], "page": 1, "span": [0, 7]}], "text": "Docling", "type": "subtitle-level-1", "name": "Section-header"}, {"prov": [{"bbox": [71.54174041748047, 703.8960571289062, 498.7333068847656, 733.1880493164062], "page": 1, "span": [0, 95]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.", "type": "paragraph", "name": "Text"}, {"prov": [{"bbox": [71.21173858642578, 484.2960510253906, 519.8010864257812, 674.6280517578125], "page": 1, "span": [0, 409]}], "text": "Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9200439453125, "page": 1, "width": 595.2000122070312}], "page-footers": [], "page-headers": []}
|
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}
|
@ -1,5 +1 @@
|
|||||||
## Docling
|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||||
|
|
||||||
Docling bundles PDF document conversion to JSON and Markdown in an easy, selfcontained package.
|
|
||||||
|
|
||||||
Features Converts any PDF document to JSON or Markdown format, stable and lightning fast. Understands detailed page layout, reading order and recovers table structures. Extracts metadata from the document, such as title, authors, references and language. Includes OCR support for scanned PDFs. Integrates easily with LLM app / RAG frameworks like LlamaIndex and LangChain Provides a simple and convenient CLI.
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 82 KiB |
Loading…
Reference in New Issue
Block a user