Added verify_conversion_result_v2, Regenerate v1 and v2 test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-10 15:37:36 +02:00
parent 7cad290ceb
commit cd72ea2412
61 changed files with 15467 additions and 29 deletions

View File

@@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
</document>

View File

@@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [71.35887908935547, 690.8582153320312, 504.0870056152344, 765.0995483398438], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,3 @@
<document>
<text><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</document>

View File

@@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long