mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 03:55:00 +00:00
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding bow rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid to swallow tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
83 lines
1.8 KiB
JSON
83 lines
1.8 KiB
JSON
{
|
|
"_name": "",
|
|
"type": "pdf-document",
|
|
"description": {
|
|
"title": null,
|
|
"abstract": null,
|
|
"authors": null,
|
|
"affiliations": null,
|
|
"subjects": null,
|
|
"keywords": null,
|
|
"publication_date": null,
|
|
"languages": null,
|
|
"license": null,
|
|
"publishers": null,
|
|
"url_refs": null,
|
|
"references": null,
|
|
"publication": null,
|
|
"reference_count": null,
|
|
"citation_count": null,
|
|
"citation_date": null,
|
|
"advanced": null,
|
|
"analytics": null,
|
|
"logs": [],
|
|
"collection": null,
|
|
"acquisition": null
|
|
},
|
|
"file-info": {
|
|
"filename": "ocr_test.pdf",
|
|
"filename-prov": null,
|
|
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
|
|
"#-pages": 1,
|
|
"collection-name": null,
|
|
"description": null,
|
|
"page-hashes": [
|
|
{
|
|
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
|
|
"model": "default",
|
|
"page": 1
|
|
}
|
|
]
|
|
},
|
|
"main-text": [
|
|
{
|
|
"prov": [
|
|
{
|
|
"bbox": [
|
|
70.90211866351085,
|
|
689.216658542347,
|
|
504.8720079864275,
|
|
764.9216921155637
|
|
],
|
|
"page": 1,
|
|
"span": [
|
|
0,
|
|
94
|
|
],
|
|
"__ref_s3_data": null
|
|
}
|
|
],
|
|
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
|
"type": "paragraph",
|
|
"payload": null,
|
|
"name": "Text",
|
|
"font": null
|
|
}
|
|
],
|
|
"figures": [],
|
|
"tables": [],
|
|
"bitmaps": null,
|
|
"equations": [],
|
|
"footnotes": [],
|
|
"page-dimensions": [
|
|
{
|
|
"height": 841.9216918945312,
|
|
"page": 1,
|
|
"width": 595.201171875
|
|
}
|
|
],
|
|
"page-footers": [],
|
|
"page-headers": [],
|
|
"_s3_data": null,
|
|
"identifiers": null
|
|
} |