Use different OCR engine order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-11 15:59:47 +02:00
parent 9752e824fb
commit 9469280802
24 changed files with 1300 additions and 10900 deletions

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
72.33333333333333, 69.6796630536824,
691.5883585611979, 689.0124221922704,
503.3333333333333, 504.8720051760782,
763.9216918945312 764.9216921155637
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
<document> <document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph> <paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph> <paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
444.6666666666667, 441.2561096985719,
131.58835856119788, 131.89488404865142,
521.6666666666666, 522.0347860494834,
150.25502522786462 151.87873262042876
], ],
"page": 1, "page": 1,
"span": [ "span": [
@ -67,10 +67,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
92.0, 89.23887497045128,
77.92169189453125, 77.02339852098021,
523.0, 523.208764293368,
123.25502522786462 124.75312428291147
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document> <document>
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph> <paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
691.6666666666666, 690.2441821046808,
444.53450520833337, 442.39487414368364,
710.3333333333334, 709.8255852011977,
521.5345052083334 523.076601235155
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document> <document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph> <paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
131.66666666666666, 131.21306574279092,
73.53450520833337, 74.12495603322407,
150.33333333333334, 152.19606490864376,
150.53450520833331 154.19400205373182
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text> <doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag> </doctag>

View File

@ -42,10 +42,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 72.33333333333333, "l": 69.6796630536824,
"t": 763.9216918945312, "t": 764.9216921155637,
"r": 503.3333333333333, "r": 504.8720051760782,
"b": 691.5883585611979, "b": 689.0124221922704,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text> <doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text> <text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 444.6666666666667, "l": 441.2561096985719,
"t": 150.25502522786462, "t": 151.87873262042876,
"r": 521.6666666666666, "r": 522.0347860494834,
"b": 131.58835856119788, "b": 131.89488404865142,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 92.0, "l": 89.23887497045128,
"t": 123.25502522786462, "t": 124.75312428291147,
"r": 523.0, "r": 523.208764293368,
"b": 77.92169189453125, "b": 77.02339852098021,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_411><loc_62><loc_422><loc_127>package</text> <text><loc_410><loc_61><loc_422><loc_128>package</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 718.6666666666666, "l": 717.168585936602,
"t": 522.8678385416666, "t": 524.2990550512769,
"r": 764.0, "r": 764.8982839673505,
"b": 91.86783854166669, "b": 90.3291657283603,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 691.6666666666666, "l": 690.2441821046808,
"t": 521.5345052083334, "t": 523.076601235155,
"r": 710.3333333333334, "r": 709.8255852011977,
"b": 444.53450520833337, "b": 442.39487414368364,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_374><loc_89><loc_438>package</text> <text><loc_78><loc_370><loc_90><loc_438>package</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 78.0, "l": 77.10171545548258,
"t": 503.201171875, "t": 506.0744964609271,
"r": 123.33333333333333, "r": 126.08064862014129,
"b": 72.201171875, "b": 71.87755635676046,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 131.66666666666666, "l": 131.21306574279092,
"t": 150.53450520833331, "t": 154.19400205373182,
"r": 150.33333333333334, "r": 152.19606490864376,
"b": 73.53450520833337, "b": 74.12495603322407,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
engines: List[Tuple[OcrOptions, bool]] = [ engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True), (TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True), (TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False), (EasyOcrOptions(), False),
(TesseractOcrOptions(force_full_page_ocr=True), True), (TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True), (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True), (TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True), (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
] ]
# rapidocr is only available for Python >=3.6,<3.13 # rapidocr is only available for Python >=3.6,<3.13