Use different OCR engine order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-11 15:59:47 +02:00
parent 9752e824fb
commit 9469280802
24 changed files with 1300 additions and 10900 deletions

View File

@ -44,10 +44,10 @@
"prov": [
{
"bbox": [
72.33333333333333,
691.5883585611979,
503.3333333333333,
763.9216918945312
69.6796630536824,
689.0124221922704,
504.8720051760782,
764.9216921155637
],
"page": 1,
"span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
<document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document>

View File

@ -44,10 +44,10 @@
"prov": [
{
"bbox": [
444.6666666666667,
131.58835856119788,
521.6666666666666,
150.25502522786462
441.2561096985719,
131.89488404865142,
522.0347860494834,
151.87873262042876
],
"page": 1,
"span": [
@ -67,10 +67,10 @@
"prov": [
{
"bbox": [
92.0,
77.92169189453125,
523.0,
123.25502522786462
89.23887497045128,
77.02339852098021,
523.208764293368,
124.75312428291147
],
"page": 1,
"span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document>
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
</document>

View File

@ -44,10 +44,10 @@
"prov": [
{
"bbox": [
691.6666666666666,
444.53450520833337,
710.3333333333334,
521.5345052083334
690.2441821046808,
442.39487414368364,
709.8255852011977,
523.076601235155
],
"page": 1,
"span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -44,10 +44,10 @@
"prov": [
{
"bbox": [
131.66666666666666,
73.53450520833337,
150.33333333333334,
150.53450520833331
131.21306574279092,
74.12495603322407,
152.19606490864376,
154.19400205373182
],
"page": 1,
"span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@ -42,10 +42,10 @@
{
"page_no": 1,
"bbox": {
"l": 72.33333333333333,
"t": 763.9216918945312,
"r": 503.3333333333333,
"b": 691.5883585611979,
"l": 69.6796630536824,
"t": 764.9216921155637,
"r": 504.8720051760782,
"b": 689.0124221922704,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag>

View File

@ -45,10 +45,10 @@
{
"page_no": 1,
"bbox": {
"l": 444.6666666666667,
"t": 150.25502522786462,
"r": 521.6666666666666,
"b": 131.58835856119788,
"l": 441.2561096985719,
"t": 151.87873262042876,
"r": 522.0347860494834,
"b": 131.89488404865142,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@ -74,10 +74,10 @@
{
"page_no": 1,
"bbox": {
"l": 92.0,
"t": 123.25502522786462,
"r": 523.0,
"b": 77.92169189453125,
"l": 89.23887497045128,
"t": 124.75312428291147,
"r": 523.208764293368,
"b": 77.02339852098021,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_411><loc_62><loc_422><loc_127>package</text>
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_410><loc_61><loc_422><loc_128>package</text>
</doctag>

View File

@ -45,10 +45,10 @@
{
"page_no": 1,
"bbox": {
"l": 718.6666666666666,
"t": 522.8678385416666,
"r": 764.0,
"b": 91.86783854166669,
"l": 717.168585936602,
"t": 524.2990550512769,
"r": 764.8982839673505,
"b": 90.3291657283603,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@ -74,10 +74,10 @@
{
"page_no": 1,
"bbox": {
"l": 691.6666666666666,
"t": 521.5345052083334,
"r": 710.3333333333334,
"b": 444.53450520833337,
"l": 690.2441821046808,
"t": 523.076601235155,
"r": 709.8255852011977,
"b": 442.39487414368364,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_374><loc_89><loc_438>package</text>
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_370><loc_90><loc_438>package</text>
</doctag>

View File

@ -45,10 +45,10 @@
{
"page_no": 1,
"bbox": {
"l": 78.0,
"t": 503.201171875,
"r": 123.33333333333333,
"b": 72.201171875,
"l": 77.10171545548258,
"t": 506.0744964609271,
"r": 126.08064862014129,
"b": 71.87755635676046,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
@ -74,10 +74,10 @@
{
"page_no": 1,
"bbox": {
"l": 131.66666666666666,
"t": 150.53450520833331,
"r": 150.33333333333334,
"b": 73.53450520833337,
"l": 131.21306574279092,
"t": 154.19400205373182,
"r": 152.19606490864376,
"b": 74.12495603322407,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
(EasyOcrOptions(), False),
(TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
]
# rapidocr is only available for Python >=3.6,<3.13