mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Use different OCR engine order
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
9752e824fb
commit
9469280802
@ -44,10 +44,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
72.33333333333333,
|
||||
691.5883585611979,
|
||||
503.3333333333333,
|
||||
763.9216918945312
|
||||
69.6796630536824,
|
||||
689.0124221922704,
|
||||
504.8720051760782,
|
||||
764.9216921155637
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
|
||||
</document>
|
@ -44,10 +44,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
444.6666666666667,
|
||||
131.58835856119788,
|
||||
521.6666666666666,
|
||||
150.25502522786462
|
||||
441.2561096985719,
|
||||
131.89488404865142,
|
||||
522.0347860494834,
|
||||
151.87873262042876
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
@ -67,10 +67,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
92.0,
|
||||
77.92169189453125,
|
||||
523.0,
|
||||
123.25502522786462
|
||||
89.23887497045128,
|
||||
77.02339852098021,
|
||||
523.208764293368,
|
||||
124.75312428291147
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,3 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
|
||||
</document>
|
@ -44,10 +44,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
691.6666666666666,
|
||||
444.53450520833337,
|
||||
710.3333333333334,
|
||||
521.5345052083334
|
||||
690.2441821046808,
|
||||
442.39487414368364,
|
||||
709.8255852011977,
|
||||
523.076601235155
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,3 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||
</document>
|
@ -44,10 +44,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
131.66666666666666,
|
||||
73.53450520833337,
|
||||
150.33333333333334,
|
||||
150.53450520833331
|
||||
131.21306574279092,
|
||||
74.12495603322407,
|
||||
152.19606490864376,
|
||||
154.19400205373182
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,2 +1,2 @@
|
||||
<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
</doctag>
|
@ -42,10 +42,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 72.33333333333333,
|
||||
"t": 763.9216918945312,
|
||||
"r": 503.3333333333333,
|
||||
"b": 691.5883585611979,
|
||||
"l": 69.6796630536824,
|
||||
"t": 764.9216921155637,
|
||||
"r": 504.8720051760782,
|
||||
"b": 689.0124221922704,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,3 @@
|
||||
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
|
||||
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
|
||||
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||
</doctag>
|
@ -45,10 +45,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 444.6666666666667,
|
||||
"t": 150.25502522786462,
|
||||
"r": 521.6666666666666,
|
||||
"b": 131.58835856119788,
|
||||
"l": 441.2561096985719,
|
||||
"t": 151.87873262042876,
|
||||
"r": 522.0347860494834,
|
||||
"b": 131.89488404865142,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
@ -74,10 +74,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 92.0,
|
||||
"t": 123.25502522786462,
|
||||
"r": 523.0,
|
||||
"b": 77.92169189453125,
|
||||
"l": 89.23887497045128,
|
||||
"t": 124.75312428291147,
|
||||
"r": 523.208764293368,
|
||||
"b": 77.02339852098021,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,3 @@
|
||||
<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_411><loc_62><loc_422><loc_127>package</text>
|
||||
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_410><loc_61><loc_422><loc_128>package</text>
|
||||
</doctag>
|
@ -45,10 +45,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 718.6666666666666,
|
||||
"t": 522.8678385416666,
|
||||
"r": 764.0,
|
||||
"b": 91.86783854166669,
|
||||
"l": 717.168585936602,
|
||||
"t": 524.2990550512769,
|
||||
"r": 764.8982839673505,
|
||||
"b": 90.3291657283603,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
@ -74,10 +74,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 691.6666666666666,
|
||||
"t": 521.5345052083334,
|
||||
"r": 710.3333333333334,
|
||||
"b": 444.53450520833337,
|
||||
"l": 690.2441821046808,
|
||||
"t": 523.076601235155,
|
||||
"r": 709.8255852011977,
|
||||
"b": 442.39487414368364,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,3 @@
|
||||
<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_78><loc_374><loc_89><loc_438>package</text>
|
||||
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_78><loc_370><loc_90><loc_438>package</text>
|
||||
</doctag>
|
@ -45,10 +45,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 78.0,
|
||||
"t": 503.201171875,
|
||||
"r": 123.33333333333333,
|
||||
"b": 72.201171875,
|
||||
"l": 77.10171545548258,
|
||||
"t": 506.0744964609271,
|
||||
"r": 126.08064862014129,
|
||||
"b": 71.87755635676046,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
@ -74,10 +74,10 @@
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 131.66666666666666,
|
||||
"t": 150.53450520833331,
|
||||
"r": 150.33333333333334,
|
||||
"b": 73.53450520833337,
|
||||
"l": 131.21306574279092,
|
||||
"t": 154.19400205373182,
|
||||
"r": 152.19606490864376,
|
||||
"b": 74.12495603322407,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -57,14 +57,14 @@ def test_e2e_conversions():
|
||||
pdf_paths = get_pdf_paths()
|
||||
|
||||
engines: List[Tuple[OcrOptions, bool]] = [
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(), True),
|
||||
(TesseractCliOcrOptions(), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
]
|
||||
|
||||
# rapidocr is only available for Python >=3.6,<3.13
|
||||
|
Loading…
Reference in New Issue
Block a user