mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
feat(ocr): auto-detect rotated pages in Tesseract (#1167)
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding bow rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid to swallow tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
@@ -4913,9 +4913,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
315.65362548828125,
|
||||
489.19854736328125,
|
||||
489.1985778808594,
|
||||
537.1475219726562,
|
||||
563.2765655517578
|
||||
563.276611328125
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
@@ -4979,9 +4979,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
312.10369873046875,
|
||||
541.3901214599609,
|
||||
541.3901519775391,
|
||||
550.38916015625,
|
||||
713.5591125488281
|
||||
713.5591354370117
|
||||
],
|
||||
"page": 3,
|
||||
"span": [
|
||||
@@ -5003,7 +5003,7 @@
|
||||
74.30525970458984,
|
||||
608.2984924316406,
|
||||
519.9801025390625,
|
||||
714.0887908935547
|
||||
714.0887985229492
|
||||
],
|
||||
"page": 5,
|
||||
"span": [
|
||||
@@ -5024,7 +5024,7 @@
|
||||
"bbox": [
|
||||
53.03328323364258,
|
||||
284.3311462402344,
|
||||
285.3731384277344,
|
||||
285.3731689453125,
|
||||
534.3346557617188
|
||||
],
|
||||
"page": 5,
|
||||
@@ -5047,7 +5047,7 @@
|
||||
49.97503662109375,
|
||||
604.4210662841797,
|
||||
301.6335754394531,
|
||||
688.2873153686523
|
||||
688.2873382568359
|
||||
],
|
||||
"page": 8,
|
||||
"span": [
|
||||
@@ -5066,7 +5066,7 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
305.58367919921875,
|
||||
305.5836486816406,
|
||||
611.3732452392578,
|
||||
554.8258666992188,
|
||||
693.3458404541016
|
||||
@@ -5111,9 +5111,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
216.76925659179688,
|
||||
348.6529541015625,
|
||||
348.65301513671875,
|
||||
375.7829284667969,
|
||||
411.5093994140625
|
||||
411.5093688964844
|
||||
],
|
||||
"page": 8,
|
||||
"span": [
|
||||
@@ -5132,10 +5132,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
383.1363830566406,
|
||||
383.1364440917969,
|
||||
349.2250671386719,
|
||||
542.1131591796875,
|
||||
410.7687072753906
|
||||
542.1132202148438,
|
||||
410.7686767578125
|
||||
],
|
||||
"page": 8,
|
||||
"span": [
|
||||
@@ -5220,7 +5220,7 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
51.153778076171875,
|
||||
51.15378952026367,
|
||||
447.09332275390625,
|
||||
282.8598937988281,
|
||||
687.6914825439453
|
||||
@@ -5286,8 +5286,8 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
55.11635208129883,
|
||||
542.6654968261719,
|
||||
55.116363525390625,
|
||||
542.6654510498047,
|
||||
279.370849609375,
|
||||
655.7449951171875
|
||||
],
|
||||
@@ -5375,9 +5375,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
323.46868896484375,
|
||||
327.73956298828125,
|
||||
327.739501953125,
|
||||
525.9569091796875,
|
||||
429.5492248535156
|
||||
429.5491638183594
|
||||
],
|
||||
"page": 15,
|
||||
"span": [
|
||||
@@ -5421,7 +5421,7 @@
|
||||
66.79948425292969,
|
||||
293.8616027832031,
|
||||
528.5565795898438,
|
||||
538.3837127685547
|
||||
538.3836822509766
|
||||
],
|
||||
"page": 16,
|
||||
"span": [
|
||||
@@ -5443,9 +5443,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
315.65362548828125,
|
||||
489.19854736328125,
|
||||
489.1985778808594,
|
||||
537.1475219726562,
|
||||
563.2765655517578
|
||||
563.276611328125
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
@@ -6250,10 +6250,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
310.6757507324219,
|
||||
310.67584228515625,
|
||||
636.7794799804688,
|
||||
542.9546508789062,
|
||||
718.8061218261719
|
||||
542.9547119140625,
|
||||
718.8061141967773
|
||||
],
|
||||
"page": 4,
|
||||
"span": [
|
||||
@@ -9329,7 +9329,7 @@
|
||||
332.9688720703125,
|
||||
148.73028564453125,
|
||||
520.942138671875,
|
||||
251.71649169921875
|
||||
251.7164306640625
|
||||
],
|
||||
"page": 7,
|
||||
"span": [
|
||||
@@ -10152,9 +10152,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
53.62853240966797,
|
||||
499.6000061035156,
|
||||
499.60003662109375,
|
||||
298.5574951171875,
|
||||
573.0514526367188
|
||||
573.0514221191406
|
||||
],
|
||||
"page": 8,
|
||||
"span": [
|
||||
@@ -12941,7 +12941,7 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
83.31759643554688,
|
||||
83.31756591796875,
|
||||
304.7430114746094,
|
||||
248.873046875,
|
||||
395.9864501953125
|
||||
@@ -12968,9 +12968,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
310.3294372558594,
|
||||
655.8524932861328,
|
||||
655.8524780273438,
|
||||
555.8338623046875,
|
||||
690.8223342895508
|
||||
690.8223266601562
|
||||
],
|
||||
"page": 13,
|
||||
"span": [
|
||||
@@ -12994,9 +12994,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
309.9566345214844,
|
||||
607.2774353027344,
|
||||
607.2774658203125,
|
||||
555.7466430664062,
|
||||
637.3854827880859
|
||||
637.3855133056641
|
||||
],
|
||||
"page": 13,
|
||||
"span": [
|
||||
@@ -13019,10 +13019,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
309.96356201171875,
|
||||
558.4485168457031,
|
||||
555.7053833007812,
|
||||
596.2945098876953
|
||||
309.9635314941406,
|
||||
558.4485473632812,
|
||||
555.7054443359375,
|
||||
596.2945861816406
|
||||
],
|
||||
"page": 13,
|
||||
"span": [
|
||||
@@ -13175,10 +13175,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
51.726383209228516,
|
||||
447.7555236816406,
|
||||
283.1140441894531,
|
||||
518.3907470703125
|
||||
51.72642135620117,
|
||||
447.7554931640625,
|
||||
283.114013671875,
|
||||
518.3907165527344
|
||||
],
|
||||
"page": 14,
|
||||
"span": [
|
||||
@@ -13201,7 +13201,7 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
51.43488693237305,
|
||||
51.434879302978516,
|
||||
300.17974853515625,
|
||||
310.7267150878906,
|
||||
338.51251220703125
|
||||
@@ -13253,7 +13253,7 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
51.2728271484375,
|
||||
51.27280807495117,
|
||||
200.086669921875,
|
||||
311.0897216796875,
|
||||
238.271484375
|
||||
@@ -13435,10 +13435,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
319.0649719238281,
|
||||
319.06494140625,
|
||||
122.80792236328125,
|
||||
533.7738647460938,
|
||||
182.1590576171875
|
||||
533.77392578125,
|
||||
182.1591796875
|
||||
],
|
||||
"page": 14,
|
||||
"span": [
|
||||
@@ -13461,8 +13461,8 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
55.11635208129883,
|
||||
542.6654968261719,
|
||||
55.116363525390625,
|
||||
542.6654510498047,
|
||||
279.370849609375,
|
||||
655.7449951171875
|
||||
],
|
||||
@@ -13513,10 +13513,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
323.0059814453125,
|
||||
569.0885772705078,
|
||||
525.9517211914062,
|
||||
670.4528656005859
|
||||
323.0059509277344,
|
||||
569.0885925292969,
|
||||
525.95166015625,
|
||||
670.4528503417969
|
||||
],
|
||||
"page": 15,
|
||||
"span": [
|
||||
@@ -13540,9 +13540,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
323.384765625,
|
||||
447.9078674316406,
|
||||
447.90789794921875,
|
||||
526.1268920898438,
|
||||
550.0270690917969
|
||||
550.0270538330078
|
||||
],
|
||||
"page": 15,
|
||||
"span": [
|
||||
@@ -13566,9 +13566,9 @@
|
||||
{
|
||||
"bbox": [
|
||||
323.46868896484375,
|
||||
327.73956298828125,
|
||||
327.739501953125,
|
||||
525.9569091796875,
|
||||
429.5492248535156
|
||||
429.5491638183594
|
||||
],
|
||||
"page": 15,
|
||||
"span": [
|
||||
|
||||
Reference in New Issue
Block a user