feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): support mis-oriented documents in Tesseract

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding box rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid swallowing tesseract errors that cause orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -3411,7 +3411,7 @@
"b": 519.65363,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9695364832878113,
"confidence": 0.9695363640785217,
"cells": [
{
"index": 34,
@@ -4081,7 +4081,7 @@
"b": 142.65363000000002,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9263732433319092,
"confidence": 0.9263731241226196,
"cells": [
{
"index": 59,
@@ -4611,7 +4611,7 @@
"b": 382.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9253151416778564,
"confidence": 0.9253152012825012,
"cells": [
{
"index": 79,
@@ -4651,7 +4651,7 @@
"b": 409.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9676452875137329,
"confidence": 0.9676451683044434,
"cells": [
{
"index": 80,
@@ -4711,12 +4711,12 @@
"label": "picture",
"bbox": {
"l": 320.4467468261719,
"t": 421.6407165527344,
"t": 421.640625,
"r": 558.8576049804688,
"b": 692.310791015625,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9881085753440857,
"confidence": 0.9881086945533752,
"cells": [
{
"index": 82,
@@ -5463,7 +5463,7 @@
"b": 713.009598,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9449449777603149,
"confidence": 0.9449448585510254,
"cells": [
{
"index": 93,
@@ -5528,7 +5528,7 @@
"b": 710.989597,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9497623443603516,
"confidence": 0.9497622847557068,
"cells": [
{
"index": 95,
@@ -5593,7 +5593,7 @@
"b": 740.290298,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9368569850921631,
"confidence": 0.9368568658828735,
"cells": [
{
"index": 97,
@@ -6624,7 +6624,7 @@
"b": 519.65363,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9695364832878113,
"confidence": 0.9695363640785217,
"cells": [
{
"index": 34,
@@ -7312,7 +7312,7 @@
"b": 142.65363000000002,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9263732433319092,
"confidence": 0.9263731241226196,
"cells": [
{
"index": 59,
@@ -7854,7 +7854,7 @@
"b": 382.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9253151416778564,
"confidence": 0.9253152012825012,
"cells": [
{
"index": 79,
@@ -7900,7 +7900,7 @@
"b": 409.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9676452875137329,
"confidence": 0.9676451683044434,
"cells": [
{
"index": 80,
@@ -7966,12 +7966,12 @@
"label": "picture",
"bbox": {
"l": 320.4467468261719,
"t": 421.6407165527344,
"t": 421.640625,
"r": 558.8576049804688,
"b": 692.310791015625,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9881085753440857,
"confidence": 0.9881086945533752,
"cells": [
{
"index": 82,
@@ -8738,7 +8738,7 @@
"b": 713.009598,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9449449777603149,
"confidence": 0.9449448585510254,
"cells": [
{
"index": 93,
@@ -8809,7 +8809,7 @@
"b": 710.989597,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9497623443603516,
"confidence": 0.9497622847557068,
"cells": [
{
"index": 95,
@@ -8880,7 +8880,7 @@
"b": 740.290298,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9368569850921631,
"confidence": 0.9368568658828735,
"cells": [
{
"index": 97,
@@ -9904,7 +9904,7 @@
"b": 519.65363,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9695364832878113,
"confidence": 0.9695363640785217,
"cells": [
{
"index": 34,
@@ -10592,7 +10592,7 @@
"b": 142.65363000000002,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9263732433319092,
"confidence": 0.9263731241226196,
"cells": [
{
"index": 59,
@@ -11134,7 +11134,7 @@
"b": 382.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9253151416778564,
"confidence": 0.9253152012825012,
"cells": [
{
"index": 79,
@@ -11180,7 +11180,7 @@
"b": 409.15362999999996,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9676452875137329,
"confidence": 0.9676451683044434,
"cells": [
{
"index": 80,
@@ -11246,12 +11246,12 @@
"label": "picture",
"bbox": {
"l": 320.4467468261719,
"t": 421.6407165527344,
"t": 421.640625,
"r": 558.8576049804688,
"b": 692.310791015625,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9881085753440857,
"confidence": 0.9881086945533752,
"cells": [
{
"index": 82,
@@ -12018,7 +12018,7 @@
"b": 713.009598,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9449449777603149,
"confidence": 0.9449448585510254,
"cells": [
{
"index": 93,
@@ -12089,7 +12089,7 @@
"b": 710.989597,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9497623443603516,
"confidence": 0.9497622847557068,
"cells": [
{
"index": 95,
@@ -12162,7 +12162,7 @@
"b": 740.290298,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9368569850921631,
"confidence": 0.9368568658828735,
"cells": [
{
"index": 97,