mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
feat(ocr): auto-detect rotated pages in Tesseract (#1167)
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid swallowing tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
@@ -101,7 +101,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Summer activities",
|
||||
"text": "Summer activities"
|
||||
"text": "Summer activities",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
@@ -138,7 +144,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Duck",
|
||||
"text": "Duck"
|
||||
"text": "Duck",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
@@ -150,7 +162,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Figure 1: This is a cute duckling",
|
||||
"text": "Figure 1: This is a cute duckling"
|
||||
"text": "Figure 1: This is a cute duckling",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
@@ -180,8 +198,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Let\u2019s swim!",
|
||||
"text": "Let\u2019s swim!",
|
||||
"orig": "Let’s swim!",
|
||||
"text": "Let’s swim!",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
@@ -194,7 +212,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||
"text": "To get started with swimming, first lay down in a water and try not to drown:"
|
||||
"text": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
@@ -207,6 +231,12 @@
|
||||
"prov": [],
|
||||
"orig": "You can relax and look around",
|
||||
"text": "You can relax and look around",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -221,6 +251,12 @@
|
||||
"prov": [],
|
||||
"orig": "Paddle about",
|
||||
"text": "Paddle about",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -235,6 +271,12 @@
|
||||
"prov": [],
|
||||
"orig": "Enjoy summer warmth",
|
||||
"text": "Enjoy summer warmth",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -247,8 +289,14 @@
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Also, don\u2019t forget:",
|
||||
"text": "Also, don\u2019t forget:"
|
||||
"orig": "Also, don’t forget:",
|
||||
"text": "Also, don’t forget:",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
@@ -261,6 +309,12 @@
|
||||
"prov": [],
|
||||
"orig": "Wear sunglasses",
|
||||
"text": "Wear sunglasses",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -273,8 +327,14 @@
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Don\u2019t forget to drink water",
|
||||
"text": "Don\u2019t forget to drink water",
|
||||
"orig": "Don’t forget to drink water",
|
||||
"text": "Don’t forget to drink water",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -289,6 +349,12 @@
|
||||
"prov": [],
|
||||
"orig": "Use sun cream",
|
||||
"text": "Use sun cream",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -301,8 +367,14 @@
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Hmm, what else\u2026",
|
||||
"text": "Hmm, what else\u2026"
|
||||
"orig": "Hmm, what else…",
|
||||
"text": "Hmm, what else…",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
@@ -335,8 +407,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Let\u2019s eat",
|
||||
"text": "Let\u2019s eat",
|
||||
"orig": "Let’s eat",
|
||||
"text": "Let’s eat",
|
||||
"level": 2
|
||||
},
|
||||
{
|
||||
@@ -348,8 +420,14 @@
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice",
|
||||
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
|
||||
"orig": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||
"text": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
@@ -361,7 +439,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "I like to eat leaves",
|
||||
"text": "I like to eat leaves"
|
||||
"text": "I like to eat leaves",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
@@ -373,7 +457,13 @@
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Here are some interesting things a respectful duck could eat:",
|
||||
"text": "Here are some interesting things a respectful duck could eat:"
|
||||
"text": "Here are some interesting things a respectful duck could eat:",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
@@ -396,8 +486,14 @@
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "And let\u2019s add another list in the end:",
|
||||
"text": "And let\u2019s add another list in the end:"
|
||||
"orig": "And let’s add another list in the end:",
|
||||
"text": "And let’s add another list in the end:",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
@@ -410,6 +506,12 @@
|
||||
"prov": [],
|
||||
"orig": "Leaves",
|
||||
"text": "Leaves",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -424,6 +526,12 @@
|
||||
"prov": [],
|
||||
"orig": "Berries",
|
||||
"text": "Berries",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@@ -438,6 +546,12 @@
|
||||
"prov": [],
|
||||
"orig": "Grain",
|
||||
"text": "Grain",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user