feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid swallowing tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
2025-12-08 20:58:11 +00:00 · 2025-05-21 18:12:33 +02:00
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -1,6 +1,6 @@
 import sys
 from pathlib import Path
-from typing import List
+from typing import List, Tuple

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -56,33 +56,35 @@ def get_converter(ocr_options: OcrOptions):
 def test_e2e_conversions():
    pdf_paths = get_pdf_paths()

-    engines: List[OcrOptions] = [
-        EasyOcrOptions(),
-        TesseractOcrOptions(),
-        TesseractCliOcrOptions(),
-        EasyOcrOptions(force_full_page_ocr=True),
-        TesseractOcrOptions(force_full_page_ocr=True),
-        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
-        TesseractCliOcrOptions(force_full_page_ocr=True),
-        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+    engines: List[Tuple[OcrOptions, bool]] = [
+        (EasyOcrOptions(), False),
+        (TesseractOcrOptions(), True),
+        (TesseractCliOcrOptions(), True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
+        (TesseractOcrOptions(force_full_page_ocr=True), True),
+        (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
+        (TesseractCliOcrOptions(force_full_page_ocr=True), True),
+        (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
    ]

    # rapidocr is only available for Python >=3.6,<3.13
    if sys.version_info < (3, 13):
-        engines.append(RapidOcrOptions())
-        engines.append(RapidOcrOptions(force_full_page_ocr=True))
+        engines.append((RapidOcrOptions(), False))
+        engines.append((RapidOcrOptions(force_full_page_ocr=True), False))

    # only works on mac
    if "darwin" == sys.platform:
-        engines.append(OcrMacOptions())
-        engines.append(OcrMacOptions(force_full_page_ocr=True))
+        engines.append((OcrMacOptions(), True))
+        engines.append((OcrMacOptions(force_full_page_ocr=True), True))

-    for ocr_options in engines:
+    for ocr_options, supports_rotation in engines:
        print(
            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        )
        converter = get_converter(ocr_options=ocr_options)
        for pdf_path in pdf_paths:
+            if not supports_rotation and "rotated" in pdf_path.name:
+                continue
            print(f"converting {pdf_path}")

            doc_result: ConversionResult = converter.convert(pdf_path)