From 1181338737ca9b98ca3eb02cab6d7ee3ed0f02f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cle=CC=81ment=20Doumouro?= <clement.doumouro@gmail.com>
Date: Wed, 9 Apr 2025 11:31:44 +0200
Subject: [PATCH] fix(ocr): avoid swallowing tesseract errors causing
 orientation detection failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
---
 docling/models/tesseract_ocr_cli_model.py | 26 ++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 11e61bf2..5f222399 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -150,9 +151,8 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd = [self.options.tesseract_cmd]
         cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
@@ -231,14 +231,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                             ) as image_file:
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df_osd = self._perform_osd(fname)
+                            try:
+                                df_osd = self._perform_osd(fname)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed for: %s with error:\n %s",
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                             doc_orientation = _parse_orientation(df_osd)
                             if doc_orientation != 0:
                                 high_res_image = high_res_image.rotate(
                                     -doc_orientation, expand=True
                                 )
                                 high_res_image.save(fname)
-                            df_result = self._run_tesseract(fname, df_osd)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed for: %s with error:\n %s",
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)