fix(ocr): stop swallowing tesseract errors that cause orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-04-09 11:31:44 +02:00
parent c1ac22947f
commit 1181338737

View File

@ -2,6 +2,7 @@ import csv
import io import io
import logging import logging
import os import os
import subprocess
import tempfile import tempfile
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
@ -150,9 +151,8 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd = [self.options.tesseract_cmd] cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"]) cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
_log.info("command: {}".format(" ".join(cmd))) _log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) output = subprocess.run(cmd, capture_output=True, check=True)
output, _ = proc.communicate() decoded_data = output.stdout.decode("utf-8")
decoded_data = output.decode("utf-8")
df_detected = pd.read_csv( df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
) )
@ -231,14 +231,30 @@ class TesseractOcrCliModel(BaseOcrModel):
) as image_file: ) as image_file:
fname = image_file.name fname = image_file.name
high_res_image.save(image_file) high_res_image.save(image_file)
df_osd = self._perform_osd(fname) try:
df_osd = self._perform_osd(fname)
except subprocess.CalledProcessError as exc:
_log.error(
"OSD failed for: %s with error:\n %s",
image_file,
exc.stderr,
)
continue
doc_orientation = _parse_orientation(df_osd) doc_orientation = _parse_orientation(df_osd)
if doc_orientation != 0: if doc_orientation != 0:
high_res_image = high_res_image.rotate( high_res_image = high_res_image.rotate(
-doc_orientation, expand=True -doc_orientation, expand=True
) )
high_res_image.save(fname) high_res_image.save(fname)
df_result = self._run_tesseract(fname, df_osd) try:
df_result = self._run_tesseract(fname, df_osd)
except subprocess.CalledProcessError as exc:
_log.error(
"tesseract OCR failed for: %s with error:\n %s",
image_file,
exc.stderr,
)
continue
finally: finally:
if os.path.exists(fname): if os.path.exists(fname):
os.remove(fname) os.remove(fname)