mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fix(ocr): avoid swallowing tesseract errors causing orientation detection failures
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
parent
c1ac22947f
commit
1181338737
@ -2,6 +2,7 @@ import csv
|
|||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -150,9 +151,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
output = subprocess.run(cmd, capture_output=True, check=True)
|
||||||
output, _ = proc.communicate()
|
decoded_data = output.stdout.decode("utf-8")
|
||||||
decoded_data = output.decode("utf-8")
|
|
||||||
df_detected = pd.read_csv(
|
df_detected = pd.read_csv(
|
||||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||||
)
|
)
|
||||||
@ -231,14 +231,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
) as image_file:
|
) as image_file:
|
||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(image_file)
|
high_res_image.save(image_file)
|
||||||
df_osd = self._perform_osd(fname)
|
try:
|
||||||
|
df_osd = self._perform_osd(fname)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
_log.error(
|
||||||
|
"OSD failed for: %s with error:\n %s",
|
||||||
|
image_file,
|
||||||
|
exc.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
doc_orientation = _parse_orientation(df_osd)
|
doc_orientation = _parse_orientation(df_osd)
|
||||||
if doc_orientation != 0:
|
if doc_orientation != 0:
|
||||||
high_res_image = high_res_image.rotate(
|
high_res_image = high_res_image.rotate(
|
||||||
-doc_orientation, expand=True
|
-doc_orientation, expand=True
|
||||||
)
|
)
|
||||||
high_res_image.save(fname)
|
high_res_image.save(fname)
|
||||||
df_result = self._run_tesseract(fname, df_osd)
|
try:
|
||||||
|
df_result = self._run_tesseract(fname, df_osd)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
_log.error(
|
||||||
|
"tesseract OCR failed for: %s with error:\n %s",
|
||||||
|
image_file,
|
||||||
|
exc.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(fname):
|
if os.path.exists(fname):
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
|
Loading…
Reference in New Issue
Block a user