fix: remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests (#138)

* feat(OCR tests): Introduce fuzziness in the text validation of OCR tests

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix(TesseractOcrCliModel): Send stderr to devnull to avoid polluting the console with messages from the tesseract command

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2024-10-11 10:21:19 +02:00
committed by GitHub
parent 5f1bd9e9c8
commit dae2a3b667
3 changed files with 50 additions and 17 deletions

View File

@@ -1,7 +1,7 @@
import io
import logging
import tempfile
from subprocess import PIPE, Popen
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Tuple
import pandas as pd
@@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE)
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
# _log.info(output)