feat(OCR): Introduce support for the language path in the pipelines of both Tesseract OCR engines.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-08 14:24:13 +02:00
parent 118afee1f3
commit 074acd703c
3 changed files with 24 additions and 6 deletions

View File

@@ -40,6 +40,7 @@ class TesseractOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract" tesseract_cmd: str = "tesseract"
path: Optional[str] = None
model_config = ConfigDict( model_config = ConfigDict(
extra="forbid", extra="forbid",
@@ -49,6 +50,7 @@ class TesseractOcrOptions(OcrOptions):
class TesserOcrOptions(OcrOptions): class TesserOcrOptions(OcrOptions):
kind: Literal["tesserocr"] = "tesserocr" kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
model_config = ConfigDict( model_config = ConfigDict(
extra="forbid", extra="forbid",

View File

@@ -67,12 +67,16 @@ class TesseractOcrModel(BaseOcrModel):
return name, version return name, version
def _run_tesseract(self, ifilename, languages=None): def _run_tesseract(self, ifilename: str):
cmd = [self.options.tesseract_cmd] cmd = [self.options.tesseract_cmd]
if languages: if self.options.lang is not None and len(self.options.lang) > 0:
cmd += ["-l", "+".join(languages)] cmd.append("-l")
cmd.append("+".join(self.options.lang))
if self.options.path is not None:
cmd.append("--tessdata-dir")
cmd.append(self.options.path)
cmd += [ifilename, "stdout", "tsv"] cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd))) _log.info("command: {}".format(" ".join(cmd)))

View File

@@ -39,8 +39,20 @@ class TesserOcrModel(BaseOcrModel):
# Initialize the tesseractAPI # Initialize the tesseractAPI
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
if self.options.path is not None:
self.reader = tesserocr.PyTessBaseAPI( self.reader = tesserocr.PyTessBaseAPI(
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT path=self.options.path,
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)
else:
self.reader = tesserocr.PyTessBaseAPI(
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
) )
self.reader_RIL = tesserocr.RIL self.reader_RIL = tesserocr.RIL