From 074acd703cacae3c7c62a5b3b5e8cd920bee900d Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 8 Oct 2024 14:24:13 +0200 Subject: [PATCH] feat(OCR): Introduce support for the language `path` in the pipelines of both Tesseract OCR engines. Signed-off-by: Nikos Livathinos --- docling/datamodel/pipeline_options.py | 2 ++ docling/models/tesseract_model.py | 10 +++++++--- docling/models/tesserocr_model.py | 18 +++++++++++++++--- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index b4bb9977..c3c81c3e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -40,6 +40,7 @@ class TesseractOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" lang: List[str] = ["fra", "deu", "spa", "eng"] tesseract_cmd: str = "tesseract" + path: Optional[str] = None model_config = ConfigDict( extra="forbid", @@ -49,6 +50,7 @@ class TesseractOcrOptions(OcrOptions): class TesserOcrOptions(OcrOptions): kind: Literal["tesserocr"] = "tesserocr" lang: List[str] = ["fra", "deu", "spa", "eng"] + path: Optional[str] = None model_config = ConfigDict( extra="forbid", diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index fde8d770..94da2779 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -67,12 +67,16 @@ class TesseractOcrModel(BaseOcrModel): return name, version - def _run_tesseract(self, ifilename, languages=None): + def _run_tesseract(self, ifilename: str): cmd = [self.options.tesseract_cmd] - if languages: - cmd += ["-l", "+".join(languages)] + if self.options.lang is not None and len(self.options.lang) > 0: + cmd.append("-l") + cmd.append("+".join(self.options.lang)) + if self.options.path is not None: + cmd.append("--tessdata-dir") + cmd.append(self.options.path) cmd += [ifilename, "stdout", "tsv"] _log.info("command: {}".format(" ".join(cmd))) diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py index da320870..f748abb3 100644 --- a/docling/models/tesserocr_model.py +++ b/docling/models/tesserocr_model.py @@ -39,9 +39,21 @@ class TesserOcrModel(BaseOcrModel): # Initialize the tesseractAPI lang = "+".join(self.options.lang) - self.reader = tesserocr.PyTessBaseAPI( - lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT - ) + if self.options.path is not None: + self.reader = tesserocr.PyTessBaseAPI( + path=self.options.path, + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + else: + self.reader = tesserocr.PyTessBaseAPI( + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) self.reader_RIL = tesserocr.RIL def __del__(self):