mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
feat(OCR): Introduce support for the language path
in the pipelines of both Tesseract OCR engines.
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
118afee1f3
commit
074acd703c
@@ -40,6 +40,7 @@ class TesseractOcrOptions(OcrOptions):
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
path: Optional[str] = None
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
@@ -49,6 +50,7 @@ class TesseractOcrOptions(OcrOptions):
|
||||
class TesserOcrOptions(OcrOptions):
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
|
@@ -67,12 +67,16 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
|
||||
return name, version
|
||||
|
||||
def _run_tesseract(self, ifilename, languages=None):
|
||||
def _run_tesseract(self, ifilename: str):
|
||||
|
||||
cmd = [self.options.tesseract_cmd]
|
||||
|
||||
if languages:
|
||||
cmd += ["-l", "+".join(languages)]
|
||||
if self.options.lang is not None and len(self.options.lang) > 0:
|
||||
cmd.append("-l")
|
||||
cmd.append("+".join(self.options.lang))
|
||||
if self.options.path is not None:
|
||||
cmd.append("--tessdata-dir")
|
||||
cmd.append(self.options.path)
|
||||
|
||||
cmd += [ifilename, "stdout", "tsv"]
|
||||
_log.info("command: {}".format(" ".join(cmd)))
|
||||
|
@@ -39,8 +39,20 @@ class TesserOcrModel(BaseOcrModel):
|
||||
|
||||
# Initialize the tesseractAPI
|
||||
lang = "+".join(self.options.lang)
|
||||
if self.options.path is not None:
|
||||
self.reader = tesserocr.PyTessBaseAPI(
|
||||
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
|
||||
path=self.options.path,
|
||||
lang=lang,
|
||||
psm=tesserocr.PSM.AUTO,
|
||||
init=True,
|
||||
oem=tesserocr.OEM.DEFAULT,
|
||||
)
|
||||
else:
|
||||
self.reader = tesserocr.PyTessBaseAPI(
|
||||
lang=lang,
|
||||
psm=tesserocr.PSM.AUTO,
|
||||
init=True,
|
||||
oem=tesserocr.OEM.DEFAULT,
|
||||
)
|
||||
self.reader_RIL = tesserocr.RIL
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user