mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
feat(OCR): Introduce support for a custom language-data (tessdata) path
in the pipelines of both Tesseract OCR engines.
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
118afee1f3
commit
074acd703c
@ -40,6 +40,7 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
tesseract_cmd: str = "tesseract"
|
tesseract_cmd: str = "tesseract"
|
||||||
|
path: Optional[str] = None
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
extra="forbid",
|
extra="forbid",
|
||||||
@ -49,6 +50,7 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
class TesserOcrOptions(OcrOptions):
|
class TesserOcrOptions(OcrOptions):
|
||||||
kind: Literal["tesserocr"] = "tesserocr"
|
kind: Literal["tesserocr"] = "tesserocr"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
path: Optional[str] = None
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
extra="forbid",
|
extra="forbid",
|
||||||
|
@ -67,12 +67,16 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
return name, version
|
return name, version
|
||||||
|
|
||||||
def _run_tesseract(self, ifilename, languages=None):
|
def _run_tesseract(self, ifilename: str):
|
||||||
|
|
||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
|
|
||||||
if languages:
|
if self.options.lang is not None and len(self.options.lang) > 0:
|
||||||
cmd += ["-l", "+".join(languages)]
|
cmd.append("-l")
|
||||||
|
cmd.append("+".join(self.options.lang))
|
||||||
|
if self.options.path is not None:
|
||||||
|
cmd.append("--tessdata-dir")
|
||||||
|
cmd.append(self.options.path)
|
||||||
|
|
||||||
cmd += [ifilename, "stdout", "tsv"]
|
cmd += [ifilename, "stdout", "tsv"]
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
|
@ -39,9 +39,21 @@ class TesserOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
# Initialize the tesseractAPI
|
# Initialize the tesseractAPI
|
||||||
lang = "+".join(self.options.lang)
|
lang = "+".join(self.options.lang)
|
||||||
self.reader = tesserocr.PyTessBaseAPI(
|
if self.options.path is not None:
|
||||||
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
)
|
path=self.options.path,
|
||||||
|
lang=lang,
|
||||||
|
psm=tesserocr.PSM.AUTO,
|
||||||
|
init=True,
|
||||||
|
oem=tesserocr.OEM.DEFAULT,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
|
lang=lang,
|
||||||
|
psm=tesserocr.PSM.AUTO,
|
||||||
|
init=True,
|
||||||
|
oem=tesserocr.OEM.DEFAULT,
|
||||||
|
)
|
||||||
self.reader_RIL = tesserocr.RIL
|
self.reader_RIL = tesserocr.RIL
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user