From 1f3f4be3f03c515cac490d1c5355931755b3c893 Mon Sep 17 00:00:00 2001 From: Pavel Denisov Date: Thu, 23 Jan 2025 11:11:34 +0100 Subject: [PATCH] Fix script models prefix for Linux Signed-off-by: Pavel Denisov --- docling/models/tesseract_ocr_model.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index d31c9797..6a1b60ee 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -57,6 +57,11 @@ class TesseractOcrModel(BaseOcrModel): self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} + if any([l.startswith("script/") for l in tesserocr_languages]): + self.script_prefix = "script/" + else: + self.script_prefix = "" + tesserocr_kwargs = { "psm": tesserocr.PSM.AUTO, "init": True, @@ -138,7 +143,7 @@ class TesseractOcrModel(BaseOcrModel): if script not in self.script_readers: self.script_readers[script] = tesserocr.PyTessBaseAPI( path=self.reader.GetDatapath(), - lang=f"script/{script}", + lang=f"{self.script_prefix}{script}", psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT,