From 89c9ca3823b2a31ba9f574b72f5177be44519970 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 8 Nov 2024 11:28:20 +0100 Subject: [PATCH] fix(TesseractOcrModel): Use different error messages when tesserocr is not properly installed and when the TESSDATA_PREFIX envvar is not properly configured. Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_model.py | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 88a5ffc5..83f23837 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -22,28 +22,37 @@ class TesseractOcrModel(BaseOcrModel): self.reader = None if self.enabled: - setup_errmsg = ( + install_errmsg = ( "tesserocr is not correctly installed. " "Please install it via `pip install tesserocr` to use this OCR engine. " - "Note that tesserocr might have to be manually compiled for working with" + "Note that tesserocr might have to be manually compiled for working with " "your Tesseract installation. The Docling documentation provides examples for it. " - "Alternatively, Docling has support for other OCR engines. See the documentation." + "Alternatively, Docling has support for other OCR engines. See the documentation: " + "https://ds4sd.github.io/docling/installation/" ) + missing_langs_errmsg = ( + "tesserocr is not correctly configured. No language models have been detected. " + "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. " + "You can find more information how to setup other OCR engines in Docling " + "documentation: " + "https://ds4sd.github.io/docling/installation/" + ) + try: import tesserocr except ImportError: - raise ImportError(setup_errmsg) - + raise ImportError(install_errmsg) try: tesseract_version = tesserocr.tesseract_version() - _, tesserocr_languages = tesserocr.get_languages() - if not tesserocr_languages: - raise ImportError(setup_errmsg) - _log.debug("Initializing TesserOCR: %s", tesseract_version) except: - raise ImportError(setup_errmsg) + raise ImportError(install_errmsg) + + _, tesserocr_languages = tesserocr.get_languages() + if not tesserocr_languages: + raise ImportError(missing_langs_errmsg) # Initialize the tesseractAPI + _log.debug("Initializing TesserOCR: %s", tesseract_version) lang = "+".join(self.options.lang) if self.options.path is not None: self.reader = tesserocr.PyTessBaseAPI(