From 1d4517ffb4afcab9f8a755ee2ac9d7804c3ed798 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos
Date: Thu, 3 Oct 2024 14:23:07 +0200
Subject: [PATCH] fix(TesserOcrModel): Refactor code to catch exception in case of import error

Import tesserocr lazily inside TesserOcrModel.__init__, and only when the
model is enabled, instead of at module import time. If the package is
missing, raise an ImportError whose message names the package to install.

Because the module-level `from tesserocr import OEM, PSM, RIL, PyTessBaseAPI`
is removed, all tesserocr symbols are now reached through the locally
imported module. The RIL enum class is kept on the instance as
`self.reader_RIL` so that __call__ can pass `self.reader_RIL.TEXTLINE` to
GetComponentImages without needing the module in scope.

Signed-off-by: Nikos Livathinos
---
 docling/models/tesserocr_model.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py
index 97078184..9581589b 100644
--- a/docling/models/tesserocr_model.py
+++ b/docling/models/tesserocr_model.py
@@ -2,8 +2,6 @@ import logging
 from typing import Iterable
 
 import numpy
-import tesserocr
-from tesserocr import OEM, PSM, RIL, PyTessBaseAPI
 
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractOcrOptions
@@ -21,12 +19,22 @@ class TesserOcrModel(BaseOcrModel):
         self.reader = None
 
         if self.enabled:
+            try:
+                import tesserocr
+            except ImportError:
+                msg = (
+                    "TesserOCR is not installed. "
+                    "Please install it via `pip install tesserocr` to use this OCR engine."
+                )
+                raise ImportError(msg)
+
             # Initialize the tesseractAPI
             lang = "+".join(self.options.lang)
             _log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
-            self.reader = PyTessBaseAPI(
-                lang=lang, psm=PSM.AUTO, init=True, oem=OEM.DEFAULT
+            self.reader = tesserocr.PyTessBaseAPI(
+                lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
             )
+            self.reader_RIL = tesserocr.RIL
 
     def __del__(self):
         if self.reader is not None:
@@ -51,7 +59,7 @@ class TesserOcrModel(BaseOcrModel):
 
                 # Retrieve text snippets with their bounding boxes
                 self.reader.SetImage(high_res_image)
-                boxes = self.reader.GetComponentImages(RIL.TEXTLINE, True)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
 
                 cells = []
                 for ix, (im, box, _, _) in enumerate(boxes):