mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fix(TesserOcrModel): Refactor code to catch exception in case of import error
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
81d176cd3d
commit
1d4517ffb4
@ -2,8 +2,6 @@ import logging
|
|||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import tesserocr
|
|
||||||
from tesserocr import OEM, PSM, RIL, PyTessBaseAPI
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
@ -21,12 +19,22 @@ class TesserOcrModel(BaseOcrModel):
|
|||||||
self.reader = None
|
self.reader = None
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
|
try:
|
||||||
|
import tesserocr
|
||||||
|
except ImportError:
|
||||||
|
msg = (
|
||||||
|
"TesserOCR is not installed. "
|
||||||
|
"Please install it via `pip install tesserocr` to use this OCR engine."
|
||||||
|
)
|
||||||
|
raise ImportError(msg)
|
||||||
|
|
||||||
# Initialize the tesseractAPI
|
# Initialize the tesseractAPI
|
||||||
lang = "+".join(self.options.lang)
|
lang = "+".join(self.options.lang)
|
||||||
_log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
|
_log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
|
||||||
self.reader = PyTessBaseAPI(
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
lang=lang, psm=PSM.AUTO, init=True, oem=OEM.DEFAULT
|
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
|
||||||
)
|
)
|
||||||
|
self.reader_RIL = tesserocr.RIL
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
if self.reader is not None:
|
if self.reader is not None:
|
||||||
@ -51,7 +59,7 @@ class TesserOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
# Retrieve text snippets with their bounding boxes
|
# Retrieve text snippets with their bounding boxes
|
||||||
self.reader.SetImage(high_res_image)
|
self.reader.SetImage(high_res_image)
|
||||||
boxes = self.reader.GetComponentImages(RIL.TEXTLINE, True)
|
boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
|
||||||
|
|
||||||
cells = []
|
cells = []
|
||||||
for ix, (im, box, _, _) in enumerate(boxes):
|
for ix, (im, box, _, _) in enumerate(boxes):
|
||||||
|
Loading…
Reference in New Issue
Block a user