From 306e83e0fe57c5bc197e367897f86634be5dfcc3 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 24 Jan 2025 17:08:01 +0000 Subject: [PATCH] fix: Refactor the TesseractOcrModel and TesseractOcrCliModel to validate if the auto-detected language is installed in the system and if not fall back to a default option without language. Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_cli_model.py | 40 ++++++++++------ docling/models/tesseract_ocr_model.py | 58 +++++++++++++---------- docs/examples/tesseract_lang_detection.py | 4 +- 3 files changed, 60 insertions(+), 42 deletions(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 86349ddb..959264ff 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -30,11 +30,13 @@ class TesseractOcrCliModel(BaseOcrModel): self._name: Optional[str] = None self._version: Optional[str] = None - self._script_prefix = None + self._tesseract_languages: Optional[List[str]] = None + self._script_prefix: Optional[str] = None if self.enabled: try: self._get_name_and_version() + self._set_languages_and_prefix() except Exception as exc: raise RuntimeError( @@ -119,8 +121,10 @@ class TesseractOcrCliModel(BaseOcrModel): def _detect_language(self, ifilename: str): r""" - Run tesseract in PSM 0 mode to detect the document language + Run tesseract in PSM 0 mode to detect the language """ + assert self._tesseract_languages is not None + cmd = [self.options.tesseract_cmd] cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"]) _log.info("command: {}".format(" ".join(cmd))) @@ -134,36 +138,43 @@ class TesseractOcrCliModel(BaseOcrModel): if len(scripts) == 0: _log.warning("Tesseract cannot detect the script of the page") return None + script = map_tesseract_script(scripts[0].strip()) + lang = f"{self._script_prefix}{script}" + + # Check if the detected language has been installed + if lang not in 
self._tesseract_languages: + msg = f"Tesseract detected the script '{script}' and language '{lang}'." + msg += " However this language is not installed in your system and will be ignored." + _log.warning(msg) + return None - # Translate the script into language - script_prefix = self._get_script_prefix() - lang = f"{script_prefix}{script}" _log.debug( - f'Using tesseract model for the detected script "{script}" and language "{lang}"' + f"Using tesseract model for the detected script '{script}' and language '{lang}'" ) - return lang - def _get_script_prefix(self) -> str: - if self._script_prefix is not None: - return self._script_prefix - + def _set_languages_and_prefix(self): + r""" + Read and set the languages installed in tesseract and decide the script prefix + """ # Get all languages cmd = [self.options.tesseract_cmd] cmd.append("--list-langs") + _log.info("command: {}".format(" ".join(cmd))) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) output, _ = proc.communicate() decoded_data = output.decode("utf-8") df = pd.read_csv(io.StringIO(decoded_data), header=None) - all_languages = df[0].tolist()[1:] + self._tesseract_languages = df[0].tolist()[1:] # Decide the script prefix - if any([l.startswith("script/") for l in all_languages]): + if any([l.startswith("script/") for l in self._tesseract_languages]): script_prefix = "script/" else: script_prefix = "" - return script_prefix + + self._script_prefix = script_prefix def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] @@ -179,7 +190,6 @@ class TesseractOcrCliModel(BaseOcrModel): yield page else: with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 9eb3d829..5b70155e 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -21,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel): self.scale = 3 # multiplier for 72 dpi == 
216 dpi. self.reader = None + self.osd_reader = None if self.enabled: install_errmsg = ( @@ -48,8 +49,8 @@ class TesseractOcrModel(BaseOcrModel): except: raise ImportError(install_errmsg) - _, tesserocr_languages = tesserocr.get_languages() - if not tesserocr_languages: + _, self._tesserocr_languages = tesserocr.get_languages() + if not self._tesserocr_languages: raise ImportError(missing_langs_errmsg) # Initialize the tesseractAPI @@ -58,7 +59,7 @@ class TesseractOcrModel(BaseOcrModel): self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} - if any([l.startswith("script/") for l in tesserocr_languages]): + if any([l.startswith("script/") for l in self._tesserocr_languages]): self.script_prefix = "script/" else: self.script_prefix = "" @@ -73,14 +74,14 @@ class TesseractOcrModel(BaseOcrModel): tesserocr_kwargs["path"] = self.options.path if lang == "auto": - self.reader = tesserocr.PyTessBaseAPI( + self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs) + self.osd_reader = tesserocr.PyTessBaseAPI( **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs ) else: self.reader = tesserocr.PyTessBaseAPI( **{"lang": lang} | tesserocr_kwargs, ) - self.reader_RIL = tesserocr.RIL def __del__(self): @@ -97,8 +98,6 @@ class TesseractOcrModel(BaseOcrModel): yield from page_batch return - import tesserocr - for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): @@ -106,6 +105,7 @@ class TesseractOcrModel(BaseOcrModel): else: with TimeRecorder(conv_res, "ocr"): assert self.reader is not None + assert self._tesserocr_languages is not None ocr_rects = self.get_ocr_rects(page) @@ -118,11 +118,12 @@ class TesseractOcrModel(BaseOcrModel): scale=self.scale, cropbox=ocr_rect ) - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) + local_reader = self.reader + if "auto" in self.options.lang: + assert self.osd_reader is not None - if self.options.lang == ["auto"]: - osd = 
self.reader.DetectOrientationScript() + self.osd_reader.SetImage(high_res_image) + osd = self.osd_reader.DetectOrientationScript() # No text, probably if osd is None: @@ -130,24 +131,29 @@ class TesseractOcrModel(BaseOcrModel): script = osd["script_name"] script = map_tesseract_script(script) - _log.debug( - f'Using model for the detected script "{script}"' - ) + lang = f"{self.script_prefix}{script}" - if script not in self.script_readers: - self.script_readers[script] = tesserocr.PyTessBaseAPI( - path=self.reader.GetDatapath(), - lang=f"{self.script_prefix}{script}", - psm=tesserocr.PSM.AUTO, - init=True, - oem=tesserocr.OEM.DEFAULT, - ) + # Check if the detected language is present in the system + if lang not in self._tesserocr_languages: + msg = f"Tesseract detected the script '{script}' and language '{lang}'." + msg += " However this language is not installed in your system and will be ignored." + _log.warning(msg) + else: + if script not in self.script_readers: + import tesserocr - local_reader = self.script_readers[script] - local_reader.SetImage(high_res_image) - else: - local_reader = self.reader + self.script_readers[script] = ( + tesserocr.PyTessBaseAPI( + path=self.reader.GetDatapath(), + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + ) + local_reader = self.script_readers[script] + local_reader.SetImage(high_res_image) boxes = local_reader.GetComponentImages( self.reader_RIL.TEXTLINE, True ) diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py index 856d50ce..b75e4707 100644 --- a/docs/examples/tesseract_lang_detection.py +++ b/docs/examples/tesseract_lang_detection.py @@ -16,7 +16,9 @@ def main(): # ocr_options = TesseractOcrOptions(lang=["auto"]) ocr_options = TesseractCliOcrOptions(lang=["auto"]) - pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options) + pipeline_options = PdfPipelineOptions( + do_ocr=True, force_full_page_ocr=True, 
ocr_options=ocr_options + ) converter = DocumentConverter( format_options={