From 4c2552efc51dd91468a66071e8a079746183eb67 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 24 Jan 2025 11:20:21 +0100 Subject: [PATCH] feat: Introduce automatic language detection in tesseract_ocr_cli model. Extend unit tests. Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_cli_model.py | 64 +++++++++++++++++++++-- docling/models/tesseract_ocr_model.py | 10 +--- docling/utils/ocr_utils.py | 9 ++++ tests/test_e2e_ocr_conversion.py | 1 + 4 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 docling/utils/ocr_utils.py diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 16e1629d..86349ddb 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -4,7 +4,7 @@ import logging import os import tempfile from subprocess import DEVNULL, PIPE, Popen -from typing import Iterable, Optional, Tuple +from typing import Iterable, List, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin @@ -14,6 +14,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.ocr_utils import map_tesseract_script from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -29,6 +30,7 @@ class TesseractOcrCliModel(BaseOcrModel): self._name: Optional[str] = None self._version: Optional[str] = None + self._script_prefix = None if self.enabled: try: @@ -74,12 +76,20 @@ class TesseractOcrCliModel(BaseOcrModel): return name, version def _run_tesseract(self, ifilename: str): - + r""" + Run tesseract CLI + """ cmd = [self.options.tesseract_cmd] - if self.options.lang is not None and len(self.options.lang) > 0: + if "auto" in self.options.lang: + lang = self._detect_language(ifilename) + if lang is not None: + cmd.append("-l") + cmd.append(lang) + elif self.options.lang is not None and len(self.options.lang) > 0: cmd.append("-l") cmd.append("+".join(self.options.lang)) + if self.options.path is not None: cmd.append("--tessdata-dir") cmd.append(self.options.path) @@ -107,6 +117,54 @@ class TesseractOcrCliModel(BaseOcrModel): return df_filtered + def _detect_language(self, ifilename: str): + r""" + Run tesseract in PSM 0 mode to detect the document language + """ + cmd = [self.options.tesseract_cmd] + cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"]) + _log.info("command: {}".format(" ".join(cmd))) + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + output, _ = proc.communicate() + decoded_data = output.decode("utf-8") + df = pd.read_csv( + io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] + ) + scripts = df.loc[df["key"] == "Script"].value.tolist() + if len(scripts) == 0: + _log.warning("Tesseract cannot detect the script of the page") + return None + script = map_tesseract_script(scripts[0].strip()) + + # Translate the script into language + script_prefix = self._get_script_prefix() + lang = f"{script_prefix}{script}" + _log.debug( + f'Using tesseract model for the detected script "{script}" and language "{lang}"' + ) + + return lang + + def _get_script_prefix(self) -> str: + if self._script_prefix is not None: + return self._script_prefix + + # Get all languages + cmd = [self.options.tesseract_cmd] + cmd.append("--list-langs") + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + output, _ = proc.communicate() + decoded_data = output.decode("utf-8") + df = pd.read_csv(io.StringIO(decoded_data), header=None) + all_languages = df[0].tolist()[1:] + + # Decide the script prefix + if any([l.startswith("script/") for l in all_languages]): + script_prefix = "script/" + else: + script_prefix = "" + return script_prefix + def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 6a1b60ee..9eb3d829 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.ocr_utils import map_tesseract_script from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -128,14 +129,7 @@ class TesseractOcrModel(BaseOcrModel): continue script = osd["script_name"] - - if script == "Katakana" or script == "Hiragana": - script = "Japanese" - elif script == "Han": - script = "HanS" - elif script == "Korean": - script = "Hangul" - + script = map_tesseract_script(script) _log.debug( f'Using model for the detected script "{script}"' ) diff --git a/docling/utils/ocr_utils.py b/docling/utils/ocr_utils.py new file mode 100644 index 00000000..59503f1f --- /dev/null +++ b/docling/utils/ocr_utils.py @@ -0,0 +1,9 @@ +def map_tesseract_script(script: str) -> str: + r""" """ + if script == "Katakana" or script == "Hiragana": + script = "Japanese" + elif script == "Han": + script = "HanS" + elif script == "Korean": + script = "Hangul" + return script diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index b3cdd312..4a542d21 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -62,6 +62,7 @@ def test_e2e_conversions(): TesseractOcrOptions(force_full_page_ocr=True), TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), TesseractCliOcrOptions(force_full_page_ocr=True), + TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), RapidOcrOptions(force_full_page_ocr=True), ]