feat: Introduce automatic language detection in tesseract_ocr_cli model. Extend unit tests.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2025-01-24 11:20:21 +01:00
parent 8543c22687
commit 4c2552efc5
4 changed files with 73 additions and 11 deletions

View File

@ -4,7 +4,7 @@ import logging
import os import os
import tempfile import tempfile
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Optional, Tuple from typing import Iterable, List, Optional, Tuple
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -14,6 +14,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -29,6 +30,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._name: Optional[str] = None self._name: Optional[str] = None
self._version: Optional[str] = None self._version: Optional[str] = None
self._script_prefix = None
if self.enabled: if self.enabled:
try: try:
@ -74,12 +76,20 @@ class TesseractOcrCliModel(BaseOcrModel):
return name, version return name, version
def _run_tesseract(self, ifilename: str): def _run_tesseract(self, ifilename: str):
r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd] cmd = [self.options.tesseract_cmd]
if self.options.lang is not None and len(self.options.lang) > 0: if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
elif self.options.lang is not None and len(self.options.lang) > 0:
cmd.append("-l") cmd.append("-l")
cmd.append("+".join(self.options.lang)) cmd.append("+".join(self.options.lang))
if self.options.path is not None: if self.options.path is not None:
cmd.append("--tessdata-dir") cmd.append("--tessdata-dir")
cmd.append(self.options.path) cmd.append(self.options.path)
@ -107,6 +117,54 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered return df_filtered
def _detect_language(self, ifilename: str):
r"""
Run tesseract in PSM 0 mode to detect the document language
"""
cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df.loc[df["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
script = map_tesseract_script(scripts[0].strip())
# Translate the script into language
script_prefix = self._get_script_prefix()
lang = f"{script_prefix}{script}"
_log.debug(
f'Using tesseract model for the detected script "{script}" and language "{lang}"'
)
return lang
def _get_script_prefix(self) -> str:
if self._script_prefix is not None:
return self._script_prefix
# Get all languages
cmd = [self.options.tesseract_cmd]
cmd.append("--list-langs")
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None)
all_languages = df[0].tolist()[1:]
# Decide the script prefix
if any([l.startswith("script/") for l in all_languages]):
script_prefix = "script/"
else:
script_prefix = ""
return script_prefix
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:

View File

@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -128,14 +129,7 @@ class TesseractOcrModel(BaseOcrModel):
continue continue
script = osd["script_name"] script = osd["script_name"]
script = map_tesseract_script(script)
if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"
_log.debug( _log.debug(
f'Using model for the detected script "{script}"' f'Using model for the detected script "{script}"'
) )

View File

@ -0,0 +1,9 @@
def map_tesseract_script(script: str) -> str:
r""" """
if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"
return script

View File

@ -62,6 +62,7 @@ def test_e2e_conversions():
TesseractOcrOptions(force_full_page_ocr=True), TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True), TesseractCliOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
RapidOcrOptions(force_full_page_ocr=True), RapidOcrOptions(force_full_page_ocr=True),
] ]