Add "auto" language for TesseractOcr

Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
This commit is contained in:
Pavel Denisov 2025-01-16 10:31:36 +01:00
parent 57fc28d3d8
commit 858f93a6d5
2 changed files with 61 additions and 14 deletions

View File

@ -20,6 +20,7 @@ class TesseractOcrModel(BaseOcrModel):
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None self.reader = None
self.script_readers = None
if self.enabled: if self.enabled:
install_errmsg = ( install_errmsg = (
@ -54,21 +55,33 @@ class TesseractOcrModel(BaseOcrModel):
# Initialize the tesseractAPI # Initialize the tesseractAPI
_log.debug("Initializing TesserOCR: %s", tesseract_version) _log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
tesserocr_kwargs = {
"psm": tesserocr.PSM.AUTO,
"init": True,
"oem": tesserocr.OEM.DEFAULT,
}
if self.options.path is not None: if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI( self.reader = tesserocr.PyTessBaseAPI(
path=self.options.path, **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
) )
self.script_readers = {}
scripts = [l for l in tesserocr_languages if l.startswith("script")]
for script in scripts:
self.script_readers[script] = tesserocr.PyTessBaseAPI(
**{"lang": script} | tesserocr_kwargs,
)
else: else:
self.reader = tesserocr.PyTessBaseAPI( self.reader = tesserocr.PyTessBaseAPI(
lang=lang, **{"lang": lang} | tesserocr_kwargs,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
) )
self.reader_RIL = tesserocr.RIL self.reader_RIL = tesserocr.RIL
def __del__(self): def __del__(self):
@ -106,20 +119,51 @@ class TesseractOcrModel(BaseOcrModel):
# Retrieve text snippets with their bounding boxes # Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image) self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(
if self.script_readers is not None:
osd = self.reader.DetectOrientationScript()
# No text, probably
if osd is None:
continue
script = osd["script_name"]
if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"
if f"script/{script}" in self.script_readers:
_log.debug(
f'Using model for the detected script "{script}"'
)
local_reader = self.script_readers[f"script/{script}"]
local_reader.SetImage(high_res_image)
else:
_log.warning(
f'No model for the detected script "{script}"'
)
continue
else:
local_reader = self.reader
boxes = local_reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True self.reader_RIL.TEXTLINE, True
) )
cells = [] cells = []
for ix, (im, box, _, _) in enumerate(boxes): for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin # Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle( local_reader.SetRectangle(
box["x"], box["y"], box["w"], box["h"] box["x"], box["y"], box["w"], box["h"]
) )
# Extract text within the bounding box # Extract text within the bounding box
text = self.reader.GetUTF8Text().strip() text = local_reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf() confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale left = box["x"] / self.scale
bottom = box["y"] / self.scale bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale right = (box["x"] + box["w"]) / self.scale

View File

@ -60,6 +60,7 @@ def test_e2e_conversions():
RapidOcrOptions(), RapidOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True), EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True), TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True), TesseractCliOcrOptions(force_full_page_ocr=True),
RapidOcrOptions(force_full_page_ocr=True), RapidOcrOptions(force_full_page_ocr=True),
] ]
@ -70,7 +71,9 @@ def test_e2e_conversions():
engines.append(OcrMacOptions(force_full_page_ocr=True)) engines.append(OcrMacOptions(force_full_page_ocr=True))
for ocr_options in engines: for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}") print(
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
)
converter = get_converter(ocr_options=ocr_options) converter = get_converter(ocr_options=ocr_options)
for pdf_path in pdf_paths: for pdf_path in pdf_paths:
print(f"converting {pdf_path}") print(f"converting {pdf_path}")