mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 06:08:09 +00:00
feat: Introduce automatic language detection in TesseractOcrCliModel (#800)
* feat: Introduce automatic language detection in tesseract_ocr_cli model. Extend unit tests. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * docs: Add example how to use "auto" language with tesseract OCR engines Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Refactor the TesseractOcrModel and TesseractOcrCliModel to validate if the auto-detected language is installed in the system and if not fall back to a default option without language. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
9
docling/utils/ocr_utils.py
Normal file
9
docling/utils/ocr_utils.py
Normal file
@@ -0,0 +1,9 @@
|
||||
def map_tesseract_script(script: str) -> str:
    r"""Translate a script name into its remapped equivalent.

    The following names are rewritten; every other value is returned
    unchanged:

    * ``"Katakana"`` and ``"Hiragana"`` -> ``"Japanese"``
    * ``"Han"`` -> ``"HanS"``
    * ``"Korean"`` -> ``"Hangul"``

    NOTE(review): presumably the input is a script name reported by
    Tesseract's script detection and the output is the name of the matching
    Tesseract script model -- confirm against the callers.

    Args:
        script: Script name to translate. Matching is exact and
            case-sensitive.

    Returns:
        The remapped name, or ``script`` itself when no remapping applies.
    """
    # Lookup table of the names that need rewriting; anything absent from it
    # falls through untouched.
    remapped_names = {
        "Katakana": "Japanese",
        "Hiragana": "Japanese",
        "Han": "HanS",
        "Korean": "Hangul",
    }
    return remapped_names.get(script, script)
|
||||
Reference in New Issue
Block a user