mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: add "auto" language for TesseractOcr (#759)
* Add "auto" language for TesseractOcr
  Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
* Add tesseract-ocr-script-latn installation for the "auto" language
  Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
* Modify "auto" language in TesseractOcr to initialize the script readers lazily
  Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
* Finalize script readers
  Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
* Fix script models prefix for Linux
  Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
---------
Signed-off-by: Pavel Denisov <pavel.denisov@iais.fraunhofer.de>
This commit is contained in:
@@ -60,6 +60,7 @@ def test_e2e_conversions():
|
||||
RapidOcrOptions(),
|
||||
EasyOcrOptions(force_full_page_ocr=True),
|
||||
TesseractOcrOptions(force_full_page_ocr=True),
|
||||
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
|
||||
TesseractCliOcrOptions(force_full_page_ocr=True),
|
||||
RapidOcrOptions(force_full_page_ocr=True),
|
||||
]
|
||||
@@ -70,7 +71,9 @@ def test_e2e_conversions():
|
||||
engines.append(OcrMacOptions(force_full_page_ocr=True))
|
||||
|
||||
for ocr_options in engines:
|
||||
print(f"Converting with ocr_engine: {ocr_options.kind}")
|
||||
print(
|
||||
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
|
||||
)
|
||||
converter = get_converter(ocr_options=ocr_options)
|
||||
for pdf_path in pdf_paths:
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
Reference in New Issue
Block a user