From 4c741b53fa5df9509fc125afd91a1c8fc94730d5 Mon Sep 17 00:00:00 2001 From: Guilhem VERMOREL <83694424+guilhemvermorel@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:53:49 +0200 Subject: [PATCH] fix: Tesseract OCR CLI can't process images composed with numbers only (#1201) fix wrong type of text extracted by tesseract_ocr_cli_model Signed-off-by: gvl4 Co-authored-by: gvl4 Signed-off-by: Benichou --- docling/models/tesseract_ocr_cli_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 56968a2e..1e7fe039 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -247,7 +247,7 @@ class TesseractOcrCliModel(BaseOcrModel): cell = TextCell( index=ix, - text=text, + text=str(text), orig=text, from_ocr=True, confidence=conf / 100.0,