From 0d12ad1dcc1f55991ee0adee92b2165bf88cf669 Mon Sep 17 00:00:00 2001 From: Gaspard Petit Date: Mon, 25 Nov 2024 09:19:53 -0500 Subject: [PATCH] fix: PermissionError when using tesseract_ocr_cli_model Make sure that the `tesseract_ocr_cli_model.py` does not open the PNG image file twice (`tempfile.NamedTemporaryFile` + `high_res_image.save`), and ensure that `_run_tesseract` is executed once the file is no longer open by Python. Otherwise, this results in a "PermissionError: [Errno 13] Permission denied" error on Windows. Signed-off-by: Gaspard Petit --- docling/models/tesseract_ocr_cli_model.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 9a50eee0..b689cf90 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -1,4 +1,5 @@ import io +import os import logging import tempfile from subprocess import DEVNULL, PIPE, Popen @@ -130,14 +131,17 @@ class TesseractOcrCliModel(BaseOcrModel): high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) - - with tempfile.NamedTemporaryFile( - suffix=".png", mode="w" - ) as image_file: - fname = image_file.name - high_res_image.save(fname) + try: + with tempfile.NamedTemporaryFile( + suffix=".png", mode="w+b", delete=False + ) as image_file: + fname = image_file.name + high_res_image.save(image_file) df = self._run_tesseract(fname) + finally: + if os.path.exists(fname): + os.remove(fname) # _log.info(df)