diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index d694bf58..db01fcec 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -119,13 +119,12 @@ class TesseractOcrCliModel(BaseOcrModel): cmd += [ifilename, "stdout", "tsv"] _log.info("command: {}".format(" ".join(cmd))) - proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) - output, _ = proc.communicate() + output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True) # _log.info(output) # Decode the byte string to a regular string - decoded_data = output.decode("utf-8") + decoded_data = output.stdout.decode("utf-8") # _log.info(decoded_data) # Read the TSV file generated by Tesseract @@ -188,9 +187,8 @@ class TesseractOcrCliModel(BaseOcrModel): cmd = [self.options.tesseract_cmd] cmd.append("--list-langs") _log.info("command: {}".format(" ".join(cmd))) - proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) - output, _ = proc.communicate() - decoded_data = output.decode("utf-8") + output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True) + decoded_data = output.stdout.decode("utf-8") df_list = pd.read_csv(io.StringIO(decoded_data), header=None) self._tesseract_languages = df_list[0].tolist()[1:] @@ -231,16 +229,18 @@ class TesseractOcrCliModel(BaseOcrModel): ) as image_file: fname = image_file.name high_res_image.save(image_file) + doc_orientation = 0 try: df_osd = self._perform_osd(fname) + doc_orientation = _parse_orientation(df_osd) except subprocess.CalledProcessError as exc: + # Here we just log the error and proceed to OCR in the + # hope OCR will succeed while OSD failed _log.error( "OSD failed for: %s with error:\n %s", image_file, exc.stderr, ) - continue - doc_orientation = _parse_orientation(df_osd) if doc_orientation != 0: high_res_image = high_res_image.rotate( -doc_orientation, expand=True