From 1ce40a7097f2089c581542cd51bcfc8a15dcda07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cle=CC=81ment=20Doumouro?= Date: Wed, 21 May 2025 11:16:08 +0200 Subject: [PATCH] chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --- docling/models/tesseract_ocr_cli_model.py | 9 ++++----- docling/models/tesseract_ocr_model.py | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 1ed1fb05..8bca5479 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -235,11 +235,6 @@ class TesseractOcrCliModel(BaseOcrModel): df_osd = self._perform_osd(fname) doc_orientation = _parse_orientation(df_osd) except subprocess.CalledProcessError as exc: - if self._is_auto: - # OSD is required in auto mode, skipping - continue - # Proceed to OCR in the hope OCR will succeed while - # OSD failed _log.error( "OSD failed (doc %s, page: %s, " "OCR rectangle: %s, processed image file %s):\n %s", @@ -249,6 +244,10 @@ class TesseractOcrCliModel(BaseOcrModel): image_file, exc.stderr, ) + # Skipping if OSD fail when in auto mode, otherwise proceed + # to OCR in the hope OCR will succeed while OSD failed + if self._is_auto: + continue if doc_orientation != 0: high_res_image = high_res_image.rotate( -doc_orientation, expand=True diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index d00ba05a..108485d7 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -147,11 +147,6 @@ class TesseractOcrModel(BaseOcrModel): osd = self.osd_reader.DetectOrientationScript() # No text, or Orientation and Script detection failure if osd is None: - if self._is_auto: - # OSD is required in auto mode, skipping - continue - # Proceed to OCR in the hope OCR will succeed while - # OSD failed _log.error( "OSD failed for doc (doc %s, 
page: %s, " "OCR rectangle: %s)", @@ -159,6 +154,10 @@ class TesseractOcrModel(BaseOcrModel): page_i, ocr_rect_i, ) + # Skip when OSD fails in auto mode; otherwise proceed to OCR + # in the hope that OCR succeeds even though OSD failed + if self._is_auto: + continue doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) if doc_orientation != 0: high_res_image = high_res_image.rotate(