diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index db01fcec..0c6f2bd1 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -54,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel): self._version: Optional[str] = None self._tesseract_languages: Optional[List[str]] = None self._script_prefix: Optional[str] = None + self._is_auto: bool if self.enabled: try: @@ -103,7 +104,7 @@ class TesseractOcrCliModel(BaseOcrModel): Run tesseract CLI """ cmd = [self.options.tesseract_cmd] - if "auto" in self.options.lang: + if self._is_auto: lang = self._parse_language(osd) if lang is not None: cmd.append("-l") @@ -191,6 +192,7 @@ class TesseractOcrCliModel(BaseOcrModel): decoded_data = output.stdout.decode("utf-8") df_list = pd.read_csv(io.StringIO(decoded_data), header=None) self._tesseract_languages = df_list[0].tolist()[1:] + self._is_auto = "auto" in self._tesseract_languages # Decide the script prefix if any(lang.startswith("script/") for lang in self._tesseract_languages): @@ -207,7 +209,7 @@ class TesseractOcrCliModel(BaseOcrModel): yield from page_batch return - for page in page_batch: + for page_i, page in enumerate(page_batch): assert page._backend is not None if not page._backend.is_valid(): yield page @@ -216,7 +218,7 @@ class TesseractOcrCliModel(BaseOcrModel): ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] - for ocr_rect in ocr_rects: + for ocr_rect_i, ocr_rect in enumerate(ocr_rects): # Skip zero area boxes if ocr_rect.area() == 0: continue @@ -234,10 +236,17 @@ class TesseractOcrCliModel(BaseOcrModel): df_osd = self._perform_osd(fname) doc_orientation = _parse_orientation(df_osd) except subprocess.CalledProcessError as exc: - # Here we just log the error and proceed to OCR in the - # hope OCR will succeed while OSD failed + if self._is_auto: + # OSD is required in auto mode, skipping + continue + # Proceed to OCR in the hope OCR will succeed while + # OSD failed _log.error( - "OSD failed for: %s with error:\n %s", + "OSD failed (doc %s, page: %s, " + "OCR rectangle: %s, processed image file %s):\n %s", + conv_res.input.file, + page_i, + ocr_rect_i, image_file, exc.stderr, ) @@ -250,7 +259,11 @@ class TesseractOcrCliModel(BaseOcrModel): df_result = self._run_tesseract(fname, df_osd) except subprocess.CalledProcessError as exc: _log.error( - "tesseract OCR failed for: %s with error:\n %s", + "tesseract OCR failed (doc %s, page: %s, " + "OCR rectangle: %s, processed image file %s):\n %s", + conv_res.input.file, + page_i, + ocr_rect_i, image_file, exc.stderr, ) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index ec29349c..f826dc2d 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,9 +1,8 @@ from __future__ import annotations import logging -from collections.abc import Iterable from pathlib import Path -from typing import Dict, Iterable, Optional, Type +from typing import Iterable, Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import TextCell @@ -77,6 +76,8 @@ class TesseractOcrModel(BaseOcrModel): if not self._tesserocr_languages: raise ImportError(missing_langs_errmsg) + self._is_auto: bool = "auto" in self._tesserocr_languages + # Initialize the tesseractAPI _log.debug("Initializing TesserOCR: %s", tesseract_version) lang = "+".join(self.options.lang) @@ -122,7 +123,7 @@ class TesseractOcrModel(BaseOcrModel): yield from page_batch return - for page in page_batch: + for page_i, page in enumerate(page_batch): assert page._backend is not None if not page._backend.is_valid(): yield page @@ -135,7 +136,7 @@ class TesseractOcrModel(BaseOcrModel): ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] - for ocr_rect in ocr_rects: + for ocr_rect_i, ocr_rect in enumerate(ocr_rects): # Skip zero area boxes if ocr_rect.area() == 0: continue @@ -146,15 +147,26 @@ class TesseractOcrModel(BaseOcrModel): local_reader = self.reader self.osd_reader.SetImage(high_res_image) osd = self.osd_reader.DetectOrientationScript() - # No text, probably + # No text, or Orientation and Script detection failure if osd is None: - continue + if self._is_auto: + # OSD is required in auto mode, skipping + continue + # Proceed to OCR in the hope OCR will succeed while + # OSD failed + _log.error( + "OSD failed for doc (doc %s, page: %s, " + "OCR rectangle: %s)", + conv_res.input.file, + page_i, + ocr_rect_i, + ) doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) if doc_orientation != 0: high_res_image = high_res_image.rotate( -doc_orientation, expand=True ) - if "auto" in self.options.lang: + if self._is_auto: script = osd["script_name"] script = map_tesseract_script(script) lang = f"{self.script_prefix}{script}"