chore(ocr): proceed to OCR without rotation when OSD fails in TesseractOcrModel

This commit is contained in:
Clément Doumouro 2025-05-20 19:20:42 +02:00
parent 4c8f1caba3
commit 08832bfd60
2 changed files with 39 additions and 14 deletions

View File

@ -54,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._version: Optional[str] = None self._version: Optional[str] = None
self._tesseract_languages: Optional[List[str]] = None self._tesseract_languages: Optional[List[str]] = None
self._script_prefix: Optional[str] = None self._script_prefix: Optional[str] = None
self._is_auto: bool
if self.enabled: if self.enabled:
try: try:
@ -103,7 +104,7 @@ class TesseractOcrCliModel(BaseOcrModel):
Run tesseract CLI Run tesseract CLI
""" """
cmd = [self.options.tesseract_cmd] cmd = [self.options.tesseract_cmd]
if "auto" in self.options.lang: if self._is_auto:
lang = self._parse_language(osd) lang = self._parse_language(osd)
if lang is not None: if lang is not None:
cmd.append("-l") cmd.append("-l")
@ -191,6 +192,7 @@ class TesseractOcrCliModel(BaseOcrModel):
decoded_data = output.stdout.decode("utf-8") decoded_data = output.stdout.decode("utf-8")
df_list = pd.read_csv(io.StringIO(decoded_data), header=None) df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df_list[0].tolist()[1:] self._tesseract_languages = df_list[0].tolist()[1:]
self._is_auto = "auto" in self._tesseract_languages
# Decide the script prefix # Decide the script prefix
if any(lang.startswith("script/") for lang in self._tesseract_languages): if any(lang.startswith("script/") for lang in self._tesseract_languages):
@ -207,7 +209,7 @@ class TesseractOcrCliModel(BaseOcrModel):
yield from page_batch yield from page_batch
return return
for page in page_batch: for page_i, page in enumerate(page_batch):
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid(): if not page._backend.is_valid():
yield page yield page
@ -216,7 +218,7 @@ class TesseractOcrCliModel(BaseOcrModel):
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []
for ocr_rect in ocr_rects: for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes # Skip zero area boxes
if ocr_rect.area() == 0: if ocr_rect.area() == 0:
continue continue
@ -234,10 +236,17 @@ class TesseractOcrCliModel(BaseOcrModel):
df_osd = self._perform_osd(fname) df_osd = self._perform_osd(fname)
doc_orientation = _parse_orientation(df_osd) doc_orientation = _parse_orientation(df_osd)
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
# Here we just log the error and proceed to OCR in the if self._is_auto:
# hope OCR will succeed while OSD failed # OSD is required in auto mode, skipping
continue
# Proceed to OCR in the hope OCR will succeed while
# OSD failed
_log.error( _log.error(
"OSD failed for: %s with error:\n %s", "OSD failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file, image_file,
exc.stderr, exc.stderr,
) )
@ -250,7 +259,11 @@ class TesseractOcrCliModel(BaseOcrModel):
df_result = self._run_tesseract(fname, df_osd) df_result = self._run_tesseract(fname, df_osd)
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
_log.error( _log.error(
"tesseract OCR failed for: %s with error:\n %s", "tesseract OCR failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file, image_file,
exc.stderr, exc.stderr,
) )

View File

@ -1,9 +1,8 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, Optional, Type from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell from docling_core.types.doc.page import TextCell
@ -77,6 +76,8 @@ class TesseractOcrModel(BaseOcrModel):
if not self._tesserocr_languages: if not self._tesserocr_languages:
raise ImportError(missing_langs_errmsg) raise ImportError(missing_langs_errmsg)
self._is_auto: bool = "auto" in self._tesserocr_languages
# Initialize the tesseractAPI # Initialize the tesseractAPI
_log.debug("Initializing TesserOCR: %s", tesseract_version) _log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
@ -122,7 +123,7 @@ class TesseractOcrModel(BaseOcrModel):
yield from page_batch yield from page_batch
return return
for page in page_batch: for page_i, page in enumerate(page_batch):
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid(): if not page._backend.is_valid():
yield page yield page
@ -135,7 +136,7 @@ class TesseractOcrModel(BaseOcrModel):
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []
for ocr_rect in ocr_rects: for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes # Skip zero area boxes
if ocr_rect.area() == 0: if ocr_rect.area() == 0:
continue continue
@ -146,15 +147,26 @@ class TesseractOcrModel(BaseOcrModel):
local_reader = self.reader local_reader = self.reader
self.osd_reader.SetImage(high_res_image) self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript() osd = self.osd_reader.DetectOrientationScript()
# No text, probably # No text, or Orientation and Script detection failure
if osd is None: if osd is None:
if self._is_auto:
# OSD is required in auto mode, skipping
continue continue
# Proceed to OCR in the hope OCR will succeed while
# OSD failed
_log.error(
"OSD failed for doc (doc %s, page: %s, "
"OCR rectangle: %s)",
conv_res.input.file,
page_i,
ocr_rect_i,
)
doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0: if doc_orientation != 0:
high_res_image = high_res_image.rotate( high_res_image = high_res_image.rotate(
-doc_orientation, expand=True -doc_orientation, expand=True
) )
if "auto" in self.options.lang: if self._is_auto:
script = osd["script_name"] script = osd["script_name"]
script = map_tesseract_script(script) script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}" lang = f"{self.script_prefix}{script}"