feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): support mis-oriented documents in tesseract

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding box rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid swallowing tesseract errors that cause orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -2,6 +2,7 @@ import csv
import io
import logging
import os
import subprocess
import tempfile
from collections.abc import Iterable
from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._version: Optional[str] = None
self._tesseract_languages: Optional[List[str]] = None
self._script_prefix: Optional[str] = None
self._is_auto: bool = "auto" in self.options.lang
if self.enabled:
try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
return name, version
def _run_tesseract(self, ifilename: str):
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd]
if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
if self._is_auto:
lang = self._parse_language(osd)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
# _log.info(output)
# Decode the byte string to a regular string
decoded_data = output.decode("utf-8")
decoded_data = output.stdout.decode("utf-8")
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered
def _detect_language(self, ifilename: str):
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
r"""
Run tesseract in PSM 0 mode to detect the language
"""
assert self._tesseract_languages is not None
cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
output = subprocess.run(cmd, capture_output=True, check=True)
decoded_data = output.stdout.decode("utf-8")
df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
return df_detected
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
assert self._tesseract_languages is not None
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd = [self.options.tesseract_cmd]
cmd.append("--list-langs")
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
decoded_data = output.stdout.decode("utf-8")
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df_list[0].tolist()[1:]
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
yield from page_batch
return
for page in page_batch:
for page_i, page in enumerate(page_batch):
assert page._backend is not None
if not page._backend.is_valid():
yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
) as image_file:
fname = image_file.name
high_res_image.save(image_file)
df_result = self._run_tesseract(fname)
doc_orientation = 0
try:
df_osd = self._perform_osd(fname)
doc_orientation = _parse_orientation(df_osd)
except subprocess.CalledProcessError as exc:
_log.error(
"OSD failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file,
exc.stderr,
)
# Skipping if OSD fail when in auto mode, otherwise proceed
# to OCR in the hope OCR will succeed while OSD failed
if self._is_auto:
continue
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
high_res_image.save(fname)
try:
df_result = self._run_tesseract(fname, df_osd)
except subprocess.CalledProcessError as exc:
_log.error(
"tesseract OCR failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file,
exc.stderr,
)
continue
finally:
if os.path.exists(fname):
os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
text = row["text"]
conf = row["conf"]
l = float(row["left"]) # noqa: E741
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
left, top = float(row["left"]), float(row["top"])
right = left + float(row["width"])
bottom = top + row["height"]
bbox = BoundingBox(
l=left,
t=top,
r=right,
b=bottom,
coord_origin=CoordOrigin.TOPLEFT,
)
rect = tesseract_box_to_bounding_rectangle(
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
im_size=high_res_image.size,
)
cell = TextCell(
index=ix,
text=str(text),
orig=str(text),
from_ocr=True,
confidence=conf / 100.0,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
rect=rect,
)
all_ocr_cells.append(cell)
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return TesseractCliOcrOptions
def _parse_orientation(df_osd: pd.DataFrame) -> int:
    """Extract the page orientation from a parsed Tesseract OSD report.

    Args:
        df_osd: OSD output as a frame with ``key`` and ``value`` columns.

    Returns:
        The value of the first ``Orientation in degrees`` row, stripped of
        surrounding whitespace and converted by ``parse_tesseract_orientation``.

    Raises:
        IndexError: If the report has no ``Orientation in degrees`` row.
    """
    is_orientation_row = df_osd["key"] == "Orientation in degrees"
    reported = df_osd.loc[is_orientation_row].value.tolist()
    # Only the first matching row is considered.
    return parse_tesseract_orientation(reported[0].strip())

View File

@@ -1,12 +1,11 @@
from __future__ import annotations
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Optional, Type
from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
accelerator_options=accelerator_options,
)
self.options: TesseractOcrOptions
self._is_auto: bool = "auto" in self.options.lang
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
else:
self.reader = tesserocr.PyTessBaseAPI(
**{"lang": lang} | tesserocr_kwargs,
)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
self.reader_RIL = tesserocr.RIL
def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
yield from page_batch
return
for page in page_batch:
for page_i, page in enumerate(page_batch):
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
assert self.reader is not None
assert self.osd_reader is not None
assert self._tesserocr_languages is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
)
local_reader = self.reader
if "auto" in self.options.lang:
assert self.osd_reader is not None
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No text, probably
if osd is None:
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No text, or Orientation and Script detection failure
if osd is None:
_log.error(
"OSD failed for doc (doc %s, page: %s, "
"OCR rectangle: %s)",
conv_res.input.file,
page_i,
ocr_rect_i,
)
# Skipping if OSD fail when in auto mode, otherwise proceed
# to OCR in the hope OCR will succeed while OSD failed
if self._is_auto:
continue
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
if self._is_auto:
script = osd["script_name"]
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
# Extract text within the bounding box
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
left, top = box["x"], box["y"]
right = left + box["w"]
bottom = top + box["h"]
bbox = BoundingBox(
l=left,
t=top,
r=right,
b=bottom,
coord_origin=CoordOrigin.TOPLEFT,
)
rect = tesseract_box_to_bounding_rectangle(
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
im_size=high_res_image.size,
)
cells.append(
TextCell(
index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
orig=text,
from_ocr=True,
confidence=confidence,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
),
rect=rect,
)
)

View File

@@ -1,3 +1,11 @@
from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
def map_tesseract_script(script: str) -> str:
r""" """
if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
elif script == "Korean":
script = "Hangul"
return script
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation into a counterclockwise angle.

    Tesseract reports orientation as one of 0, 90, 180 or 270 degrees
    clockwise, whereas bounding rectangle angles are expressed in [0, 360)
    counterclockwise. The value is therefore negated and wrapped modulo 360
    (0 -> 0, 90 -> 270, 180 -> 180, 270 -> 90).

    Args:
        orientation: Orientation in degrees as reported by Tesseract; anything
            accepted by ``int()``.

    Returns:
        The equivalent counterclockwise angle in [0, 360).

    Raises:
        ValueError: If the value cannot be parsed by ``int()`` or is not one
            of ``CLIPPED_ORIENTATIONS``.
    """
    clockwise_degrees = int(orientation)
    if clockwise_degrees in CLIPPED_ORIENTATIONS:
        return (-clockwise_degrees) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a box detected by Tesseract into a ``BoundingRectangle``.

    The box is rotated by ``-orientation`` with ``rotate_bounding_box``, every
    corner coordinate is divided by ``scale``, and, when ``original_offset``
    is given, its left/top coordinates are added to the x/y of every corner.

    Args:
        bbox: Box reported by Tesseract on the processed image.
        original_offset: Optional box whose ``l`` and ``t`` are added to the
            result; it must use a TOPLEFT coordinate origin.
        scale: Factor the corner coordinates are divided by.
        orientation: Angle in degrees; its negation is passed to
            ``rotate_bounding_box``.
        im_size: Size of the image ``bbox`` was detected on, forwarded to
            ``rotate_bounding_box``.

    Returns:
        The rotated, rescaled and shifted rectangle, with TOPLEFT origin.

    Raises:
        ValueError: If ``original_offset`` does not use a TOPLEFT origin, or
            if the angle is rejected by ``rotate_bounding_box``.
    """
    x_fields = ("r_x0", "r_x1", "r_x2", "r_x3")
    y_fields = ("r_y0", "r_y1", "r_y2", "r_y3")

    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    scaled_corners = {
        field: getattr(rotated, field) / scale for field in x_fields + y_fields
    }
    rect = BoundingRectangle(**scaled_corners, coord_origin=CoordOrigin.TOPLEFT)

    if original_offset is None:
        return rect

    # The offset is only checked once the rescaled rectangle has been built.
    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    for field in x_fields:
        setattr(rect, field, getattr(rect, field) + original_offset.l)
    for field in y_fields:
        setattr(rect, field, getattr(rect, field) + original_offset.t)
    return rect

View File

@@ -0,0 +1,71 @@
from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
# Angles, in degrees, that the rotation helpers accept: whole quarter turns only.
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Rotate an axis-aligned box by a quarter-turn multiple.

    The result is a four-corner rectangle in TOPLEFT coordinates. Corner
    ``r_0`` is the image of the box's bottom-left corner, and ``r_1``,
    ``r_2``, ``r_3`` are the images of its bottom-right, top-right and
    top-left corners, in that order.

    Args:
        bbox: Box to rotate; converted to a TOPLEFT origin using
            ``im_size[1]`` as the page height.
        angle: Rotation in degrees; taken modulo 360 and must then be one of
            0, 90, 180 or 270.
        im_size: ``(width, height)`` of the image ``bbox`` is expressed in
            (callers pass a PIL ``Image.size`` -- NOTE(review): confirm for
            any other caller).

    Returns:
        The rotated rectangle with ``coord_origin=CoordOrigin.TOPLEFT``.

    Raises:
        ValueError: If ``angle % 360`` is not a supported orientation.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # im_size is (width, height): index 1 is used as the page height above.
    im_width, im_height = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = im_height - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # Image of the top-left corner. It used to duplicate corner 1, which
        # collapsed the rectangle; it mirrors the 270 degree branch instead.
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = im_width - left
        r_y0 = im_height - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = im_width - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )