feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding bow rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid to swallow tesseract errors causing orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -1,3 +1,11 @@
from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
def map_tesseract_script(script: str) -> str:
r""" """
if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
elif script == "Korean":
script = "Hangul"
return script
def parse_tesseract_orientation(orientation: str) -> int:
# Tesseract orientation is [0, 90, 180, 270] clockwise, bounding rectangle angles
# are [0, 360[ counterclockwise
parsed = int(orientation)
if parsed not in CLIPPED_ORIENTATIONS:
msg = (
f"invalid tesseract document orientation {orientation}, "
f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
)
raise ValueError(msg)
parsed = -parsed
parsed %= 360
return parsed
def tesseract_box_to_bounding_rectangle(
bbox: BoundingBox,
*,
original_offset: Optional[BoundingBox] = None,
scale: float,
orientation: int,
im_size: Tuple[int, int],
) -> BoundingRectangle:
# box is in the top, left, height, width format, top left coordinates
rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
rect = BoundingRectangle(
r_x0=rect.r_x0 / scale,
r_y0=rect.r_y0 / scale,
r_x1=rect.r_x1 / scale,
r_y1=rect.r_y1 / scale,
r_x2=rect.r_x2 / scale,
r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT,
)
if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
raise ValueError(msg)
if original_offset is not None:
rect.r_x0 += original_offset.l
rect.r_x1 += original_offset.l
rect.r_x2 += original_offset.l
rect.r_x3 += original_offset.l
rect.r_y0 += original_offset.t
rect.r_y1 += original_offset.t
rect.r_y2 += original_offset.t
rect.r_y3 += original_offset.t
return rect

View File

@@ -0,0 +1,71 @@
from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def rotate_bounding_box(
bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
# The box is left top width height in TOPLEFT coordinates
# Bounding rectangle start with r_0 at the bottom left whatever the
# coordinate system. Then other corners are found rotating counterclockwise
bbox = bbox.to_top_left_origin(im_size[1])
left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
im_h, im_w = im_size
angle = angle % 360
if angle == 0:
r_x0 = left
r_y0 = top + height
r_x1 = r_x0 + width
r_y1 = r_y0
r_x2 = r_x0 + width
r_y2 = r_y0 - height
r_x3 = r_x0
r_y3 = r_y0 - height
elif angle == 90:
r_x0 = im_w - (top + height)
r_y0 = left
r_x1 = r_x0
r_y1 = r_y0 + width
r_x2 = r_x0 + height
r_y2 = r_y0 + width
r_x3 = r_x0
r_y3 = r_y0 + width
elif angle == 180:
r_x0 = im_h - left
r_y0 = im_w - (top + height)
r_x1 = r_x0 - width
r_y1 = r_y0
r_x2 = r_x0 - width
r_y2 = r_y0 + height
r_x3 = r_x0
r_y3 = r_y0 + height
elif angle == 270:
r_x0 = top + height
r_y0 = im_h - left
r_x1 = r_x0
r_y1 = r_y0 - width
r_x2 = r_x0 - height
r_y2 = r_y0 - width
r_x3 = r_x0 - height
r_y3 = r_y0
else:
msg = (
f"invalid orientation {angle}, expected values in:"
f" {sorted(CLIPPED_ORIENTATIONS)}"
)
raise ValueError(msg)
return BoundingRectangle(
r_x0=r_x0,
r_y0=r_y0,
r_x1=r_x1,
r_y1=r_y1,
r_x2=r_x2,
r_y2=r_y2,
r_x3=r_x3,
r_y3=r_y3,
coord_origin=CoordOrigin.TOPLEFT,
)