mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-12 06:38:10 +00:00
feat(ocr): auto-detect rotated pages in Tesseract (#1167)
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid swallowing tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
@@ -1,3 +1,11 @@
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
|
||||
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
||||
|
||||
|
||||
def map_tesseract_script(script: str) -> str:
|
||||
r""" """
|
||||
if script == "Katakana" or script == "Hiragana":
|
||||
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
|
||||
elif script == "Korean":
|
||||
script = "Hangul"
|
||||
return script
|
||||
|
||||
|
||||
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation string to a bounding-rectangle angle.

    Tesseract reports the orientation as one of 0, 90, 180 or 270 degrees
    clockwise, whereas bounding rectangle angles are expressed
    counterclockwise in the half-open range [0, 360).

    Args:
        orientation: The orientation as text, parseable by ``int``.

    Returns:
        The negated orientation, normalised modulo 360.

    Raises:
        ValueError: If the text is not an integer, or the parsed value is not
            one of ``CLIPPED_ORIENTATIONS``.
    """
    clockwise_degrees = int(orientation)
    if clockwise_degrees in CLIPPED_ORIENTATIONS:
        # Negation switches the rotation direction; the modulo folds the
        # result back into [0, 360).
        return (-clockwise_degrees) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
|
||||
|
||||
|
||||
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a Tesseract box into a scaled, optionally shifted rectangle.

    The box is rotated by ``-orientation`` with ``rotate_bounding_box``, every
    corner coordinate is divided by ``scale``, and, when ``original_offset``
    is supplied, its left/top values are added to the x/y coordinates.

    Args:
        bbox: Box reported by Tesseract (top-left coordinates, per the
            original author's note).
        original_offset: Optional box whose ``l`` and ``t`` are added to the
            resulting x and y coordinates. Must use a TOPLEFT origin.
        scale: Divisor applied to every corner coordinate.
        orientation: Orientation angle; its negation is passed on as the
            rotation angle.
        im_size: Image size forwarded unchanged to ``rotate_bounding_box``.

    Returns:
        A ``BoundingRectangle`` with a TOPLEFT coordinate origin.

    Raises:
        ValueError: If ``original_offset`` is given with a coordinate origin
            other than TOPLEFT (or if the rotation itself rejects the angle).
    """
    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)

    # Corner fields in the order r_x0, r_y0, r_x1, r_y1, ...
    corner_fields = [f"r_{axis}{idx}" for idx in range(4) for axis in "xy"]
    scaled_corners = {
        field: getattr(rotated, field) / scale for field in corner_fields
    }
    rect = BoundingRectangle(**scaled_corners, coord_origin=CoordOrigin.TOPLEFT)

    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    # Shift all x coordinates first, then all y coordinates.
    for axis, shift in (("x", original_offset.l), ("y", original_offset.t)):
        for idx in range(4):
            field = f"r_{axis}{idx}"
            setattr(rect, field, getattr(rect, field) + shift)
    return rect
|
||||
|
||||
71
docling/utils/orientation.py
Normal file
71
docling/utils/orientation.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from typing import Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
|
||||
# Rotation angles, in degrees, that rotate_bounding_box supports (after the
# angle has been reduced modulo 360).
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||
|
||||
|
||||
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Rotate an axis-aligned box by a multiple of 90 degrees.

    The box is first converted to TOPLEFT coordinates. At angle 0 the
    rectangle's corner r_0 is the bottom-left corner of the box and the
    remaining corners r_1, r_2, r_3 follow counterclockwise (bottom-right,
    top-right, top-left). For the other angles every corner is mapped with the
    same transform, where ``(x, y)`` is a corner at angle 0:

    * 90:  ``(x, y) -> (im_h - y, x)``
    * 180: ``(x, y) -> (im_w - x, im_h - y)``
    * 270: ``(x, y) -> (y, im_w - x)``

    Args:
        bbox: The box to rotate.
        angle: Rotation angle in degrees; reduced modulo 360 before use.
        im_size: Image size. ``im_size[1]`` is used as the page height when
            converting to TOPLEFT coordinates, so the tuple is treated as
            ``(width, height)``.

    Returns:
        A ``BoundingRectangle`` with a TOPLEFT coordinate origin.

    Raises:
        ValueError: If ``angle % 360`` is not one of ``CLIPPED_ORIENTATIONS``.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # Named to match their use: index 1 is the height passed above, index 0 is
    # the extent subtracted from x coordinates in the 180 degree case.
    im_w, im_h = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = im_h - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # The fourth corner is the image of the box's top-left corner; it must
        # not coincide with r_1, otherwise the rectangle is degenerate.
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = im_w - left
        r_y0 = im_h - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = im_w - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
|
||||
Reference in New Issue
Block a user