diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 28d197ad..e9948752 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -21,11 +21,11 @@ from docling.datamodel.pipeline_options import (
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.ocr_utils import (
-    Box,
     map_tesseract_script,
     parse_tesseract_orientation,
     tesseract_box_to_bounding_rectangle,
 )
+from docling.utils.orientation import Box
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
diff --git a/docling/utils/ocr_utils.py b/docling/utils/ocr_utils.py
index 48d1755d..0e144145 100644
--- a/docling/utils/ocr_utils.py
+++ b/docling/utils/ocr_utils.py
@@ -1,13 +1,14 @@
-from typing import Optional, Tuple
+from typing import Optional
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle
 
-_TESSERACT_ORIENTATIONS = {0, 90, 180, 270}
-
-Point = Tuple[float, float]
-Box = Tuple[float, float, float, float]
-Size = Tuple[int, int]
+from docling.utils.orientation import (
+    Box,
+    Size,
+    CLIPPED_ORIENTATIONS,
+    rotate_ltwh_bounding_box,
+)
 
 
 def map_tesseract_script(script: str) -> str:
@@ -21,45 +22,18 @@ def map_tesseract_script(script: str) -> str:
     return script
 
 
-def reverse_tesseract_preprocessing_rotation(
-    box: Box, orientation: int, rotated_im_size: Size
-) -> tuple[Point, Point, Point, Point]:
-    # The box is left top width height in TOPLEFT coordinates
-    # Bounding rectangle start with r_0 at the bottom left whatever the
-    # coordinate system. Then other corners are found rotating counterclockwise
-    l, t, w, h = box
-    rotated_im_w, rotated_im_h = rotated_im_size
-    if orientation == 0:
-        r0_x = l
-        r0_y = t + h
-        return (r0_x, r0_y), (r0_x + w, r0_y), (r0_x + w, r0_y - h), (r0_x, r0_y - h)
-    if orientation == 90:
-        r0_x = rotated_im_h - (t + h)
-        r0_y = l
-        return (r0_x, r0_y), (r0_x, r0_y + w), (r0_x + h, r0_y + w), (r0_x, r0_y + w)
-    if orientation == 180:
-        r0_x = rotated_im_w - l
-        r0_y = rotated_im_h - (t + h)
-        return (r0_x, r0_y), (r0_x - w, r0_y), (r0_x - w, r0_y + h), (r0_x, r0_y + h)
-    if orientation == 270:
-        r0_x = t + h
-        r0_y = rotated_im_w - l
-        return (r0_x, r0_y), (r0_x, r0_y - w), (r0_x - h, r0_y - w), (r0_x - h, r0_y)
-    msg = (
-        f"invalid tesseract document orientation {orientation}, "
-        f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
-    )
-    raise ValueError(msg)
-
-
 def parse_tesseract_orientation(orientation: str) -> int:
+    # Tesseract orientation is [0, 90, 180, 270] clockwise, bounding rectangle angles
+    # are [0, 360[ counterclockwise
     parsed = int(orientation)
-    if parsed not in _TESSERACT_ORIENTATIONS:
+    if parsed not in CLIPPED_ORIENTATIONS:
         msg = (
             f"invalid tesseract document orientation {orientation}, "
-            f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
+            f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
         )
         raise ValueError(msg)
+    parsed = -parsed
+    parsed %= 360
     return parsed
 
 
@@ -72,9 +46,7 @@ def tesseract_box_to_bounding_rectangle(
     rotated_image_size: Size,
 ) -> BoundingRectangle:
     # box is in the top, left, height, width format + top left orientation
-    r_0, r_1, r_2, r_3 = reverse_tesseract_preprocessing_rotation(
-        box, orientation, rotated_image_size
-    )
+    r_0, r_1, r_2, r_3 = rotate_ltwh_bounding_box(box, orientation, rotated_image_size)
     rect = BoundingRectangle(
         r_x0=r_0[0] / scale,
         r_y0=r_0[1] / scale,
diff --git a/docling/utils/orientation.py b/docling/utils/orientation.py
index da076378..39c5ca80 100644
--- a/docling/utils/orientation.py
+++ b/docling/utils/orientation.py
@@ -1,13 +1,19 @@
 from collections import Counter
 from operator import itemgetter
+from typing import Tuple
 
 from docling_core.types.doc.page import TextCell
 
-_ORIENTATIONS = [0, 90, 180, 270]
+
+Point = Tuple[float, float]
+Box = Tuple[float, float, float, float]
+Size = Tuple[int, int]
+
+CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
 
 
 def _clipped_orientation(angle: float) -> int:
-    return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1]
+    return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
 
 
 def detect_orientation(cells: list[TextCell]) -> int:
@@ -15,3 +21,41 @@ def detect_orientation(cells: list[TextCell]) -> int:
         return 0
     orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
     return max(orientation_counter.items(), key=itemgetter(1))[0]
+
+
+def rotate_ltwh_bounding_box(
+    box: Box, orientation: int, rotated_im_size: Size
+) -> tuple[Point, Point, Point, Point]:
+    """Return the four corners (r_0, r_1, r_2, r_3) of an ltwh box.
+
+    The box is left top width height in TOPLEFT coordinates. The returned
+    bounding rectangle starts with r_0 at the bottom left whatever the
+    coordinate system, then the other corners are found rotating
+    counterclockwise.
+
+    Raises:
+        ValueError: if orientation is not one of CLIPPED_ORIENTATIONS.
+    """
+    l, t, w, h = box
+    rotated_im_w, rotated_im_h = rotated_im_size
+    if orientation == 0:
+        r0_x = l
+        r0_y = t + h
+        return (r0_x, r0_y), (r0_x + w, r0_y), (r0_x + w, r0_y - h), (r0_x, r0_y - h)
+    if orientation == 90:
+        r0_x = t + h
+        r0_y = rotated_im_w - l
+        return (r0_x, r0_y), (r0_x, r0_y - w), (r0_x - h, r0_y - w), (r0_x - h, r0_y)
+    if orientation == 180:
+        r0_x = rotated_im_w - l
+        r0_y = rotated_im_h - (t + h)
+        return (r0_x, r0_y), (r0_x - w, r0_y), (r0_x - w, r0_y + h), (r0_x, r0_y + h)
+    if orientation == 270:
+        r0_x = rotated_im_h - (t + h)
+        r0_y = l
+        return (r0_x, r0_y), (r0_x, r0_y + w), (r0_x + h, r0_y + w), (r0_x + h, r0_y)
+    msg = (
+        f"orientation {orientation}, expected values in:"
+        f" {sorted(CLIPPED_ORIENTATIONS)}"
+    )
+    raise ValueError(msg)