fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-04-08 17:28:06 +02:00
parent 0b39bb58bf
commit fdc6a01bc8
12 changed files with 126 additions and 88 deletions

View File

@ -6,10 +6,11 @@ import tempfile
from collections.abc import Iterable
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
from typing import List, Optional, Tuple, Type, cast
from typing import List, Optional, Tuple, Type
import pandas as pd
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@ -25,7 +26,6 @@ from docling.utils.ocr_utils import (
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.orientation import Box
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -235,7 +235,7 @@ class TesseractOcrCliModel(BaseOcrModel):
doc_orientation = _parse_orientation(df_osd)
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
doc_orientation, expand=True
-doc_orientation, expand=True
)
high_res_image.save(fname)
df_result = self._run_tesseract(fname, df_osd)
@ -250,21 +250,18 @@ class TesseractOcrCliModel(BaseOcrModel):
text = row["text"]
conf = row["conf"]
rotated_bbox = (
row["left"],
row["top"],
row["width"],
row["height"],
)
rotated_bbox = cast(
Box, tuple(float(c) for c in rotated_bbox)
l, t = float(row["left"]), float(row["top"])
r = l + float(row["width"])
b = t + row["height"]
bbox = BoundingBox(
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
)
rect = tesseract_box_to_bounding_rectangle(
rotated_bbox,
offset=ocr_rect,
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
rotated_image_size=high_res_image.size,
im_size=high_res_image.size,
)
cell = TextCell(
index=ix,

View File

@ -5,6 +5,7 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Dict, Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
@ -151,7 +152,7 @@ class TesseractOcrModel(BaseOcrModel):
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
doc_orientation, expand=True
-doc_orientation, expand=True
)
if "auto" in self.options.lang:
script = osd["script_name"]
@ -193,13 +194,18 @@ class TesseractOcrModel(BaseOcrModel):
# Extract text within the bounding box
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
l, t = box["x"], box["y"]
r = l + box["w"]
b = t + box["h"]
bbox = BoundingBox(
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
)
rect = tesseract_box_to_bounding_rectangle(
rotated_bbox,
offset=ocr_rect,
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
rotated_image_size=high_res_image.size,
im_size=high_res_image.size,
)
cells.append(
TextCell(

View File

@ -1,14 +1,9 @@
from typing import Optional
from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import (
Box,
Size,
CLIPPED_ORIENTATIONS,
rotate_ltwh_bounding_box,
)
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
def map_tesseract_script(script: str) -> str:
@ -38,33 +33,37 @@ def parse_tesseract_orientation(orientation: str) -> int:
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Convert a Tesseract box into a scaled, offset bounding rectangle.

    The box is rotated by ``-orientation`` with ``rotate_bounding_box``, every
    corner coordinate is divided by ``scale``, and finally the top-left corner
    of ``original_offset`` (when given) is added to every corner.

    Args:
        bbox: Box reported by Tesseract, as a ``BoundingBox``.
        original_offset: Optional box whose ``l`` / ``t`` are added to the
            x / y coordinates of the result. Must use a TOPLEFT origin.
        scale: Divisor applied to every coordinate before the offset is added.
        orientation: Detected orientation in degrees; its negation is the
            angle handed to ``rotate_bounding_box``.
        im_size: Size of the image ``bbox`` was measured on, forwarded
            unchanged to ``rotate_bounding_box``.

    Returns:
        The resulting rectangle, in TOPLEFT coordinates.

    Raises:
        ValueError: If ``original_offset`` does not use a TOPLEFT origin.
            ``rotate_bounding_box`` raises the same type for an angle it does
            not support.
    """
    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    rect = BoundingRectangle(
        r_x0=rotated.r_x0 / scale,
        r_y0=rotated.r_y0 / scale,
        r_x1=rotated.r_x1 / scale,
        r_y1=rotated.r_y1 / scale,
        r_x2=rotated.r_x2 / scale,
        r_y2=rotated.r_y2 / scale,
        r_x3=rotated.r_x3 / scale,
        r_y3=rotated.r_y3 / scale,
        coord_origin=CoordOrigin.TOPLEFT,
    )
    if original_offset is None:
        return rect

    # The offset is added to TOPLEFT coordinates, so it has to be expressed in
    # the same frame; anything else would silently shift the rectangle wrongly.
    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    rect.r_x0 += original_offset.l
    rect.r_x1 += original_offset.l
    rect.r_x2 += original_offset.l
    rect.r_x3 += original_offset.l
    rect.r_y0 += original_offset.t
    rect.r_y1 += original_offset.t
    rect.r_y2 += original_offset.t
    rect.r_y3 += original_offset.t
    return rect

View File

@ -2,12 +2,8 @@ from collections import Counter
from operator import itemgetter
from typing import Tuple
from docling_core.types.doc.page import TextCell
Point = Tuple[float, float]
Box = Tuple[float, float, float, float]
Size = Tuple[int, int]
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
@ -23,32 +19,66 @@ def detect_orientation(cells: list[TextCell]) -> int:
return max(orientation_counter.items(), key=itemgetter(1))[0]
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Map an axis-aligned box to a rectangle in a right-angle-rotated frame.

    ``bbox`` is first converted to TOPLEFT coordinates, using ``im_size[1]``
    as the page height. With ``(im_w, im_h) = im_size`` and ``(x, y)`` a
    corner of the box, the corners are then mapped as follows:

    * ``0``:   ``(x, y) -> (x, y)``
    * ``90``:  ``(x, y) -> (im_h - y, x)``
    * ``180``: ``(x, y) -> (im_w - x, im_h - y)``
    * ``270``: ``(x, y) -> (y, im_w - x)``

    Args:
        bbox: Box to rotate.
        angle: Rotation in degrees. It is reduced modulo 360, so negative
            values are accepted; the result must be 0, 90, 180 or 270.
        im_size: ``(width, height)`` of the image ``bbox`` was measured on.

    Returns:
        A rectangle in TOPLEFT coordinates. ``r_0`` is the image of the box's
        bottom-left corner, followed by the images of its bottom-right,
        top-right and top-left corners.

    Raises:
        ValueError: If ``angle`` does not reduce to a supported orientation.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    im_w, im_h = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = im_h - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # The fourth corner is the image of the box's top-left corner; it must
        # not repeat corner 1, otherwise the rectangle collapses to a triangle.
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = im_w - left
        r_y0 = im_h - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = im_w - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )

View File

@ -1,4 +1,5 @@
<document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
</document>

View File

@ -1 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 104.58835856119788, 523.0, 123.25502522786462], "page": 1, "span": [0, 43], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 521.3333333333334, 96.58835856119788], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1,3 +1,5 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to

File diff suppressed because one or more lines are too long

View File

@ -1,3 +1,4 @@
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
<text><loc_77><loc_427><loc_439><loc_438>JSON and Markdown in an easy self contained</text>
<text><loc_77><loc_443><loc_438><loc_454>Docling bundles PDF document conversion to</text>
</doctag>

View File

@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 104.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}], "orig": "JSON and Markdown in an easy self contained", "text": "JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 96.58835856119788, "r": 521.3333333333334, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -1,3 +1,5 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to

File diff suppressed because one or more lines are too long