fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-04-08 17:28:06 +02:00
parent 0b39bb58bf
commit fdc6a01bc8
12 changed files with 126 additions and 88 deletions

View File

@ -6,10 +6,11 @@ import tempfile
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import List, Optional, Tuple, Type, cast from typing import List, Optional, Tuple, Type
import pandas as pd import pandas as pd
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
@ -25,7 +26,6 @@ from docling.utils.ocr_utils import (
parse_tesseract_orientation, parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle, tesseract_box_to_bounding_rectangle,
) )
from docling.utils.orientation import Box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -235,7 +235,7 @@ class TesseractOcrCliModel(BaseOcrModel):
doc_orientation = _parse_orientation(df_osd) doc_orientation = _parse_orientation(df_osd)
if doc_orientation != 0: if doc_orientation != 0:
high_res_image = high_res_image.rotate( high_res_image = high_res_image.rotate(
doc_orientation, expand=True -doc_orientation, expand=True
) )
high_res_image.save(fname) high_res_image.save(fname)
df_result = self._run_tesseract(fname, df_osd) df_result = self._run_tesseract(fname, df_osd)
@ -250,21 +250,18 @@ class TesseractOcrCliModel(BaseOcrModel):
text = row["text"] text = row["text"]
conf = row["conf"] conf = row["conf"]
rotated_bbox = ( l, t = float(row["left"]), float(row["top"])
row["left"], r = l + float(row["width"])
row["top"], b = t + row["height"]
row["width"], bbox = BoundingBox(
row["height"], l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
)
rotated_bbox = cast(
Box, tuple(float(c) for c in rotated_bbox)
) )
rect = tesseract_box_to_bounding_rectangle( rect = tesseract_box_to_bounding_rectangle(
rotated_bbox, bbox,
offset=ocr_rect, original_offset=ocr_rect,
scale=self.scale, scale=self.scale,
orientation=doc_orientation, orientation=doc_orientation,
rotated_image_size=high_res_image.size, im_size=high_res_image.size,
) )
cell = TextCell( cell = TextCell(
index=ix, index=ix,

View File

@ -5,6 +5,7 @@ from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, Optional, Type from typing import Dict, Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
@ -151,7 +152,7 @@ class TesseractOcrModel(BaseOcrModel):
doc_orientation = parse_tesseract_orientation(osd["orient_deg"]) doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0: if doc_orientation != 0:
high_res_image = high_res_image.rotate( high_res_image = high_res_image.rotate(
doc_orientation, expand=True -doc_orientation, expand=True
) )
if "auto" in self.options.lang: if "auto" in self.options.lang:
script = osd["script_name"] script = osd["script_name"]
@ -193,13 +194,18 @@ class TesseractOcrModel(BaseOcrModel):
# Extract text within the bounding box # Extract text within the bounding box
text = local_reader.GetUTF8Text().strip() text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf() confidence = local_reader.MeanTextConf()
rotated_bbox = (box["x"], box["y"], box["w"], box["h"]) l, t = box["x"], box["y"]
r = l + box["w"]
b = t + box["h"]
bbox = BoundingBox(
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
)
rect = tesseract_box_to_bounding_rectangle( rect = tesseract_box_to_bounding_rectangle(
rotated_bbox, bbox,
offset=ocr_rect, original_offset=ocr_rect,
scale=self.scale, scale=self.scale,
orientation=doc_orientation, orientation=doc_orientation,
rotated_image_size=high_res_image.size, im_size=high_res_image.size,
) )
cells.append( cells.append(
TextCell( TextCell(

View File

@ -1,14 +1,9 @@
from typing import Optional from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import ( from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
Box,
Size,
CLIPPED_ORIENTATIONS,
rotate_ltwh_bounding_box,
)
def map_tesseract_script(script: str) -> str: def map_tesseract_script(script: str) -> str:
@ -38,33 +33,37 @@ def parse_tesseract_orientation(orientation: str) -> int:
def tesseract_box_to_bounding_rectangle( def tesseract_box_to_bounding_rectangle(
box: Box, bbox: BoundingBox,
*, *,
offset: Optional[BoundingBox] = None, original_offset: Optional[BoundingBox] = None,
scale: float, scale: float,
orientation: int, orientation: int,
rotated_image_size: Size, im_size: Tuple[int, int],
) -> BoundingRectangle: ) -> BoundingRectangle:
# box is in the top, left, height, width format + top left orientation # box is in the top, left, height, width format, top left coordinates
r_0, r_1, r_2, r_3 = rotate_ltwh_bounding_box(box, orientation, rotated_image_size) rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
rect = BoundingRectangle( rect = BoundingRectangle(
r_x0=r_0[0] / scale, r_x0=rect.r_x0 / scale,
r_y0=r_0[1] / scale, r_y0=rect.r_y0 / scale,
r_x1=r_1[0] / scale, r_x1=rect.r_x1 / scale,
r_y1=r_1[1] / scale, r_y1=rect.r_y1 / scale,
r_x2=r_2[0] / scale, r_x2=rect.r_x2 / scale,
r_y2=r_2[1] / scale, r_y2=rect.r_y2 / scale,
r_x3=r_3[0] / scale, r_x3=rect.r_x3 / scale,
r_y3=r_3[1] / scale, r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT, coord_origin=CoordOrigin.TOPLEFT,
) )
if offset is not None: if original_offset is not None:
rect.r_x0 += offset.l if not original_offset.coord_origin is CoordOrigin.TOPLEFT:
rect.r_x1 += offset.l msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
rect.r_x2 += offset.l raise ValueError(msg)
rect.r_x3 += offset.l if original_offset is not None:
rect.r_y0 += offset.t rect.r_x0 += original_offset.l
rect.r_y1 += offset.t rect.r_x1 += original_offset.l
rect.r_y2 += offset.t rect.r_x2 += original_offset.l
rect.r_y3 += offset.t rect.r_x3 += original_offset.l
rect.r_y0 += original_offset.t
rect.r_y1 += original_offset.t
rect.r_y2 += original_offset.t
rect.r_y3 += original_offset.t
return rect return rect

View File

@ -2,12 +2,8 @@ from collections import Counter
from operator import itemgetter from operator import itemgetter
from typing import Tuple from typing import Tuple
from docling_core.types.doc.page import TextCell from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
Point = Tuple[float, float]
Box = Tuple[float, float, float, float]
Size = Tuple[int, int]
CLIPPED_ORIENTATIONS = [0, 90, 180, 270] CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
@ -23,32 +19,66 @@ def detect_orientation(cells: list[TextCell]) -> int:
return max(orientation_counter.items(), key=itemgetter(1))[0] return max(orientation_counter.items(), key=itemgetter(1))[0]
def rotate_ltwh_bounding_box( def rotate_bounding_box(
box: Box, orientation: int, rotated_im_size: Size bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> tuple[Point, Point, Point, Point]: ) -> BoundingRectangle:
# The box is left top width height in TOPLEFT coordinates # The box is left top width height in TOPLEFT coordinates
# Bounding rectangle start with r_0 at the bottom left whatever the # Bounding rectangle start with r_0 at the bottom left whatever the
# coordinate system. Then other corners are found rotating counterclockwise # coordinate system. Then other corners are found rotating counterclockwise
l, t, w, h = box bbox = bbox.to_top_left_origin(im_size[1])
rotated_im_w, rotated_im_h = rotated_im_size l, t, w, h = bbox.l, bbox.t, bbox.width, bbox.height
if orientation == 0: im_h, im_w = im_size
r0_x = l angle = angle % 360
r0_y = t + h if angle == 0:
return (r0_x, r0_y), (r0_x + w, r0_y), (r0_x + w, r0_y - h), (r0_x, r0_y - h) r_x0 = l
if orientation == 90: r_y0 = t + h
r0_x = t + h r_x1 = r_x0 + w
r0_y = rotated_im_w - l r_y1 = r_y0
return (r0_x, r0_y), (r0_x, r0_y - w), (r0_x - h, r0_y - w), (r0_x - h, r0_y) r_x2 = r_x0 + w
if orientation == 180: r_y2 = r_y0 - h
r0_x = rotated_im_w - l r_x3 = r_x0
r0_y = rotated_im_h - (t + h) r_y3 = r_y0 - h
return (r0_x, r0_y), (r0_x - w, r0_y), (r0_x - w, r0_y + h), (r0_x, r0_y + h) elif angle == 90:
if orientation == 270: r_x0 = im_w - (t + h)
r0_x = rotated_im_h - (t + h) r_y0 = l
r0_y = l r_x1 = r_x0
return (r0_x, r0_y), (r0_x, r0_y + w), (r0_x + h, r0_y + w), (r0_x, r0_y + w) r_y1 = r_y0 + w
r_x2 = r_x0 + h
r_y2 = r_y0 + w
r_x3 = r_x0
r_y3 = r_y0 + w
elif angle == 180:
r_x0 = im_h - l
r_y0 = im_w - (t + h)
r_x1 = r_x0 - w
r_y1 = r_y0
r_x2 = r_x0 - w
r_y2 = r_y0 + h
r_x3 = r_x0
r_y3 = r_y0 + h
elif angle == 270:
r_x0 = t + h
r_y0 = im_h - l
r_x1 = r_x0
r_y1 = r_y0 - w
r_x2 = r_x0 - h
r_y2 = r_y0 - w
r_x3 = r_x0 - h
r_y3 = r_y0
else:
msg = ( msg = (
f"orientation {orientation}, expected values in:" f"invalid orientation {angle}, expected values in:"
f" {sorted(CLIPPED_ORIENTATIONS)}" f" {sorted(CLIPPED_ORIENTATIONS)}"
) )
raise ValueError(msg) raise ValueError(msg)
return BoundingRectangle(
r_x0=r_x0,
r_y0=r_y0,
r_x1=r_x1,
r_y1=r_y1,
r_x2=r_x2,
r_y2=r_y2,
r_x3=r_x3,
r_y3=r_y3,
coord_origin=CoordOrigin.TOPLEFT,
)

View File

@ -1,4 +1,5 @@
<document> <document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph> <paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph> <paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
</document> </document>

View File

@ -1 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} {"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 104.58835856119788, 523.0, 123.25502522786462], "page": 1, "span": [0, 43], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 521.3333333333334, 96.58835856119788], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1,3 +1,5 @@
package package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to

File diff suppressed because one or more lines are too long

View File

@ -1,3 +1,4 @@
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text> <doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text> <text><loc_77><loc_427><loc_439><loc_438>JSON and Markdown in an easy self contained</text>
<text><loc_77><loc_443><loc_438><loc_454>Docling bundles PDF document conversion to</text>
</doctag> </doctag>

View File

@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}} {"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 104.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}], "orig": "JSON and Markdown in an easy self contained", "text": "JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 96.58835856119788, "r": 521.3333333333334, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -1,3 +1,5 @@
package package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to

File diff suppressed because one or more lines are too long