mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
fix(ocr): refactor rotation utilities
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
parent
0b39bb58bf
commit
fdc6a01bc8
@ -6,10 +6,11 @@ import tempfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import List, Optional, Tuple, Type, cast
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
@ -25,7 +26,6 @@ from docling.utils.ocr_utils import (
|
||||
parse_tesseract_orientation,
|
||||
tesseract_box_to_bounding_rectangle,
|
||||
)
|
||||
from docling.utils.orientation import Box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -235,7 +235,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
doc_orientation = _parse_orientation(df_osd)
|
||||
if doc_orientation != 0:
|
||||
high_res_image = high_res_image.rotate(
|
||||
doc_orientation, expand=True
|
||||
-doc_orientation, expand=True
|
||||
)
|
||||
high_res_image.save(fname)
|
||||
df_result = self._run_tesseract(fname, df_osd)
|
||||
@ -250,21 +250,18 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
rotated_bbox = (
|
||||
row["left"],
|
||||
row["top"],
|
||||
row["width"],
|
||||
row["height"],
|
||||
)
|
||||
rotated_bbox = cast(
|
||||
Box, tuple(float(c) for c in rotated_bbox)
|
||||
l, t = float(row["left"]), float(row["top"])
|
||||
r = l + float(row["width"])
|
||||
b = t + row["height"]
|
||||
bbox = BoundingBox(
|
||||
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
|
||||
)
|
||||
rect = tesseract_box_to_bounding_rectangle(
|
||||
rotated_bbox,
|
||||
offset=ocr_rect,
|
||||
bbox,
|
||||
original_offset=ocr_rect,
|
||||
scale=self.scale,
|
||||
orientation=doc_orientation,
|
||||
rotated_image_size=high_res_image.size,
|
||||
im_size=high_res_image.size,
|
||||
)
|
||||
cell = TextCell(
|
||||
index=ix,
|
||||
|
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, Optional, Type
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
@ -151,7 +152,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
||||
if doc_orientation != 0:
|
||||
high_res_image = high_res_image.rotate(
|
||||
doc_orientation, expand=True
|
||||
-doc_orientation, expand=True
|
||||
)
|
||||
if "auto" in self.options.lang:
|
||||
script = osd["script_name"]
|
||||
@ -193,13 +194,18 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# Extract text within the bounding box
|
||||
text = local_reader.GetUTF8Text().strip()
|
||||
confidence = local_reader.MeanTextConf()
|
||||
rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
|
||||
l, t = box["x"], box["y"]
|
||||
r = l + box["w"]
|
||||
b = t + box["h"]
|
||||
bbox = BoundingBox(
|
||||
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
|
||||
)
|
||||
rect = tesseract_box_to_bounding_rectangle(
|
||||
rotated_bbox,
|
||||
offset=ocr_rect,
|
||||
bbox,
|
||||
original_offset=ocr_rect,
|
||||
scale=self.scale,
|
||||
orientation=doc_orientation,
|
||||
rotated_image_size=high_res_image.size,
|
||||
im_size=high_res_image.size,
|
||||
)
|
||||
cells.append(
|
||||
TextCell(
|
||||
|
@ -1,14 +1,9 @@
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
|
||||
from docling.utils.orientation import (
|
||||
Box,
|
||||
Size,
|
||||
CLIPPED_ORIENTATIONS,
|
||||
rotate_ltwh_bounding_box,
|
||||
)
|
||||
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
||||
|
||||
|
||||
def map_tesseract_script(script: str) -> str:
|
||||
@ -38,33 +33,37 @@ def parse_tesseract_orientation(orientation: str) -> int:
|
||||
|
||||
|
||||
def tesseract_box_to_bounding_rectangle(
|
||||
box: Box,
|
||||
bbox: BoundingBox,
|
||||
*,
|
||||
offset: Optional[BoundingBox] = None,
|
||||
original_offset: Optional[BoundingBox] = None,
|
||||
scale: float,
|
||||
orientation: int,
|
||||
rotated_image_size: Size,
|
||||
im_size: Tuple[int, int],
|
||||
) -> BoundingRectangle:
|
||||
# box is in the top, left, height, width format + top left orientation
|
||||
r_0, r_1, r_2, r_3 = rotate_ltwh_bounding_box(box, orientation, rotated_image_size)
|
||||
# box is in the top, left, height, width format, top left coordinates
|
||||
rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
|
||||
rect = BoundingRectangle(
|
||||
r_x0=r_0[0] / scale,
|
||||
r_y0=r_0[1] / scale,
|
||||
r_x1=r_1[0] / scale,
|
||||
r_y1=r_1[1] / scale,
|
||||
r_x2=r_2[0] / scale,
|
||||
r_y2=r_2[1] / scale,
|
||||
r_x3=r_3[0] / scale,
|
||||
r_y3=r_3[1] / scale,
|
||||
r_x0=rect.r_x0 / scale,
|
||||
r_y0=rect.r_y0 / scale,
|
||||
r_x1=rect.r_x1 / scale,
|
||||
r_y1=rect.r_y1 / scale,
|
||||
r_x2=rect.r_x2 / scale,
|
||||
r_y2=rect.r_y2 / scale,
|
||||
r_x3=rect.r_x3 / scale,
|
||||
r_y3=rect.r_y3 / scale,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
if offset is not None:
|
||||
rect.r_x0 += offset.l
|
||||
rect.r_x1 += offset.l
|
||||
rect.r_x2 += offset.l
|
||||
rect.r_x3 += offset.l
|
||||
rect.r_y0 += offset.t
|
||||
rect.r_y1 += offset.t
|
||||
rect.r_y2 += offset.t
|
||||
rect.r_y3 += offset.t
|
||||
if original_offset is not None:
|
||||
if not original_offset.coord_origin is CoordOrigin.TOPLEFT:
|
||||
msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
|
||||
raise ValueError(msg)
|
||||
if original_offset is not None:
|
||||
rect.r_x0 += original_offset.l
|
||||
rect.r_x1 += original_offset.l
|
||||
rect.r_x2 += original_offset.l
|
||||
rect.r_x3 += original_offset.l
|
||||
rect.r_y0 += original_offset.t
|
||||
rect.r_y1 += original_offset.t
|
||||
rect.r_y2 += original_offset.t
|
||||
rect.r_y3 += original_offset.t
|
||||
return rect
|
||||
|
@ -2,12 +2,8 @@ from collections import Counter
|
||||
from operator import itemgetter
|
||||
from typing import Tuple
|
||||
|
||||
from docling_core.types.doc.page import TextCell
|
||||
|
||||
|
||||
Point = Tuple[float, float]
|
||||
Box = Tuple[float, float, float, float]
|
||||
Size = Tuple[int, int]
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||
|
||||
@ -23,32 +19,66 @@ def detect_orientation(cells: list[TextCell]) -> int:
|
||||
return max(orientation_counter.items(), key=itemgetter(1))[0]
|
||||
|
||||
|
||||
def rotate_ltwh_bounding_box(
|
||||
box: Box, orientation: int, rotated_im_size: Size
|
||||
) -> tuple[Point, Point, Point, Point]:
|
||||
def rotate_bounding_box(
|
||||
bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
|
||||
) -> BoundingRectangle:
|
||||
# The box is left top width height in TOPLEFT coordinates
|
||||
# Bounding rectangle start with r_0 at the bottom left whatever the
|
||||
# coordinate system. Then other corners are found rotating counterclockwise
|
||||
l, t, w, h = box
|
||||
rotated_im_w, rotated_im_h = rotated_im_size
|
||||
if orientation == 0:
|
||||
r0_x = l
|
||||
r0_y = t + h
|
||||
return (r0_x, r0_y), (r0_x + w, r0_y), (r0_x + w, r0_y - h), (r0_x, r0_y - h)
|
||||
if orientation == 90:
|
||||
r0_x = t + h
|
||||
r0_y = rotated_im_w - l
|
||||
return (r0_x, r0_y), (r0_x, r0_y - w), (r0_x - h, r0_y - w), (r0_x - h, r0_y)
|
||||
if orientation == 180:
|
||||
r0_x = rotated_im_w - l
|
||||
r0_y = rotated_im_h - (t + h)
|
||||
return (r0_x, r0_y), (r0_x - w, r0_y), (r0_x - w, r0_y + h), (r0_x, r0_y + h)
|
||||
if orientation == 270:
|
||||
r0_x = rotated_im_h - (t + h)
|
||||
r0_y = l
|
||||
return (r0_x, r0_y), (r0_x, r0_y + w), (r0_x + h, r0_y + w), (r0_x, r0_y + w)
|
||||
msg = (
|
||||
f"orientation {orientation}, expected values in:"
|
||||
f" {sorted(CLIPPED_ORIENTATIONS)}"
|
||||
bbox = bbox.to_top_left_origin(im_size[1])
|
||||
l, t, w, h = bbox.l, bbox.t, bbox.width, bbox.height
|
||||
im_h, im_w = im_size
|
||||
angle = angle % 360
|
||||
if angle == 0:
|
||||
r_x0 = l
|
||||
r_y0 = t + h
|
||||
r_x1 = r_x0 + w
|
||||
r_y1 = r_y0
|
||||
r_x2 = r_x0 + w
|
||||
r_y2 = r_y0 - h
|
||||
r_x3 = r_x0
|
||||
r_y3 = r_y0 - h
|
||||
elif angle == 90:
|
||||
r_x0 = im_w - (t + h)
|
||||
r_y0 = l
|
||||
r_x1 = r_x0
|
||||
r_y1 = r_y0 + w
|
||||
r_x2 = r_x0 + h
|
||||
r_y2 = r_y0 + w
|
||||
r_x3 = r_x0
|
||||
r_y3 = r_y0 + w
|
||||
elif angle == 180:
|
||||
r_x0 = im_h - l
|
||||
r_y0 = im_w - (t + h)
|
||||
r_x1 = r_x0 - w
|
||||
r_y1 = r_y0
|
||||
r_x2 = r_x0 - w
|
||||
r_y2 = r_y0 + h
|
||||
r_x3 = r_x0
|
||||
r_y3 = r_y0 + h
|
||||
elif angle == 270:
|
||||
r_x0 = t + h
|
||||
r_y0 = im_h - l
|
||||
r_x1 = r_x0
|
||||
r_y1 = r_y0 - w
|
||||
r_x2 = r_x0 - h
|
||||
r_y2 = r_y0 - w
|
||||
r_x3 = r_x0 - h
|
||||
r_y3 = r_y0
|
||||
else:
|
||||
msg = (
|
||||
f"invalid orientation {angle}, expected values in:"
|
||||
f" {sorted(CLIPPED_ORIENTATIONS)}"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
return BoundingRectangle(
|
||||
r_x0=r_x0,
|
||||
r_y0=r_y0,
|
||||
r_x1=r_x1,
|
||||
r_y1=r_y1,
|
||||
r_x2=r_x2,
|
||||
r_y2=r_y2,
|
||||
r_x3=r_x3,
|
||||
r_y3=r_y3,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
@ -1,4 +1,5 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
|
||||
</document>
|
@ -1 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 104.58835856119788, 523.0, 123.25502522786462], "page": 1, "span": [0, 43], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 521.3333333333334, 96.58835856119788], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1,3 +1,5 @@
|
||||
package
|
||||
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
||||
JSON and Markdown in an easy self contained
|
||||
|
||||
Docling bundles PDF document conversion to
|
File diff suppressed because one or more lines are too long
@ -1,3 +1,4 @@
|
||||
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
|
||||
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||
<text><loc_77><loc_427><loc_439><loc_438>JSON and Markdown in an easy self contained</text>
|
||||
<text><loc_77><loc_443><loc_438><loc_454>Docling bundles PDF document conversion to</text>
|
||||
</doctag>
|
@ -1 +1 @@
|
||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 104.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}], "orig": "JSON and Markdown in an easy self contained", "text": "JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 96.58835856119788, "r": 521.3333333333334, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
@ -1,3 +1,5 @@
|
||||
package
|
||||
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
||||
JSON and Markdown in an easy self contained
|
||||
|
||||
Docling bundles PDF document conversion to
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user