feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid swallowing tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
2025-12-10 13:48:13 +00:00 · 2025-05-21 18:12:33 +02:00
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell

 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
        self._version: Optional[str] = None
        self._tesseract_languages: Optional[List[str]] = None
        self._script_prefix: Optional[str] = None
+        self._is_auto: bool = "auto" in self.options.lang

        if self.enabled:
            try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):

        return name, version

-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
        r"""
        Run tesseract CLI
        """
        cmd = [self.options.tesseract_cmd]
-
-        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+        if self._is_auto:
+            lang = self._parse_language(osd)
            if lang is not None:
                cmd.append("-l")
                cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)

        # _log.info(output)

        # Decode the byte string to a regular string
-        decoded_data = output.decode("utf-8")
+        decoded_data = output.stdout.decode("utf-8")
        # _log.info(decoded_data)

        # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):

        return df_filtered

-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
        r"""
        Run tesseract in PSM 0 mode to detect the language
        """
-        assert self._tesseract_languages is not None

        cmd = [self.options.tesseract_cmd]
        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
        _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
+
+    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+        assert self._tesseract_languages is not None
+        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
        cmd = [self.options.tesseract_cmd]
        cmd.append("--list-langs")
        _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+        decoded_data = output.stdout.decode("utf-8")
        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
        self._tesseract_languages = df_list[0].tolist()[1:]

@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
            yield from page_batch
            return

-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
                            ) as image_file:
                                fname = image_file.name
                                high_res_image.save(image_file)
-
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = 0
+                            try:
+                                df_osd = self._perform_osd(fname)
+                                doc_orientation = _parse_orientation(df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                # Skipping if OSD fail when in auto mode, otherwise proceed
+                                # to OCR in the hope OCR will succeed while OSD failed
+                                if self._is_auto:
+                                    continue
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                                high_res_image.save(fname)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                            text = row["text"]
                            conf = row["conf"]

-                            l = float(row["left"])  # noqa: E741
-                            b = float(row["top"])
-                            w = float(row["width"])
-                            h = float(row["height"])
-
-                            t = b + h
-                            r = l + w
-
+                            left, top = float(row["left"]), float(row["top"])
+                            right = left + float(row["width"])
+                            bottom = top + row["height"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                            cell = TextCell(
                                index=ix,
                                text=str(text),
                                orig=str(text),
                                from_ocr=True,
                                confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
-                                    BoundingBox.from_tuple(
-                                        coord=(
-                                            (l / self.scale) + ocr_rect.l,
-                                            (b / self.scale) + ocr_rect.t,
-                                            (r / self.scale) + ocr_rect.l,
-                                            (t / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
-                                    )
-                                ),
+                                rect=rect,
                            )
                            all_ocr_cells.append(cell)

@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        return TesseractCliOcrOptions
+
+
+def _parse_orientation(df_osd: pd.DataFrame) -> int:
+    """Parse the first "Orientation in degrees" row of an OSD key/value table."""
+    degrees = df_osd.loc[df_osd["key"] == "Orientation in degrees", "value"]
+    return parse_tesseract_orientation(degrees.tolist()[0].strip())
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,12 +1,11 @@
 from __future__ import annotations

 import logging
-from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Type
+from typing import Iterable, Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell

 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
            accelerator_options=accelerator_options,
        )
        self.options: TesseractOcrOptions
-
+        self._is_auto: bool = "auto" in self.options.lang
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):

            if lang == "auto":
                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
-                self.osd_reader = tesserocr.PyTessBaseAPI(
-                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
-                )
            else:
                self.reader = tesserocr.PyTessBaseAPI(
                    **{"lang": lang} | tesserocr_kwargs,
                )
+            self.osd_reader = tesserocr.PyTessBaseAPI(
+                **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
+            )
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
            yield from page_batch
            return

-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    assert self.reader is not None
+                    assert self.osd_reader is not None
                    assert self._tesserocr_languages is not None

                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
                        )

                        local_reader = self.reader
-                        if "auto" in self.options.lang:
-                            assert self.osd_reader is not None
-
-                            self.osd_reader.SetImage(high_res_image)
-                            osd = self.osd_reader.DetectOrientationScript()
-
-                            # No text, probably
-                            if osd is None:
+                        self.osd_reader.SetImage(high_res_image)
+                        osd = self.osd_reader.DetectOrientationScript()
+                        # No text, or Orientation and Script detection failure
+                        if osd is None:
+                            _log.error(
+                                "OSD failed for doc (doc %s, page: %s, "
+                                "OCR rectangle: %s)",
+                                conv_res.input.file,
+                                page_i,
+                                ocr_rect_i,
+                            )
+                            # Skipping if OSD fail when in auto mode, otherwise proceed
+                            # to OCR in the hope OCR will succeed while OSD failed
+                            if self._is_auto:
                                continue
-
+                        doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
+                        if doc_orientation != 0:
+                            high_res_image = high_res_image.rotate(
+                                -doc_orientation, expand=True
+                            )
+                        if self._is_auto:
                            script = osd["script_name"]
                            script = map_tesseract_script(script)
                            lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
                            # Extract text within the bounding box
                            text = local_reader.GetUTF8Text().strip()
                            confidence = local_reader.MeanTextConf()
-                            left = box["x"] / self.scale
-                            bottom = box["y"] / self.scale
-                            right = (box["x"] + box["w"]) / self.scale
-                            top = (box["y"] + box["h"]) / self.scale
-
+                            left, top = box["x"], box["y"]
+                            right = left + box["w"]
+                            bottom = top + box["h"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                            cells.append(
                                TextCell(
                                    index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
                                    orig=text,
                                    from_ocr=True,
                                    confidence=confidence,
-                                    rect=BoundingRectangle.from_bounding_box(
-                                        BoundingBox.from_tuple(
-                                            coord=(left, top, right, bottom),
-                                            origin=CoordOrigin.TOPLEFT,
-                                        ),
-                                    ),
+                                    rect=rect,
                                )
                            )

--- a/docling/utils/ocr_utils.py
+++ b/docling/utils/ocr_utils.py
@@ -1,3 +1,11 @@
+from typing import Optional, Tuple
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle
+
+from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
+
+
 def map_tesseract_script(script: str) -> str:
    r""" """
    if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
    elif script == "Korean":
        script = "Hangul"
    return script
+
+
+def parse_tesseract_orientation(orientation: str) -> int:
+    """Convert a clockwise Tesseract orientation into a counterclockwise angle.
+
+    Raises ValueError when the value is not one of CLIPPED_ORIENTATIONS.
+    """
+    clockwise = int(orientation)
+    if clockwise not in CLIPPED_ORIENTATIONS:
+        raise ValueError(
+            f"invalid tesseract document orientation {orientation}, "
+            f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
+        )
+    # Tesseract reports [0, 90, 180, 270] clockwise; the result lies in [0, 360[
+    return (-clockwise) % 360
+
+
+def tesseract_box_to_bounding_rectangle(
+    bbox: BoundingBox,
+    *,
+    original_offset: Optional[BoundingBox] = None,
+    scale: float,
+    orientation: int,
+    im_size: Tuple[int, int],
+) -> BoundingRectangle:
+    """Map a Tesseract box to a scaled, offset ``BoundingRectangle``.
+
+    The box is rotated by ``-orientation`` within an image of ``im_size``, its
+    corners are divided by ``scale`` and then shifted by the left/top of
+    ``original_offset`` when one is given.
+
+    Raises:
+        ValueError: if ``original_offset`` does not use a TOPLEFT origin.
+    """
+    rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
+    dx = dy = 0.0
+    if original_offset is not None:
+        if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
+            msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
+            raise ValueError(msg)
+        dx, dy = original_offset.l, original_offset.t
+    return BoundingRectangle(
+        r_x0=rect.r_x0 / scale + dx,
+        r_y0=rect.r_y0 / scale + dy,
+        r_x1=rect.r_x1 / scale + dx,
+        r_y1=rect.r_y1 / scale + dy,
+        r_x2=rect.r_x2 / scale + dx,
+        r_y2=rect.r_y2 / scale + dy,
+        r_x3=rect.r_x3 / scale + dx,
+        r_y3=rect.r_y3 / scale + dy,
+        coord_origin=CoordOrigin.TOPLEFT,
+    )
--- a/docling/utils/orientation.py
+++ b/docling/utils/orientation.py
@@ -0,0 +1,71 @@
+from typing import Tuple
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle
+
+CLIPPED_ORIENTATIONS = [0, 90, 180, 270]  # rotations (degrees) handled below
+
+
+def rotate_bounding_box(
+    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
+) -> BoundingRectangle:
+    """Rotate ``bbox`` by a multiple of 90 degrees inside an image.
+
+    Corner ``r_0`` is the bottom-left corner of the box before rotation; the
+    other corners follow counterclockwise. All coordinates are TOPLEFT.
+
+    Args:
+        bbox: Box to rotate; converted to a TOPLEFT origin first.
+        angle: Rotation in degrees, taken modulo 360.
+        im_size: ``(width, height)`` of the image containing ``bbox``.
+
+    Raises:
+        ValueError: if ``angle`` is not a multiple of 90 degrees.
+    """
+    bbox = bbox.to_top_left_origin(im_size[1])
+    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
+    # im_size is (width, height): index 1 is the height, matching its use as
+    # the page height in the to_top_left_origin() call above.
+    im_w, im_h = im_size
+    angle = angle % 360
+    if angle == 0:
+        # (x, y) -> (x, y)
+        r_x0, r_y0 = left, top + height
+        r_x1, r_y1 = r_x0 + width, r_y0
+        r_x2, r_y2 = r_x0 + width, r_y0 - height
+        r_x3, r_y3 = r_x0, r_y0 - height
+    elif angle == 90:
+        # (x, y) -> (im_h - y, x); r_3 must differ from r_1 to close the box
+        r_x0, r_y0 = im_h - (top + height), left
+        r_x1, r_y1 = r_x0, r_y0 + width
+        r_x2, r_y2 = r_x0 + height, r_y0 + width
+        r_x3, r_y3 = r_x0 + height, r_y0
+    elif angle == 180:
+        # (x, y) -> (im_w - x, im_h - y)
+        r_x0, r_y0 = im_w - left, im_h - (top + height)
+        r_x1, r_y1 = r_x0 - width, r_y0
+        r_x2, r_y2 = r_x0 - width, r_y0 + height
+        r_x3, r_y3 = r_x0, r_y0 + height
+    elif angle == 270:
+        # (x, y) -> (y, im_w - x)
+        r_x0, r_y0 = top + height, im_w - left
+        r_x1, r_y1 = r_x0, r_y0 - width
+        r_x2, r_y2 = r_x0 - height, r_y0 - width
+        r_x3, r_y3 = r_x0 - height, r_y0
+    else:
+        msg = (
+            f"invalid orientation {angle}, expected values in:"
+            f" {sorted(CLIPPED_ORIENTATIONS)}"
+        )
+        raise ValueError(msg)
+    return BoundingRectangle(
+        r_x0=r_x0,
+        r_y0=r_y0,
+        r_x1=r_x1,
+        r_y1=r_y1,
+        r_x2=r_x2,
+        r_y2=r_y2,
+        r_x3=r_x3,
+        r_y3=r_y3,
+        coord_origin=CoordOrigin.TOPLEFT,
+    )