fix(ocr): tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
2025-07-27 04:24:45 +00:00 · 2025-03-14 14:31:24 +01:00 · 2025-03-14 14:31:24 +01:00 · 7a3ef336fd
commit 7a3ef336fd
parent 98b5eeb844
35 changed files with 224 additions and 71 deletions
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -6,10 +6,9 @@ import tempfile
 from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, cast
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.base_models import Page
@ -21,7 +20,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
    Box,
    map_tesseract_script,
    parse_tesseract_orientation,
    tesseract_box_to_bounding_rectangle,
 )
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@ -93,14 +97,13 @@ class TesseractOcrCliModel(BaseOcrModel):
        return name, version
-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
        r"""
        Run tesseract CLI
        """
        cmd = [self.options.tesseract_cmd]
        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+            lang = self._parse_language(osd)
            if lang is not None:
                cmd.append("-l")
                cmd.append(lang)
@ -139,11 +142,10 @@ class TesseractOcrCliModel(BaseOcrModel):
        return df_filtered
-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
        r"""
        Run tesseract in PSM 0 mode to detect the language
        """
        assert self._tesseract_languages is not None
        cmd = [self.options.tesseract_cmd]
        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
@ -154,7 +156,11 @@ class TesseractOcrCliModel(BaseOcrModel):
        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
        assert self._tesseract_languages is not None
        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None
@ -225,8 +231,14 @@ class TesseractOcrCliModel(BaseOcrModel):
                            ) as image_file:
                                fname = image_file.name
                                high_res_image.save(image_file)
-
+                            df_osd = self._perform_osd(fname)
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = _parse_orientation(df_osd)
                            if doc_orientation != 0:
                                high_res_image = high_res_image.rotate(
                                    doc_orientation, expand=True
                                )
                                high_res_image.save(fname)
                            df_result = self._run_tesseract(fname, df_osd)
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)
@ -238,13 +250,22 @@ class TesseractOcrCliModel(BaseOcrModel):
                            text = row["text"]
                            conf = row["conf"]
-                            l = float(row["left"])  # noqa: E741
+                            rotated_bbox = (
-                            b = float(row["top"])
+                                row["left"],
-                            w = float(row["width"])
+                                row["top"],
-                            h = float(row["height"])
+                                row["width"],
-
+                                row["height"],
-                            t = b + h
+                            )
-                            r = l + w
+                            rotated_bbox = cast(
                                Box, tuple(float(c) for c in rotated_bbox)
                            )
                            rect = tesseract_box_to_bounding_rectangle(
                                rotated_bbox,
                                offset=ocr_rect,
                                scale=self.scale,
                                orientation=doc_orientation,
                                rotated_image_size=high_res_image.size,
                            )
                            cell = TextCell(
                                index=ix,
@ -252,17 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                                orig=str(text),
                                from_ocr=True,
                                confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
+                                rect=rect,
                                    BoundingBox.from_tuple(
                                        coord=(
                                            (l / self.scale) + ocr_rect.l,
                                            (b / self.scale) + ocr_rect.t,
                                            (r / self.scale) + ocr_rect.l,
                                            (t / self.scale) + ocr_rect.t,
                                        ),
                                        origin=CoordOrigin.TOPLEFT,
                                    )
                                ),
                            )
                            all_ocr_cells.append(cell)
@ -278,3 +289,9 @@ class TesseractOcrCliModel(BaseOcrModel):
    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        return TesseractCliOcrOptions
 def _parse_orientation(df_osd: pd.DataFrame) -> int:
    """Read the page orientation out of a parsed tesseract OSD report.

    Args:
        df_osd: Table with a ``key`` and a ``value`` column, one row per
            line of the OSD report.

    Returns:
        The value of the first "Orientation in degrees" row, validated by
        ``parse_tesseract_orientation``; 0 when the report has no such row.

    Raises:
        ValueError: Propagated from ``parse_tesseract_orientation`` when the
            reported value is not an accepted orientation.
    """
    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
    if not orientations:
        # Without this guard, indexing the empty list raises IndexError and
        # aborts the conversion. Falling back to 0 leaves the image
        # unrotated, mirroring how a missing "Script" row is only warned about.
        _log.warning("Tesseract cannot detect the orientation of the page")
        return 0
    return parse_tesseract_orientation(orientations[0].strip())
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -3,10 +3,9 @@ from __future__ import annotations
 import logging
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Type
+from typing import Dict, Iterable, Optional, Type
-from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
    map_tesseract_script,
    parse_tesseract_orientation,
    tesseract_box_to_bounding_rectangle,
 )
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
            if lang == "auto":
                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
                self.osd_reader = tesserocr.PyTessBaseAPI(
                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
                )
            else:
                self.reader = tesserocr.PyTessBaseAPI(
                    **{"lang": lang} | tesserocr_kwargs,
                )
            self.osd_reader = tesserocr.PyTessBaseAPI(
                **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
            )
            self.reader_RIL = tesserocr.RIL
    def __del__(self):
@ -125,6 +128,7 @@ class TesseractOcrModel(BaseOcrModel):
            else:
                with TimeRecorder(conv_res, "ocr"):
                    assert self.reader is not None
                    assert self.osd_reader is not None
                    assert self._tesserocr_languages is not None
                    ocr_rects = self.get_ocr_rects(page)
@ -139,16 +143,17 @@ class TesseractOcrModel(BaseOcrModel):
                        )
                        local_reader = self.reader
                        if "auto" in self.options.lang:
                            assert self.osd_reader is not None
                        self.osd_reader.SetImage(high_res_image)
                        osd = self.osd_reader.DetectOrientationScript()
                        # No text, probably
                        if osd is None:
                            continue
-
+                        doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
                        if doc_orientation != 0:
                            high_res_image = high_res_image.rotate(
                                doc_orientation, expand=True
                            )
                        if "auto" in self.options.lang:
                            script = osd["script_name"]
                            script = map_tesseract_script(script)
                            lang = f"{self.script_prefix}{script}"
@ -188,11 +193,14 @@ class TesseractOcrModel(BaseOcrModel):
                            # Extract text within the bounding box
                            text = local_reader.GetUTF8Text().strip()
                            confidence = local_reader.MeanTextConf()
-                            left = box["x"] / self.scale
+                            rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
-                            bottom = box["y"] / self.scale
+                            rect = tesseract_box_to_bounding_rectangle(
-                            right = (box["x"] + box["w"]) / self.scale
+                                rotated_bbox,
-                            top = (box["y"] + box["h"]) / self.scale
+                                offset=ocr_rect,
-
+                                scale=self.scale,
                                orientation=doc_orientation,
                                rotated_image_size=high_res_image.size,
                            )
                            cells.append(
                                TextCell(
                                    index=ix,
@ -200,12 +208,7 @@ class TesseractOcrModel(BaseOcrModel):
                                    orig=text,
                                    from_ocr=True,
                                    confidence=confidence,
-                                    rect=BoundingRectangle.from_bounding_box(
+                                    rect=rect,
                                        BoundingBox.from_tuple(
                                            coord=(left, top, right, bottom),
                                            origin=CoordOrigin.TOPLEFT,
                                        ),
                                    ),
                                )
                            )
--- a/docling/utils/ocr_utils.py
+++ b/docling/utils/ocr_utils.py
@ -1,3 +1,15 @@
 from typing import Optional, Tuple
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle
 _TESSERACT_ORIENTATIONS = {0, 90, 180, 270}
 Point = Tuple[float, float]
 Box = Tuple[float, float, float, float]
 Size = Tuple[int, int]
 def map_tesseract_script(script: str) -> str:
    r""" """
    if script == "Katakana" or script == "Hiragana":
@ -7,3 +19,75 @@ def map_tesseract_script(script: str) -> str:
    elif script == "Korean":
        script = "Hangul"
    return script
 def reverse_tesseract_preprocessing_rotation(
    box: Box, orientation: int, rotated_im_size: Size
 ) -> tuple[Point, Point, Point, Point]:
    """Map a box found on the rotated image back onto the unrotated image.

    Args:
        box: ``(left, top, width, height)`` of the box on the rotated image.
        orientation: Rotation that was applied to the image; one of 0, 90,
            180 or 270.
        rotated_im_size: ``(width, height)`` of the rotated image.

    Returns:
        The four corners of the box in unrotated image coordinates. The
        first corner is the image of the box's top-left corner and the
        following ones walk around the box, starting along its width.

    Raises:
        ValueError: If ``orientation`` is not one of the handled values.
    """
    left, top, width, height = box
    canvas_w, canvas_h = rotated_im_size

    # (x0, y0) is where the box's top-left corner lands on the unrotated
    # image; (x1, y1) is where the diagonally opposite corner lands.
    if orientation == 0:
        x0, y0 = left, top
        x1, y1 = left + width, top + height
    elif orientation == 90:
        x0, y0 = canvas_h - top, left
        x1, y1 = x0 - height, y0 + width
    elif orientation == 180:
        x0, y0 = canvas_w - left, canvas_h - top
        x1, y1 = x0 - width, y0 - height
    elif orientation == 270:
        x0, y0 = top, canvas_w - left
        x1, y1 = x0 + height, y0 - width
    else:
        msg = (
            f"invalid tesseract document orientation {orientation}, "
            f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
        )
        raise ValueError(msg)

    if orientation in (0, 180):
        # The box's width still runs along the x axis.
        return (x0, y0), (x1, y0), (x1, y1), (x0, y1)
    # Quarter turns: the box's width runs along the y axis.
    return (x0, y0), (x0, y1), (x1, y1), (x1, y0)
 def parse_tesseract_orientation(orientation: str) -> int:
    """Convert an orientation reported by tesseract into a validated integer.

    Args:
        orientation: The reported orientation; it is passed through ``int``.

    Returns:
        The orientation as an integer.

    Raises:
        ValueError: If the value cannot be converted by ``int`` or is not a
            member of ``_TESSERACT_ORIENTATIONS``.
    """
    degrees = int(orientation)
    if degrees in _TESSERACT_ORIENTATIONS:
        return degrees
    # The message echoes the raw input rather than the converted value.
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
    )
 def tesseract_box_to_bounding_rectangle(
    box: Box,
    *,
    offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    rotated_image_size: Size,
 ) -> BoundingRectangle:
    """Turn a tesseract box on the rotated image into a ``BoundingRectangle``.

    Args:
        box: ``(left, top, width, height)`` of the box on the rotated image,
            with a top-left coordinate origin.
        offset: When given, its ``l`` and ``t`` are added to every x and y
            coordinate after scaling.
        scale: Divisor applied to every corner coordinate.
        orientation: Rotation that was applied to the image; forwarded to
            ``reverse_tesseract_preprocessing_rotation``.
        rotated_image_size: ``(width, height)`` of the rotated image.

    Returns:
        The rectangle with a ``TOPLEFT`` coordinate origin, whose four
        corners are the un-rotated, scaled and shifted corners of ``box``.

    Raises:
        ValueError: Propagated from ``reverse_tesseract_preprocessing_rotation``
            for an unsupported ``orientation``.
    """
    (x0, y0), (x1, y1), (x2, y2), (x3, y3) = reverse_tesseract_preprocessing_rotation(
        box, orientation, rotated_image_size
    )
    # Resolve the shift up front so the rectangle is built in its final
    # position instead of being patched attribute by attribute afterwards.
    if offset is not None:
        shift_x, shift_y = offset.l, offset.t
    else:
        shift_x, shift_y = 0.0, 0.0
    return BoundingRectangle(
        r_x0=x0 / scale + shift_x,
        r_y0=y0 / scale + shift_y,
        r_x1=x1 / scale + shift_x,
        r_y1=y1 / scale + shift_y,
        r_x2=x2 / scale + shift_x,
        r_y2=y2 / scale + shift_y,
        r_x3=x3 / scale + shift_x,
        r_y3=y3 / scale + shift_y,
        coord_origin=CoordOrigin.TOPLEFT,
    )
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
@ -0,0 +1,3 @@
 <document>
 <paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
@ -0,0 +1 @@
 {"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
@ -0,0 +1 @@
 package
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json
@ -0,0 +1 @@
 [{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
@ -0,0 +1,4 @@
 <document>
 <paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
 <paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json
@ -0,0 +1 @@
 {"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md
@ -0,0 +1,3 @@
 package
 Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
@ -0,0 +1,3 @@
 <document>
 <paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json
@ -0,0 +1 @@
 {"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_270.pdf", "filename-prov": null, "document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [691.6666666666666, 444.53450520833337, 710.3333333333334, 521.5345052083334], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md
@ -0,0 +1 @@
 package
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
@ -0,0 +1,3 @@
 <document>
 <paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
 </document>
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
@ -0,0 +1 @@
 {"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_90.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.66666666666666, 73.53450520833337, 150.33333333333334, 150.53450520833331], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md
@ -0,0 +1 @@
 package
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt
@ -0,0 +1,3 @@
 <doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
 <text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
@ -0,0 +1 @@
 {"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md
@ -0,0 +1,3 @@
 package
 Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt
@ -0,0 +1,3 @@
 <doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
 <text><loc_411><loc_62><loc_422><loc_127>package</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
@ -0,0 +1 @@
 {"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_270", "origin": {"mimetype": "application/pdf", "binary_hash": 10890858393843077593, "filename": "ocr_test_rotated_270.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 718.6666666666666, "t": 522.8678385416666, "r": 764.0, "b": 91.86783854166669, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 691.6666666666666, "t": 521.5345052083334, "r": 710.3333333333334, "b": 444.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md
@ -0,0 +1 @@
 package
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt
@ -0,0 +1,3 @@
 <doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
 <text><loc_78><loc_374><loc_89><loc_438>package</text>
 </doctag>
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
@ -0,0 +1 @@
 {"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_90", "origin": {"mimetype": "application/pdf", "binary_hash": 6989291015361162334, "filename": "ocr_test_rotated_90.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 78.0, "t": 503.201171875, "r": 123.33333333333333, "b": 72.201171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 131.66666666666666, "t": 150.53450520833331, "r": 150.33333333333334, "b": 73.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md
@ -0,0 +1 @@
 package
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
--- a/tests/data_scanned/ocr_test_rotated_180.pdf
+++ b/tests/data_scanned/ocr_test_rotated_180.pdf
--- a/tests/data_scanned/ocr_test_rotated_270.pdf
+++ b/tests/data_scanned/ocr_test_rotated_270.pdf
--- a/tests/data_scanned/ocr_test_rotated_90.pdf
+++ b/tests/data_scanned/ocr_test_rotated_90.pdf
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -1,6 +1,6 @@
 import sys
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -56,33 +56,35 @@ def get_converter(ocr_options: OcrOptions):
 def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
-    engines: List[OcrOptions] = [
+    engines: List[Tuple[OcrOptions, bool]] = [
-        EasyOcrOptions(),
+        (EasyOcrOptions(), False),
-        TesseractOcrOptions(),
+        (TesseractOcrOptions(), True),
-        TesseractCliOcrOptions(),
+        (TesseractCliOcrOptions(), True),
-        EasyOcrOptions(force_full_page_ocr=True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
-        TesseractOcrOptions(force_full_page_ocr=True),
+        (TesseractOcrOptions(force_full_page_ocr=True), True),
-        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+        (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
-        TesseractCliOcrOptions(force_full_page_ocr=True),
+        (TesseractCliOcrOptions(force_full_page_ocr=True), True),
-        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+        (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
    ]
    # rapidocr is only available for Python >=3.6,<3.13
    if sys.version_info < (3, 13):
-        engines.append(RapidOcrOptions())
+        engines.append((RapidOcrOptions(), False))
-        engines.append(RapidOcrOptions(force_full_page_ocr=True))
+        engines.append((RapidOcrOptions(force_full_page_ocr=True), False))
    # only works on mac
    if "darwin" == sys.platform:
-        engines.append(OcrMacOptions())
+        engines.append((OcrMacOptions(), True))
-        engines.append(OcrMacOptions(force_full_page_ocr=True))
+        engines.append((OcrMacOptions(force_full_page_ocr=True), True))
-    for ocr_options in engines:
+    for ocr_options, supports_rotation in engines:
        print(
            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        )
        converter = get_converter(ocr_options=ocr_options)
        for pdf_path in pdf_paths:
            if not supports_rotation and "rotated" in pdf_path.name:
                continue
            print(f"converting {pdf_path}")
            doc_result: ConversionResult = converter.convert(pdf_path)
		`@ -0,0 +1 @@`
							{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
		`@ -0,0 +1 @@`
							[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
		`@ -0,0 +1,3 @@`
							`package`

							`Docling bundles PDF document conversion to JSON and Markdown in an easy self contained`
		`@ -0,0 +1 @@`
							{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}