From 4aaf128384d837ae0385343202c9d3080869ddec Mon Sep 17 00:00:00 2001
From: NuRi <nuridol+github@gmail.com>
Date: Fri, 8 Nov 2024 09:08:55 +0900
Subject: [PATCH] feat: add support for `ocrmac` OCR engine on macOS

- Integrates `ocrmac` as an OCR engine option for macOS users.
- Adds configuration options and dependencies for `ocrmac`.
- Updates documentation to reflect new engine support.

This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
---
 docling/cli/main.py                       |   4 +
 docling/datamodel/pipeline_options.py     |  12 ++-
 docling/models/ocr_mac_model.py           | 123 ++++++++++++++++++++++
 docling/pipeline/standard_pdf_pipeline.py |   7 ++
 docs/installation.md                      |   1 +
 pyproject.toml                            |   1 +
 6 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 docling/models/ocr_mac_model.py

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 35ae01df..e965e07a 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
     TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 
@@ -73,6 +74,7 @@ class OcrEngine(str, Enum):
     EASYOCR = "easyocr"
     TESSERACT_CLI = "tesseract_cli"
     TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
 
 
 def export_documents(
@@ -224,6 +226,8 @@ def convert(
             ocr_options = TesseractCliOcrOptions()
         case OcrEngine.TESSERACT:
             ocr_options = TesseractOcrOptions()
+        case OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions()
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d57f1671..9efcfc6f 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -60,6 +60,16 @@ class TesseractOcrOptions(OcrOptions):
         extra="forbid",
     )
 
+class OcrMacOptions(OcrOptions):  # Options for the `ocrmac` OCR engine.
+    kind: Literal["ocrmac"] = "ocrmac"  # discriminator value for `ocr_options`
+    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]  # sent as `language_preference`
+    recognition: str = "accurate"  # sent to ocrmac as `recognition_level`
+    framework: str = "vision"  # sent to ocrmac as `framework`
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
 
 class PipelineOptions(BaseModel):
     create_legacy_output: bool = (
@@ -73,7 +83,7 @@ class PdfPipelineOptions(PipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
 
diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
new file mode 100644
index 00000000..cbe8fbd9
--- /dev/null
+++ b/docling/models/ocr_mac_model.py
@@ -0,0 +1,123 @@
+import logging
+import tempfile
+
+from typing import Iterable, Optional, Tuple
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import OcrMacOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+
+
+_log = logging.getLogger(__name__)
+
+
+class OcrMacModel(BaseOcrModel):
+    """OCR model backed by `ocrmac`; raises ImportError if enabled without it."""
+
+    def __init__(self, enabled: bool, options: OcrMacOptions):
+        """Store the options and, when enabled, import the ocrmac backend."""
+        super().__init__(enabled=enabled, options=options)
+        self.options: OcrMacOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            setup_errmsg = (
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                from ocrmac import ocrmac
+            except ImportError as err:
+                raise ImportError(setup_errmsg) from err
+
+            self.reader_RIL = ocrmac.OCR
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Yield every page; valid pages get OCR cells appended when enabled."""
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+
+                        # ocrmac is handed a file path, so save the crop to a temp PNG.
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            boxes = self.reader_RIL(
+                                fname,
+                                recognition_level=self.options.recognition,
+                                framework=self.options.framework,
+                                language_preference=self.options.lang,
+                            ).recognize()
+
+                        im_width, im_height = high_res_image.size
+                        cells = []
+                        for ix, (text, confidence, box) in enumerate(boxes):
+                            # box = (x, y, w, h) as fractions, y from the bottom edge.
+                            x, y, w, h = (float(v) for v in box[:4])
+
+                            x1 = x * im_width
+                            y2 = (1 - y) * im_height
+                            x2 = x1 + w * im_width
+                            y1 = y2 - h * im_height
+
+                            # Crop-relative: undo the scale, then add the crop origin.
+                            left = x1 / self.scale + ocr_rect.l
+                            top = y1 / self.scale + ocr_rect.t
+                            right = x2 / self.scale + ocr_rect.l
+                            bottom = y2 / self.scale + ocr_rect.t
+
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        all_ocr_cells.extend(cells)
+
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
+
+                    page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 65803d4f..5e059fc8 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
@@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
             )
+        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            return OcrMacModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
         return None
 
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
diff --git a/docs/installation.md b/docs/installation.md
index df18dece..7701543b 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
     | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
     | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
     | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+    | OcrMac | macOS only. Installed with Docling on macOS or via `pip install ocrmac`. | `OcrMacOptions` |
 
     The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
 
diff --git a/pyproject.toml b/pyproject.toml
index c1196d01..7504f74d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,6 +89,7 @@ torchvision = [
   {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
   {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
 ]
+ocrmac = {markers = "sys_platform == 'darwin'", version = "^1.0.0"}
 
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]