feat: add support for ocrmac OCR engine on macOS

- Integrates `ocrmac` as an OCR engine option for macOS users.
- Adds configuration options and dependencies for `ocrmac`.
- Updates documentation to reflect new engine support.

This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
2025-07-30 14:04:27 +00:00 · 2024-11-08 09:08:55 +09:00 · 2024-11-08 09:08:55 +09:00 · 4aaf128384
commit 4aaf128384
parent 6c22cba0a7
6 changed files with 147 additions and 1 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
    TableFormerMode,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

@ -73,6 +74,7 @@ class OcrEngine(str, Enum):
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"


 def export_documents(
@ -224,6 +226,8 @@ def convert(
            ocr_options = TesseractCliOcrOptions()
        case OcrEngine.TESSERACT:
            ocr_options = TesseractOcrOptions()
+        case OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions()
        case _:
            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -60,6 +60,16 @@ class TesseractOcrOptions(OcrOptions):
        extra="forbid",
    )

+class OcrMacOptions(OcrOptions):
+    """Options for the `ocrmac` OCR engine."""
+
+    # Tag value; `PdfPipelineOptions.ocr_options` uses `kind` as the
+    # discriminator to pick the matching options class.
+    kind: Literal["ocrmac"] = "ocrmac"
+    # Forwarded to ocrmac as `language_preference`.
+    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
+    # Forwarded to ocrmac as `recognition_level`.
+    recognition: str = "accurate"
+    # Forwarded to ocrmac as `framework`.
+    framework: str = "vision"
+
+    # Reject unknown fields instead of silently ignoring them.
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+

 class PipelineOptions(BaseModel):
    create_legacy_output: bool = (
@ -73,7 +83,7 @@ class PdfPipelineOptions(PipelineOptions):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
        Field(EasyOcrOptions(), discriminator="kind")
    )

--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@ -0,0 +1,123 @@
+import logging
+import tempfile
+
+from typing import Iterable, Optional, Tuple
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import OcrMacOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+
+
+_log = logging.getLogger(__name__)
+
+
+class OcrMacModel(BaseOcrModel):
+    """OCR model that recognizes page text through the `ocrmac` package."""
+
+    def __init__(self, enabled: bool, options: OcrMacOptions):
+        """Store the options and, when enabled, import `ocrmac`.
+
+        Args:
+            enabled: Whether OCR should run; when False `ocrmac` is never
+                imported and `__call__` passes pages through unchanged.
+            options: Engine settings (recognition level, framework, languages).
+
+        Raises:
+            ImportError: If the model is enabled and `ocrmac` cannot be imported.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: OcrMacOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            setup_errmsg = (
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            # Imported lazily so the module can be loaded without ocrmac installed.
+            try:
+                from ocrmac import ocrmac
+            except ImportError:
+                # NOTE(review): the original exception is not chained
+                # (`raise ... from err`), so its detail only appears as
+                # implicit context in the traceback.
+                raise ImportError(setup_errmsg)
+
+            # Keep a reference to the OCR class; one instance is built per image.
+            self.reader_RIL = ocrmac.OCR
+
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Run OCR on each page of the batch and yield the pages.
+
+        Args:
+            conv_res: Conversion result passed to the time recorder and to
+                the debug visualization.
+            page_batch: Pages to process.
+
+        Yields:
+            Every input page, in order. Pages with a valid backend have the
+            filtered OCR cells appended to `page.cells`; pages with an
+            invalid backend, and all pages when the model is disabled, are
+            yielded unchanged.
+        """
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page_idx, page in enumerate(page_batch):
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        # Render only the region to OCR, upscaled by `self.scale`.
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+
+                        # ocrmac is given a file path, so the crop goes through
+                        # a temporary PNG that is removed when the block exits.
+                        # NOTE(review): the handle is opened in text mode ("w")
+                        # but never written to; the image is saved by name.
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            boxes = self.reader_RIL(fname,
+                                recognition_level=self.options.recognition,
+                                framework=self.options.framework,
+                                language_preference=self.options.lang,
+                            ).recognize()
+
+                        im_width, im_height = high_res_image.size
+                        cells = []
+                        # Each result is unpacked as (text, confidence, box). The
+                        # maths below treats `box` as (x, y, w, h) fractions of
+                        # the image size with y measured from the bottom edge —
+                        # presumably Vision's normalized coordinates; confirm
+                        # against the ocrmac documentation.
+                        for ix, (text, confidence, box) in enumerate(boxes):
+                            x = float(box[0])
+                            y = float(box[1])
+                            w = float(box[2])
+                            h = float(box[3])
+
+                            # Convert to pixels and flip y to a top-left origin:
+                            # y2 is the box's bottom edge, y1 its top edge.
+                            x1 = x * im_width
+                            y2 = (1 - y) * im_height
+
+                            x2 = x1 + w * im_width
+                            y1 = y2 - h * im_height
+
+                            # Undo the render upscaling.
+                            # NOTE(review): these values are relative to the
+                            # cropped image; the origin of `ocr_rect` is not
+                            # added back — confirm whether cells should be
+                            # translated into page coordinates.
+                            left = x1 / self.scale
+                            top = y1 / self.scale
+                            right = x2 / self.scale
+                            bottom = y2 / self.scale
+
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
+
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
+
+                    page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder

@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
+        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            return OcrMacModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
        return None

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
--- a/docs/installation.md
+++ b/docs/installation.md
@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
    | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
    | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
    | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+    | OcrMac | macOS only. Install via `pip install ocrmac`. | `OcrMacOptions` |

    The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example

--- a/pyproject.toml
+++ b/pyproject.toml
@ -89,6 +89,7 @@ torchvision = [
  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
  {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
 ]
+ocrmac = {markers = "sys_platform == 'darwin'", version = "^1.0.0"}

 [tool.poetry.extras]
 tesserocr = ["tesserocr"]