From 944988cb3049995043e14ba6e30d1204be5c1558 Mon Sep 17 00:00:00 2001
From: Suhwan Seo <nuridol@gmail.com>
Date: Fri, 8 Nov 2024 19:05:33 +0900
Subject: [PATCH] Fix linting issues, update CLI docs, and add error for ocrmac
 use on non-Mac systems

- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
---
 docling/cli/main.py                       |  2 +-
 docling/datamodel/pipeline_options.py     |  7 ++-
 docling/models/ocr_mac_model.py           |  6 +-
 docling/pipeline/standard_pdf_pipeline.py |  9 ++-
 docs/usage.md                             | 68 +++++++++++------------
 pyproject.toml                            |  1 +
 tests/test_e2e_ocr_conversion.py          |  6 ++
 7 files changed, 55 insertions(+), 44 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index e965e07a..157ac8d8 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -23,12 +23,12 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     OcrOptions,
     PdfPipelineOptions,
     TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
-    OcrMacOptions,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 9efcfc6f..fa5e9a85 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions):
         extra="forbid",
     )
 
+
 class OcrMacOptions(OcrOptions):
     kind: Literal["ocrmac"] = "ocrmac"
     lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
@@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
-        Field(EasyOcrOptions(), discriminator="kind")
-    )
+    ocr_options: Union[
+        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+    ] = Field(EasyOcrOptions(), discriminator="kind")
 
     images_scale: float = 1.0
     generate_page_images: bool = False
diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
index cbe8fbd9..c0c587ff 100644
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -1,6 +1,5 @@
 import logging
 import tempfile
-
 from typing import Iterable, Optional, Tuple
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -12,7 +11,6 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.profiling import TimeRecorder
 
-
 _log = logging.getLogger(__name__)
 
 
@@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel):
 
             self.reader_RIL = ocrmac.OCR
 
-
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel):
                             fname = image_file.name
                             high_res_image.save(fname)
 
-                            boxes = self.reader_RIL(fname,
+                            boxes = self.reader_RIL(
+                                fname,
                                 recognition_level=self.options.recognition,
                                 framework=self.options.framework,
                                 language_preference=self.options.lang,
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 5e059fc8..63a7a89f 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional
 
@@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
-    OcrMacOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
@@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import (
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.models.ocr_mac_model import OcrMacModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline):
                 options=self.pipeline_options.ocr_options,
             )
         elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            if "darwin" != sys.platform:
+                raise RuntimeError(
+                    f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
+                )
             return OcrMacModel(
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
diff --git a/docs/usage.md b/docs/usage.md
index f58a72dd..90a33f10 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru
 $ docling --help
 
  Usage: docling [OPTIONS] source                                                                                             
-                                                                                                                             
-╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None]         │
-│                                 [required]                                                                                │
-╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
-╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ --from                                     [docx|pptx|html|image|pdf|asciidoc|md]  Specify input formats to convert from. │
-│                                                                                    Defaults to all formats.               │
-│                                                                                    [default: None]                        │
-│ --to                                       [md|json|text|doctags]                  Specify output formats. Defaults to    │
-│                                                                                    Markdown.                              │
-│                                                                                    [default: None]                        │
-│ --ocr               --no-ocr                                                       If enabled, the bitmap content will be │
-│                                                                                    processed using OCR.                   │
-│                                                                                    [default: ocr]                         │
-│ --ocr-engine                               [easyocr|tesseract_cli|tesseract]       The OCR engine to use.                 │
-│                                                                                    [default: easyocr]                     │
-│ --pdf-backend                              [pypdfium2|dlparse_v1|dlparse_v2]       The PDF backend to use.                │
-│                                                                                    [default: dlparse_v1]                  │
-│ --table-mode                               [fast|accurate]                         The mode to use in the table structure │
-│                                                                                    model.                                 │
-│                                                                                    [default: fast]                        │
-│ --artifacts-path                           PATH                                    If provided, the location of the model │
-│                                                                                    artifacts.                             │
-│                                                                                    [default: None]                        │
-│ --abort-on-error    --no-abort-on-error                                            If enabled, the bitmap content will be │
-│                                                                                    processed using OCR.                   │
-│                                                                                    [default: no-abort-on-error]           │
-│ --output                                   PATH                                    Output directory where results are     │
-│                                                                                    saved.                                 │
-│                                                                                    [default: .]                           │
-│ --version                                                                          Show version information.              │
-│ --help                                                                             Show this message and exit.            │
-╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None]           │
+│                                 [required]                                                                                  │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ --from                                     [docx|pptx|html|image|pdf|asciidoc|md]    Specify input formats to convert from. │
+│                                                                                      Defaults to all formats.               │
+│                                                                                      [default: None]                        │
+│ --to                                       [md|json|text|doctags]                    Specify output formats. Defaults to    │
+│                                                                                      Markdown.                              │
+│                                                                                      [default: None]                        │
+│ --ocr               --no-ocr                                                         If enabled, the bitmap content will be │
+│                                                                                      processed using OCR.                   │
+│                                                                                      [default: ocr]                         │
+│ --ocr-engine                               [easyocr|tesseract_cli|tesseract|ocrmac]  The OCR engine to use.                 │
+│                                                                                      [default: easyocr]                     │
+│ --pdf-backend                              [pypdfium2|dlparse_v1|dlparse_v2]         The PDF backend to use.                │
+│                                                                                      [default: dlparse_v1]                  │
+│ --table-mode                               [fast|accurate]                           The mode to use in the table structure │
+│                                                                                      model.                                 │
+│                                                                                      [default: fast]                        │
+│ --artifacts-path                           PATH                                      If provided, the location of the model │
+│                                                                                      artifacts.                             │
+│                                                                                      [default: None]                        │
+│ --abort-on-error    --no-abort-on-error                                              If enabled, the conversion will abort  │
+│                                                                                      when an error is encountered.          │
+│                                                                                      [default: no-abort-on-error]           │
+│ --output                                   PATH                                      Output directory where results are     │
+│                                                                                      saved.                                 │
+│                                                                                      [default: .]                           │
+│ --version                                                                            Show version information.              │
+│ --help                                                                               Show this message and exit.            │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 
 ```
 </details>
diff --git a/pyproject.toml b/pyproject.toml
index 3340d336..f9a4a876 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -128,6 +128,7 @@ module = [
     "tesserocr.*",
     "docling_ibm_models.*",
     "easyocr.*",
+    "ocrmac.*",
     "deepsearch_glm.*",
     "lxml.*",
     "bs4.*",
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 2aeda467..1bd43936 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -1,3 +1,4 @@
+import sys
 from pathlib import Path
 from typing import List
 
@@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     OcrOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
@@ -83,6 +85,10 @@ def test_e2e_conversions():
         TesseractCliOcrOptions(),
     ]
 
+    # only works on mac
+    if "darwin" == sys.platform:
+        engines.append(OcrMacOptions())
+
     for ocr_options in engines:
         print(f"Converting with ocr_engine: {ocr_options.kind}")
         converter = get_converter(ocr_options=ocr_options)