Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems

- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
2025-07-30 14:04:27 +00:00 · 2024-11-08 19:05:33 +09:00 · 2024-11-08 19:05:33 +09:00 · 944988cb30
commit 944988cb30
parent 719cfe93c3
7 changed files with 55 additions and 44 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -23,12 +23,12 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
+    OcrMacOptions,
    OcrOptions,
    PdfPipelineOptions,
    TableFormerMode,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
-    OcrMacOptions,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions):
        extra="forbid",
    )

+
 class OcrMacOptions(OcrOptions):
    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
@@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
-        Field(EasyOcrOptions(), discriminator="kind")
-    )
+    ocr_options: Union[
+        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+    ] = Field(EasyOcrOptions(), discriminator="kind")

    images_scale: float = 1.0
    generate_page_images: bool = False
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -1,6 +1,5 @@
 import logging
 import tempfile
-
 from typing import Iterable, Optional, Tuple

 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -12,7 +11,6 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.profiling import TimeRecorder

-
 _log = logging.getLogger(__name__)


@@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel):

            self.reader_RIL = ocrmac.OCR

-
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
@@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel):
                            fname = image_file.name
                            high_res_image.save(fname)

-                            boxes = self.reader_RIL(fname,
+                            boxes = self.reader_RIL(
+                                fname,
                                recognition_level=self.options.recognition,
                                framework=self.options.framework,
                                language_preference=self.options.lang,
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional

@@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
+    OcrMacOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
-    OcrMacOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
    PagePreprocessingModel,
@@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import (
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.models.ocr_mac_model import OcrMacModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder

@@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline):
                options=self.pipeline_options.ocr_options,
            )
        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            if "darwin" != sys.platform:
+                raise RuntimeError(
+                    f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
+                )
            return OcrMacModel(
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru
 $ docling --help

 Usage: docling [OPTIONS] source                                                                                             
-                                                                                                                             
-╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None]         │
-│                                 [required]                                                                                │
-╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
-╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ --from                                     [docx|pptx|html|image|pdf|asciidoc|md]  Specify input formats to convert from. │
-│                                                                                    Defaults to all formats.               │
-│                                                                                    [default: None]                        │
-│ --to                                       [md|json|text|doctags]                  Specify output formats. Defaults to    │
-│                                                                                    Markdown.                              │
-│                                                                                    [default: None]                        │
-│ --ocr               --no-ocr                                                       If enabled, the bitmap content will be │
-│                                                                                    processed using OCR.                   │
-│                                                                                    [default: ocr]                         │
-│ --ocr-engine                               [easyocr|tesseract_cli|tesseract]       The OCR engine to use.                 │
-│                                                                                    [default: easyocr]                     │
-│ --pdf-backend                              [pypdfium2|dlparse_v1|dlparse_v2]       The PDF backend to use.                │
-│                                                                                    [default: dlparse_v1]                  │
-│ --table-mode                               [fast|accurate]                         The mode to use in the table structure │
-│                                                                                    model.                                 │
-│                                                                                    [default: fast]                        │
-│ --artifacts-path                           PATH                                    If provided, the location of the model │
-│                                                                                    artifacts.                             │
-│                                                                                    [default: None]                        │
-│ --abort-on-error    --no-abort-on-error                                            If enabled, the bitmap content will be │
-│                                                                                    processed using OCR.                   │
-│                                                                                    [default: no-abort-on-error]           │
-│ --output                                   PATH                                    Output directory where results are     │
-│                                                                                    saved.                                 │
-│                                                                                    [default: .]                           │
-│ --version                                                                          Show version information.              │
-│ --help                                                                             Show this message and exit.            │
-╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None]           │
+│                                 [required]                                                                                  │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ --from                                     [docx|pptx|html|image|pdf|asciidoc|md]    Specify input formats to convert from. │
+│                                                                                      Defaults to all formats.               │
+│                                                                                      [default: None]                        │
+│ --to                                       [md|json|text|doctags]                    Specify output formats. Defaults to    │
+│                                                                                      Markdown.                              │
+│                                                                                      [default: None]                        │
+│ --ocr               --no-ocr                                                         If enabled, the bitmap content will be │
+│                                                                                      processed using OCR.                   │
+│                                                                                      [default: ocr]                         │
+│ --ocr-engine                               [easyocr|tesseract_cli|tesseract|ocrmac]  The OCR engine to use.                 │
+│                                                                                      [default: easyocr]                     │
+│ --pdf-backend                              [pypdfium2|dlparse_v1|dlparse_v2]         The PDF backend to use.                │
+│                                                                                      [default: dlparse_v1]                  │
+│ --table-mode                               [fast|accurate]                           The mode to use in the table structure │
+│                                                                                      model.                                 │
+│                                                                                      [default: fast]                        │
+│ --artifacts-path                           PATH                                      If provided, the location of the model │
+│                                                                                      artifacts.                             │
+│                                                                                      [default: None]                        │
+│ --abort-on-error    --no-abort-on-error                                              If enabled, the bitmap content will be │
+│                                                                                      processed using OCR.                   │
+│                                                                                      [default: no-abort-on-error]           │
+│ --output                                   PATH                                      Output directory where results are     │
+│                                                                                      saved.                                 │
+│                                                                                      [default: .]                           │
+│ --version                                                                            Show version information.              │
+│ --help                                                                               Show this message and exit.            │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

 ```
 </details>
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -128,6 +128,7 @@ module = [
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
+    "ocrmac.*",
    "deepsearch_glm.*",
    "lxml.*",
    "bs4.*",
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -1,3 +1,4 @@
+import sys
 from pathlib import Path
 from typing import List

@@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
+    OcrMacOptions,
    OcrOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
@@ -83,6 +85,10 @@ def test_e2e_conversions():
        TesseractCliOcrOptions(),
    ]

+    # only works on mac
+    if "darwin" == sys.platform:
+        engines.append(OcrMacOptions())
+
    for ocr_options in engines:
        print(f"Converting with ocr_engine: {ocr_options.kind}")
        converter = get_converter(ocr_options=ocr_options)