From 944988cb3049995043e14ba6e30d1204be5c1558 Mon Sep 17 00:00:00 2001 From: Suhwan Seo Date: Fri, 8 Nov 2024 19:05:33 +0900 Subject: [PATCH] Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo --- docling/cli/main.py | 2 +- docling/datamodel/pipeline_options.py | 7 ++- docling/models/ocr_mac_model.py | 6 +- docling/pipeline/standard_pdf_pipeline.py | 9 ++- docs/usage.md | 68 +++++++++++------------ pyproject.toml | 1 + tests/test_e2e_ocr_conversion.py | 6 ++ 7 files changed, 55 insertions(+), 44 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index e965e07a..157ac8d8 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -23,12 +23,12 @@ from docling.datamodel.base_models import ( from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, OcrOptions, PdfPipelineOptions, TableFormerMode, TesseractCliOcrOptions, TesseractOcrOptions, - OcrMacOptions, ) from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 9efcfc6f..fa5e9a85 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions): extra="forbid", ) + class OcrMacOptions(OcrOptions): kind: Literal["ocrmac"] = "ocrmac" lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"] @@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = ( - Field(EasyOcrOptions(), discriminator="kind") - ) + ocr_options: Union[ + EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions + ] = Field(EasyOcrOptions(), discriminator="kind") images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index cbe8fbd9..c0c587ff 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -1,6 +1,5 @@ import logging import tempfile - from typing import Iterable, Optional, Tuple from docling_core.types.doc import BoundingBox, CoordOrigin @@ -12,7 +11,6 @@ from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel from docling.utils.profiling import TimeRecorder - _log = logging.getLogger(__name__) @@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel): self.reader_RIL = ocrmac.OCR - def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: @@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel): fname = image_file.name high_res_image.save(fname) - boxes = self.reader_RIL(fname, + boxes = self.reader_RIL( + fname, recognition_level=self.options.recognition, framework=self.options.framework, language_preference=self.options.lang, diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 5e059fc8..63a7a89f 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional @@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, - OcrMacOptions, ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel +from docling.models.ocr_mac_model import OcrMacModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_preprocessing_model import ( PagePreprocessingModel, @@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import ( from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel -from docling.models.ocr_mac_model import OcrMacModel from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder @@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline): options=self.pipeline_options.ocr_options, ) elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions): + if "darwin" != sys.platform: + raise RuntimeError( + f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}." + ) return OcrMacModel( enabled=self.pipeline_options.do_ocr, options=self.pipeline_options.ocr_options, diff --git a/docs/usage.md b/docs/usage.md index f58a72dd..90a33f10 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru $ docling --help Usage: docling [OPTIONS] source - -╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │ -│ [required] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │ -│ Defaults to all formats. │ -│ [default: None] │ -│ --to [md|json|text|doctags] Specify output formats. Defaults to │ -│ Markdown. │ -│ [default: None] │ -│ --ocr --no-ocr If enabled, the bitmap content will be │ -│ processed using OCR. │ -│ [default: ocr] │ -│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │ -│ [default: easyocr] │ -│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ -│ [default: dlparse_v1] │ -│ --table-mode [fast|accurate] The mode to use in the table structure │ -│ model. │ -│ [default: fast] │ -│ --artifacts-path PATH If provided, the location of the model │ -│ artifacts. │ -│ [default: None] │ -│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ -│ processed using OCR. │ -│ [default: no-abort-on-error] │ -│ --output PATH Output directory where results are │ -│ saved. │ -│ [default: .] │ -│ --version Show version information. │ -│ --help Show this message and exit. │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + +╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │ +│ [required] │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │ +│ Defaults to all formats. │ +│ [default: None] │ +│ --to [md|json|text|doctags] Specify output formats. Defaults to │ +│ Markdown. │ +│ [default: None] │ +│ --ocr --no-ocr If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: ocr] │ +│ --ocr-engine [easyocr|tesseract_cli|tesseract|ocrmac] The OCR engine to use. │ +│ [default: easyocr] │ +│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ +│ [default: dlparse_v1] │ +│ --table-mode [fast|accurate] The mode to use in the table structure │ +│ model. │ +│ [default: fast] │ +│ --artifacts-path PATH If provided, the location of the model │ +│ artifacts. │ +│ [default: None] │ +│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: no-abort-on-error] │ +│ --output PATH Output directory where results are │ +│ saved. │ +│ [default: .] │ +│ --version Show version information. │ +│ --help Show this message and exit. │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` diff --git a/pyproject.toml b/pyproject.toml index 3340d336..f9a4a876 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -128,6 +128,7 @@ module = [ "tesserocr.*", "docling_ibm_models.*", "easyocr.*", + "ocrmac.*", "deepsearch_glm.*", "lxml.*", "bs4.*", diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 2aeda467..1bd43936 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path from typing import List @@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, OcrOptions, PdfPipelineOptions, TesseractCliOcrOptions, @@ -83,6 +85,10 @@ def test_e2e_conversions(): TesseractCliOcrOptions(), ] + # only works on mac + if "darwin" == sys.platform: + engines.append(OcrMacOptions()) + for ocr_options in engines: print(f"Converting with ocr_engine: {ocr_options.kind}") converter = get_converter(ocr_options=ocr_options)