Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems

- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
Suhwan Seo 2024-11-08 19:05:33 +09:00
parent 719cfe93c3
commit 944988cb30
7 changed files with 55 additions and 44 deletions

View File

@ -23,12 +23,12 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions,
OcrOptions, OcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
TableFormerMode, TableFormerMode,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
OcrMacOptions,
) )
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

View File

@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions):
extra="forbid", extra="forbid",
) )
class OcrMacOptions(OcrOptions): class OcrMacOptions(OcrOptions):
kind: Literal["ocrmac"] = "ocrmac" kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"] lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = ( ocr_options: Union[
Field(EasyOcrOptions(), discriminator="kind") EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
) ] = Field(EasyOcrOptions(), discriminator="kind")
images_scale: float = 1.0 images_scale: float = 1.0
generate_page_images: bool = False generate_page_images: bool = False

View File

@ -1,6 +1,5 @@
import logging import logging
import tempfile import tempfile
from typing import Iterable, Optional, Tuple from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -12,7 +11,6 @@ from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel):
self.reader_RIL = ocrmac.OCR self.reader_RIL = ocrmac.OCR
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel):
fname = image_file.name fname = image_file.name
high_res_image.save(fname) high_res_image.save(fname)
boxes = self.reader_RIL(fname, boxes = self.reader_RIL(
fname,
recognition_level=self.options.recognition, recognition_level=self.options.recognition,
framework=self.options.framework, framework=self.options.framework,
language_preference=self.options.lang, language_preference=self.options.lang,

View File

@ -1,4 +1,5 @@
import logging import logging
import sys
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
OcrMacOptions,
) )
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import ( from docling.models.page_preprocessing_model import (
PagePreprocessingModel, PagePreprocessingModel,
@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import (
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline):
options=self.pipeline_options.ocr_options, options=self.pipeline_options.ocr_options,
) )
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions): elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
)
return OcrMacModel( return OcrMacModel(
enabled=self.pipeline_options.do_ocr, enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options, options=self.pipeline_options.ocr_options,

View File

@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru
$ docling --help $ docling --help
Usage: docling [OPTIONS] source Usage: docling [OPTIONS] source
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None]
│ [required] │ │ [required]
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │ │ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
│ Defaults to all formats. │ Defaults to all formats. │
│ [default: None] │ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │ │ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │ Markdown. │
│ [default: None] │ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │ │ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │ processed using OCR. │
│ [default: ocr] │ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │ │ --ocr-engine [easyocr|tesseract_cli|tesseract|ocrmac] The OCR engine to use. │
│ [default: easyocr] │ [default: easyocr] │
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ │ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
│ [default: dlparse_v1] │ [default: dlparse_v1] │
│ --table-mode [fast|accurate] The mode to use in the table structure │ │ --table-mode [fast|accurate] The mode to use in the table structure │
│ model. │ model. │
│ [default: fast] │ [default: fast] │
│ --artifacts-path PATH If provided, the location of the model │ │ --artifacts-path PATH If provided, the location of the model │
│ artifacts. │ artifacts. │
│ [default: None] │ [default: None] │
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ │ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
│ processed using OCR. │ processed using OCR. │
│ [default: no-abort-on-error] │ [default: no-abort-on-error] │
│ --output PATH Output directory where results are │ │ --output PATH Output directory where results are │
│ saved. │ saved. │
│ [default: .] │ [default: .] │
│ --version Show version information. │ │ --version Show version information. │
│ --help Show this message and exit. │ │ --help Show this message and exit. │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
``` ```
</details> </details>

View File

@ -128,6 +128,7 @@ module = [
"tesserocr.*", "tesserocr.*",
"docling_ibm_models.*", "docling_ibm_models.*",
"easyocr.*", "easyocr.*",
"ocrmac.*",
"deepsearch_glm.*", "deepsearch_glm.*",
"lxml.*", "lxml.*",
"bs4.*", "bs4.*",

View File

@ -1,3 +1,4 @@
import sys
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions,
OcrOptions, OcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
@ -83,6 +85,10 @@ def test_e2e_conversions():
TesseractCliOcrOptions(), TesseractCliOcrOptions(),
] ]
# only works on mac
if "darwin" == sys.platform:
engines.append(OcrMacOptions())
for ocr_options in engines: for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}") print(f"Converting with ocr_engine: {ocr_options.kind}")
converter = get_converter(ocr_options=ocr_options) converter = get_converter(ocr_options=ocr_options)