Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems

- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
Suhwan Seo 2024-11-08 19:05:33 +09:00
parent 719cfe93c3
commit 944988cb30
7 changed files with 55 additions and 44 deletions

View File

@ -23,12 +23,12 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
OcrMacOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

View File

@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions):
extra="forbid",
)
class OcrMacOptions(OcrOptions):
kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
Field(EasyOcrOptions(), discriminator="kind")
)
ocr_options: Union[
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
] = Field(EasyOcrOptions(), discriminator="kind")
images_scale: float = 1.0
generate_page_images: bool = False

View File

@ -1,6 +1,5 @@
import logging
import tempfile
from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -12,7 +11,6 @@ from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel):
self.reader_RIL = ocrmac.OCR
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel):
fname = image_file.name
high_res_image.save(fname)
boxes = self.reader_RIL(fname,
boxes = self.reader_RIL(
fname,
recognition_level=self.options.recognition,
framework=self.options.framework,
language_preference=self.options.lang,

View File

@ -1,4 +1,5 @@
import logging
import sys
from pathlib import Path
from typing import Optional
@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
OcrMacOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import (
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline):
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
)
return OcrMacModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,

View File

@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru
$ docling --help
Usage: docling [OPTIONS] source
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
│ [required] │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
│ Defaults to all formats. │
│ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │
│ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
│ [default: easyocr] │
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
│ [default: dlparse_v1] │
│ --table-mode [fast|accurate] The mode to use in the table structure │
│ model. │
│ [default: fast] │
│ --artifacts-path PATH If provided, the location of the model │
│ artifacts. │
│ [default: None] │
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: no-abort-on-error] │
│ --output PATH Output directory where results are │
│ saved. │
│ [default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None]
│ [required]
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
Defaults to all formats. │
[default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
Markdown. │
[default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
processed using OCR. │
[default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract|ocrmac] The OCR engine to use. │
[default: easyocr] │
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
[default: dlparse_v1] │
│ --table-mode [fast|accurate] The mode to use in the table structure │
model. │
[default: fast] │
│ --artifacts-path PATH If provided, the location of the model │
artifacts. │
[default: None] │
│ --abort-on-error --no-abort-on-error If enabled, processing will be aborted │
when the first error is encountered. │
[default: no-abort-on-error] │
│ --output PATH Output directory where results are │
saved. │
[default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
```
</details>

View File

@ -128,6 +128,7 @@ module = [
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"ocrmac.*",
"deepsearch_glm.*",
"lxml.*",
"bs4.*",

View File

@ -1,3 +1,4 @@
import sys
from pathlib import Path
from typing import List
@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
@ -83,6 +85,10 @@ def test_e2e_conversions():
TesseractCliOcrOptions(),
]
# the ocrmac engine is only available on macOS
if "darwin" == sys.platform:
engines.append(OcrMacOptions())
for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}")
converter = get_converter(ocr_options=ocr_options)