mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems
- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
parent
719cfe93c3
commit
944988cb30
@ -23,12 +23,12 @@ from docling.datamodel.base_models import (
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
|
@ -60,6 +60,7 @@ class TesseractOcrOptions(OcrOptions):
|
||||
extra="forbid",
|
||||
)
|
||||
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
@ -83,9 +84,9 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
ocr_options: Union[
|
||||
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
|
||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
|
@ -1,6 +1,5 @@
|
||||
import logging
|
||||
import tempfile
|
||||
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -12,7 +11,6 @@ from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -36,7 +34,6 @@ class OcrMacModel(BaseOcrModel):
|
||||
|
||||
self.reader_RIL = ocrmac.OCR
|
||||
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
@ -69,7 +66,8 @@ class OcrMacModel(BaseOcrModel):
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
|
||||
boxes = self.reader_RIL(fname,
|
||||
boxes = self.reader_RIL(
|
||||
fname,
|
||||
recognition_level=self.options.recognition,
|
||||
framework=self.options.framework,
|
||||
language_preference=self.options.lang,
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@ -10,15 +11,16 @@ from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
from docling.models.page_preprocessing_model import (
|
||||
PagePreprocessingModel,
|
||||
@ -27,7 +29,6 @@ from docling.models.page_preprocessing_model import (
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
@ -121,6 +122,10 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
||||
if "darwin" != sys.platform:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
return OcrMacModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
|
@ -33,40 +33,40 @@ Here are the available options as of this writing (for an up-to-date listing, ru
|
||||
$ docling --help
|
||||
|
||||
Usage: docling [OPTIONS] source
|
||||
|
||||
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
|
||||
│ [required] │
|
||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
|
||||
│ Defaults to all formats. │
|
||||
│ [default: None] │
|
||||
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
||||
│ Markdown. │
|
||||
│ [default: None] │
|
||||
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: ocr] │
|
||||
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
|
||||
│ [default: easyocr] │
|
||||
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
|
||||
│ [default: dlparse_v1] │
|
||||
│ --table-mode [fast|accurate] The mode to use in the table structure │
|
||||
│ model. │
|
||||
│ [default: fast] │
|
||||
│ --artifacts-path PATH If provided, the location of the model │
|
||||
│ artifacts. │
|
||||
│ [default: None] │
|
||||
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: no-abort-on-error] │
|
||||
│ --output PATH Output directory where results are │
|
||||
│ saved. │
|
||||
│ [default: .] │
|
||||
│ --version Show version information. │
|
||||
│ --help Show this message and exit. │
|
||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
|
||||
╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
|
||||
│ [required] │
|
||||
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
|
||||
│ Defaults to all formats. │
|
||||
│ [default: None] │
|
||||
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
||||
│ Markdown. │
|
||||
│ [default: None] │
|
||||
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: ocr] │
|
||||
│ --ocr-engine [easyocr|tesseract_cli|tesseract|ocrmac] The OCR engine to use. │
|
||||
│ [default: easyocr] │
|
||||
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
|
||||
│ [default: dlparse_v1] │
|
||||
│ --table-mode [fast|accurate] The mode to use in the table structure │
|
||||
│ model. │
|
||||
│ [default: fast] │
|
||||
│ --artifacts-path PATH If provided, the location of the model │
|
||||
│ artifacts. │
|
||||
│ [default: None] │
|
||||
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: no-abort-on-error] │
|
||||
│ --output PATH Output directory where results are │
|
||||
│ saved. │
|
||||
│ [default: .] │
|
||||
│ --version Show version information. │
|
||||
│ --help Show this message and exit. │
|
||||
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
|
||||
```
|
||||
</details>
|
||||
|
@ -128,6 +128,7 @@ module = [
|
||||
"tesserocr.*",
|
||||
"docling_ibm_models.*",
|
||||
"easyocr.*",
|
||||
"ocrmac.*",
|
||||
"deepsearch_glm.*",
|
||||
"lxml.*",
|
||||
"bs4.*",
|
||||
|
@ -1,3 +1,4 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
@ -83,6 +85,10 @@ def test_e2e_conversions():
|
||||
TesseractCliOcrOptions(),
|
||||
]
|
||||
|
||||
# only works on mac
|
||||
if "darwin" == sys.platform:
|
||||
engines.append(OcrMacOptions())
|
||||
|
||||
for ocr_options in engines:
|
||||
print(f"Converting with ocr_engine: {ocr_options.kind}")
|
||||
converter = get_converter(ocr_options=ocr_options)
|
||||
|
Loading…
Reference in New Issue
Block a user