diff --git a/docling/cli/models_download.py b/docling/cli/models_download.py index 2db36521..8a19bbcc 100644 --- a/docling/cli/models_download.py +++ b/docling/cli/models_download.py @@ -1,26 +1,19 @@ import logging import warnings +from enum import Enum from pathlib import Path -from typing import Annotated +from typing import Annotated, Optional import typer +from rich.console import Console +from rich.logging import RichHandler from docling.datamodel.settings import settings -from docling.models.code_formula_model import CodeFormulaModel -from docling.models.document_picture_classifier import DocumentPictureClassifier -from docling.models.easyocr_model import EasyOcrModel -from docling.models.layout_model import LayoutModel -from docling.models.rapid_ocr_model import RapidOcrModel -from docling.models.table_structure_model import TableStructureModel -from docling.utils.models_downloader import download_all +from docling.utils.models_downloader import download_models warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") -_log = logging.getLogger(__name__) -from rich.console import Console -from rich.logging import RichHandler - console = Console() err_console = Console(stderr=True) @@ -33,6 +26,14 @@ app = typer.Typer( ) +class _AvailableModels(str, Enum): + LAYOUT = "layout" + TABLEFORMER = "tableformer" + CODE_FORMULA = "code_formula" + PICTURE_CLASSIFIER = "picture_classifier" + EASYOCR = "easyocr" + + @app.command("download") def download( output_dir: Annotated[ @@ -43,51 +44,27 @@ def download( "--output-dir", help="The directory where all the models are downloaded.", ), - ] = settings.cache_dir - / "models", + ] = (settings.cache_dir / "models"), force: Annotated[ bool, typer.Option(..., help="If true, the download will be forced") ] = False, - quite: Annotated[ + models: Annotated[ + Optional[list[_AvailableModels]], + typer.Argument( + 
help=f"Models to download (default behavior: all will be downloaded)", + ), + ] = None, + quiet: Annotated[ bool, typer.Option( ..., "-q", - help="No extra output is generated, the CLI print only the directory with the cached models.", + "--quiet", + help="No extra output is generated, the CLI prints only the directory with the cached models.", ), ] = False, - layout: Annotated[ - bool, - typer.Option(..., help="If true, the layout model weights are downloaded."), - ] = True, - tableformer: Annotated[ - bool, - typer.Option( - ..., help="If true, the tableformer model weights are downloaded." - ), - ] = True, - code_formula: Annotated[ - bool, - typer.Option( - ..., help="If true, the code formula model weights are downloaded." - ), - ] = True, - picture_classifier: Annotated[ - bool, - typer.Option( - ..., help="If true, the picture classifier model weights are downloaded." - ), - ] = True, - easyocr: Annotated[ - bool, - typer.Option(..., help="If true, the easyocr model weights are downloaded."), - ] = True, - rapidocr: Annotated[ - bool, - typer.Option(..., help="If true, the rapidocr model weights are downloaded."), - ] = True, ): - if not quite: + if not quiet: FORMAT = "%(message)s" logging.basicConfig( level=logging.INFO, @@ -95,25 +72,22 @@ def download( datefmt="[%X]", handlers=[RichHandler(show_level=False, show_time=False, markup=True)], ) - - output_dir = download_all( + to_download = models or [m for m in _AvailableModels] + output_dir = download_models( output_dir=output_dir, force=force, - progress=(not quite), - layout=layout, - tableformer=tableformer, - code_formula=code_formula, - picture_classifier=picture_classifier, - easyocr=easyocr, - rapidocr=rapidocr, + progress=(not quiet), + with_layout=_AvailableModels.LAYOUT in to_download, + with_tableformer=_AvailableModels.TABLEFORMER in to_download, + with_code_formula=_AvailableModels.CODE_FORMULA in to_download, + with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download, + 
with_easyocr=_AvailableModels.EASYOCR in to_download, ) - if quite: + if quiet: typer.echo(output_dir) else: - typer.secho( - f"\nAll models downloaded in the directory {output_dir}.", fg="green" - ) + typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green") console.print( "\n", diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 85bd1075..fa5ccedc 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -39,7 +39,7 @@ from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_pipeline import PaginatedPipeline -from docling.utils.models_downloader import download_all +from docling.utils.models_downloader import download_models from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) @@ -135,7 +135,7 @@ class StandardPdfPipeline(PaginatedPipeline): stacklevel=3, ) - output_dir = download_all(output_dir=local_dir, force=force, progress=False) + output_dir = download_models(output_dir=local_dir, force=force, progress=False) return output_dir def get_ocr_model( diff --git a/docling/utils/models_downloader.py b/docling/utils/models_downloader.py index 27307fd1..504618ec 100644 --- a/docling/utils/models_downloader.py +++ b/docling/utils/models_downloader.py @@ -7,23 +7,21 @@ from docling.models.code_formula_model import CodeFormulaModel from docling.models.document_picture_classifier import DocumentPictureClassifier from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel -from docling.models.rapid_ocr_model import RapidOcrModel from docling.models.table_structure_model import TableStructureModel _log = logging.getLogger(__name__) -def download_all( +def download_models( output_dir: Optional[Path] 
= None, *, force: bool = False, progress: bool = False, - layout: bool = True, - tableformer: bool = True, - code_formula: bool = True, - picture_classifier: bool = True, - easyocr: bool = True, - rapidocr: bool = True, + with_layout: bool = True, + with_tableformer: bool = True, + with_code_formula: bool = True, + with_picture_classifier: bool = True, + with_easyocr: bool = True, ): if output_dir is None: output_dir = settings.cache_dir / "models" @@ -31,7 +29,7 @@ def download_all( # Make sure the folder exists output_dir.mkdir(exist_ok=True, parents=True) - if layout: + if with_layout: _log.info(f"Downloading layout model...") LayoutModel.download_models( local_dir=output_dir / LayoutModel._model_repo_folder, @@ -39,7 +37,7 @@ def download_all( progress=progress, ) - if tableformer: + if with_tableformer: _log.info(f"Downloading tableformer model...") TableStructureModel.download_models( local_dir=output_dir / TableStructureModel._model_repo_folder, @@ -47,7 +45,7 @@ def download_all( progress=progress, ) - if picture_classifier: + if with_picture_classifier: _log.info(f"Downloading picture classifier model...") DocumentPictureClassifier.download_models( local_dir=output_dir / DocumentPictureClassifier._model_repo_folder, @@ -55,7 +53,7 @@ def download_all( progress=progress, ) - if code_formula: + if with_code_formula: _log.info(f"Downloading code formula model...") CodeFormulaModel.download_models( local_dir=output_dir / CodeFormulaModel._model_repo_folder, @@ -63,7 +61,7 @@ def download_all( progress=progress, ) - if easyocr: + if with_easyocr: _log.info(f"Downloading easyocr models...") EasyOcrModel.download_models( local_dir=output_dir / EasyOcrModel._model_repo_folder, diff --git a/docs/usage.md b/docs/usage.md index ff086e2f..ab125d31 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,17 +26,36 @@ To see all available options (export formats etc.) run `docling --help`. 
More de ### Advanced options -#### Provide specific artifacts path (offline mode) +#### Model prefetching and offline usage -By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: +By default, models are downloaded automatically upon first usage. If you would prefer +to explicitly prefetch them for offline use (e.g. in air-gapped environments), you can do +that as follows: + +**Step 1: Prefetch the models** + +Use the `docling-tools models download` utility: + +```sh +$ docling-tools models download +Downloading layout model... +Downloading tableformer model... +Downloading picture classifier model... +Downloading code formula model... +Downloading easyocr models... +Models downloaded into: $HOME/.cache/docling/models. +``` + +Alternatively, models can be programmatically downloaded using `docling.utils.models_downloader.download_models()`. + +**Step 2: Use the prefetched models** ```python from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption -# dowload all models with `docling-tools models download` -artifacts_path = "/local/path/to/artifacts" +artifacts_path = "/local/path/to/models" pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) doc_converter = DocumentConverter( @@ -46,21 +65,12 @@ doc_converter = DocumentConverter( ) ``` -To download all the artifacts needed to run offline, Docling provides the `docling-tools models download` utility. +Or using the CLI: ```sh -$ docling-tools models download -Downloading layout model... -Downloading tableformer model... -Downloading picture classifier model... -Downloading code formula model... -Downloading easyocr models... -All models downloaded in the directory $HOME/.cache/docling/models. 
+docling --artifacts-path="/local/path/to/models" FILE ``` -Alternatively, the download of all models can be triggered also with `docling.utils.models_downloader.download_all()`. - - #### Adjust pipeline features The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways