simplify downloading specific model(s)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-08-01 23:12:20 +00:00 · 2025-02-06 13:46:32 +01:00 · 2025-02-06 13:46:32 +01:00 · 3af9b9d34e
commit 3af9b9d34e
parent 5692cdb19d
4 changed files with 72 additions and 90 deletions
--- a/docling/cli/models_download.py
+++ b/docling/cli/models_download.py
@ -1,26 +1,19 @@
 import logging
 import warnings
+from enum import Enum
 from pathlib import Path
-from typing import Annotated
+from typing import Annotated, Optional

 import typer
+from rich.console import Console
+from rich.logging import RichHandler

 from docling.datamodel.settings import settings
-from docling.models.code_formula_model import CodeFormulaModel
-from docling.models.document_picture_classifier import DocumentPictureClassifier
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.rapid_ocr_model import RapidOcrModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.utils.models_downloader import download_all
+from docling.utils.models_downloader import download_models

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

-_log = logging.getLogger(__name__)
-from rich.console import Console
-from rich.logging import RichHandler
-
 console = Console()
 err_console = Console(stderr=True)

@ -33,6 +26,14 @@ app = typer.Typer(
 )


+class _AvailableModels(str, Enum):
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    EASYOCR = "easyocr"
+
+
@app.command("download")
 def download(
    output_dir: Annotated[
@ -43,51 +44,27 @@ def download(
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
-    ] = settings.cache_dir
-    / "models",
+    ] = (settings.cache_dir / "models"),
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
-    quite: Annotated[
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help=f"Models to download (default behavior: all will be downloaded)",
+        ),
+    ] = None,
+    quiet: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
-            help="No extra output is generated, the CLI print only the directory with the cached models.",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
        ),
    ] = False,
-    layout: Annotated[
-        bool,
-        typer.Option(..., help="If true, the layout model weights are downloaded."),
-    ] = True,
-    tableformer: Annotated[
-        bool,
-        typer.Option(
-            ..., help="If true, the tableformer model weights are downloaded."
-        ),
-    ] = True,
-    code_formula: Annotated[
-        bool,
-        typer.Option(
-            ..., help="If true, the code formula model weights are downloaded."
-        ),
-    ] = True,
-    picture_classifier: Annotated[
-        bool,
-        typer.Option(
-            ..., help="If true, the picture classifier model weights are downloaded."
-        ),
-    ] = True,
-    easyocr: Annotated[
-        bool,
-        typer.Option(..., help="If true, the easyocr model weights are downloaded."),
-    ] = True,
-    rapidocr: Annotated[
-        bool,
-        typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
-    ] = True,
 ):
-    if not quite:
+    if not quiet:
        FORMAT = "%(message)s"
        logging.basicConfig(
            level=logging.INFO,
@ -95,25 +72,22 @@ def download(
            datefmt="[%X]",
            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
        )
-
-    output_dir = download_all(
+    to_download = models or [m for m in _AvailableModels]
+    output_dir = download_models(
        output_dir=output_dir,
        force=force,
-        progress=(not quite),
-        layout=layout,
-        tableformer=tableformer,
-        code_formula=code_formula,
-        picture_classifier=picture_classifier,
-        easyocr=easyocr,
-        rapidocr=rapidocr,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
    )

-    if quite:
+    if quiet:
        typer.echo(output_dir)
    else:
-        typer.secho(
-            f"\nAll models downloaded in the directory {output_dir}.", fg="green"
-        )
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")

        console.print(
            "\n",
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -39,7 +39,7 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
-from docling.utils.models_downloader import download_all
+from docling.utils.models_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)
@ -135,7 +135,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            stacklevel=3,
        )

-        output_dir = download_all(output_dir=local_dir, force=force, progress=False)
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
        return output_dir

    def get_ocr_model(
--- a/docling/utils/models_downloader.py
+++ b/docling/utils/models_downloader.py
@ -7,23 +7,21 @@ from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
-from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.table_structure_model import TableStructureModel

 _log = logging.getLogger(__name__)


-def download_all(
+def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
-    layout: bool = True,
-    tableformer: bool = True,
-    code_formula: bool = True,
-    picture_classifier: bool = True,
-    easyocr: bool = True,
-    rapidocr: bool = True,
+    with_layout: bool = True,
+    with_tableformer: bool = True,
+    with_code_formula: bool = True,
+    with_picture_classifier: bool = True,
+    with_easyocr: bool = True,
 ):
    if output_dir is None:
        output_dir = settings.cache_dir / "models"
@ -31,7 +29,7 @@ def download_all(
    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

-    if layout:
+    if with_layout:
        _log.info(f"Downloading layout model...")
        LayoutModel.download_models(
            local_dir=output_dir / LayoutModel._model_repo_folder,
@ -39,7 +37,7 @@ def download_all(
            progress=progress,
        )

-    if tableformer:
+    if with_tableformer:
        _log.info(f"Downloading tableformer model...")
        TableStructureModel.download_models(
            local_dir=output_dir / TableStructureModel._model_repo_folder,
@ -47,7 +45,7 @@ def download_all(
            progress=progress,
        )

-    if picture_classifier:
+    if with_picture_classifier:
        _log.info(f"Downloading picture classifier model...")
        DocumentPictureClassifier.download_models(
            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
@ -55,7 +53,7 @@ def download_all(
            progress=progress,
        )

-    if code_formula:
+    if with_code_formula:
        _log.info(f"Downloading code formula model...")
        CodeFormulaModel.download_models(
            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
@ -63,7 +61,7 @@ def download_all(
            progress=progress,
        )

-    if easyocr:
+    if with_easyocr:
        _log.info(f"Downloading easyocr models...")
        EasyOcrModel.download_models(
            local_dir=output_dir / EasyOcrModel._model_repo_folder,
--- a/docs/usage.md
+++ b/docs/usage.md
@ -26,17 +26,36 @@ To see all available options (export formats etc.) run `docling --help`. More de

 ### Advanced options

-#### Provide specific artifacts path (offline mode)
+#### Model prefetching and offline usage

-By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
+By default, models are downloaded automatically upon first usage. If you would prefer
+to explicitly prefetch them for offline use (e.g. in air-gapped environments), you can do
+that as follows:
+
+**Step 1: Prefetch the models**
+
+Use the `docling-tools models download` utility:
+
+```sh
+$ docling-tools models download
+Downloading layout model...
+Downloading tableformer model...
+Downloading picture classifier model...
+Downloading code formula model...
+Downloading easyocr models...
+Models downloaded into: $HOME/.cache/docling/models.
+```
+
+Alternatively, models can be programmatically downloaded using `docling.utils.models_downloader.download_models()`.
+
+**Step 2: Use the prefetched models**

 ```python
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

-# dowload all models with `docling-tools models download`
-artifacts_path = "/local/path/to/artifacts"
+artifacts_path = "/local/path/to/models"

 pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
 doc_converter = DocumentConverter(
@ -46,21 +65,12 @@ doc_converter = DocumentConverter(
 )
 ```

-To download all the artifacts needed to run offline, Docling provides the `docling-tools models download` utility.
+Or using the CLI:

 ```sh
-$ docling-tools models download
-Downloading layout model...
-Downloading tableformer model...
-Downloading picture classifier model...
-Downloading code formula model...
-Downloading easyocr models...
-All models downloaded in the directory $HOME/.cache/docling/models.
+docling --artifacts-path="/local/path/to/models" FILE
 ```

-Alternatively, the download of all models can be triggered also with `docling.utils.models_downloader.download_all()`.
-
-
 #### Adjust pipeline features

 The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways