From cdf079dd06955358d5bb74d6d3be9dee7b30c082 Mon Sep 17 00:00:00 2001 From: VIktor Kuropiantnyk <103574791+vku-ibm@users.noreply.github.com> Date: Fri, 22 Aug 2025 15:23:29 +0200 Subject: [PATCH] feat(CLI): Option to download arbitrary HuggingFace model (#2123) * Added option to docling-tools to download arbitrary HuggingFace model Signed-off-by: Viktor Kuropiatnyk * Added note in documentation Signed-off-by: Viktor Kuropiatnyk * Removed note on custom artifact path usage from HF download option Signed-off-by: Viktor Kuropiatnyk * Fixed typo Signed-off-by: Viktor Kuropiatnyk --------- Signed-off-by: Viktor Kuropiatnyk --- docling/cli/models.py | 56 ++++++++++++++++++++++++++++++++++ docs/usage/advanced_options.md | 7 +++++ 2 files changed, 63 insertions(+) diff --git a/docling/cli/models.py b/docling/cli/models.py index 4632053f..ff0eed52 100644 --- a/docling/cli/models.py +++ b/docling/cli/models.py @@ -9,6 +9,7 @@ from rich.console import Console from rich.logging import RichHandler from docling.datamodel.settings import settings +from docling.models.utils.hf_model_download import download_hf_model from docling.utils.model_downloader import download_models warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") @@ -128,6 +129,61 @@ def download( ) +@app.command("download-hf-repo") +def download_hf_repo( + models: Annotated[ + list[str], + typer.Argument( + help="Specific models to download from HuggingFace identified by their repo id. 
For example: ds4sd/docling-models .", + ), + ], + output_dir: Annotated[ + Path, + typer.Option( + ..., + "-o", + "--output-dir", + help="The directory where to download the models.", + ), + ] = (settings.cache_dir / "models"), + force: Annotated[ + bool, typer.Option(..., help="If true, the download will be forced.") + ] = False, + quiet: Annotated[ + bool, + typer.Option( + ..., + "-q", + "--quiet", + help="No extra output is generated, the CLI prints only the directory with the cached models.", + ), + ] = False, +): + if not quiet: + logging.basicConfig( + level=logging.INFO, + format="[blue]%(message)s[/blue]", + datefmt="[%X]", + handlers=[RichHandler(show_level=False, show_time=False, markup=True)], + ) + + for item in models: + typer.secho(f"\nDownloading {item} model from HuggingFace...") + download_hf_model( + repo_id=item, + # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76 + # but creating options objects seems like overkill + local_dir=output_dir / item.replace("/", "--"), + force=force, + progress=(not quiet), + ) + + if quiet: + typer.echo(output_dir) + else: + typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green") + + click_app = typer.main.get_command(app) if __name__ == "__main__": diff --git a/docs/usage/advanced_options.md b/docs/usage/advanced_options.md index fbe9362a..92f33808 100644 --- a/docs/usage/advanced_options.md +++ b/docs/usage/advanced_options.md @@ -20,6 +20,13 @@ Models downloaded into $HOME/.cache/docling/models. Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()`. +Also, you can use the `download-hf-repo` command to download arbitrary models from HuggingFace by specifying their repo id: + +```sh +$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview +Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
+``` + **Step 2: Use the prefetched models** ```python