feat: new artifacts path and CLI utility (#876)

* fix artifacts path
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add docling-models utility
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* missing formatting
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* rename utility to docling-tools
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* rename download methods and deprecation warnings
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* propagate artifacts path usage for ocr models
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* move function to utils
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* remove unused file
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* update docs
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* simplify downloading specific model(s)
  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
* minor refactor
  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-12-10 13:48:13 +00:00 · 2025-02-06 15:46:32 +01:00
parent 722a6eb7b9
commit ed74fe2ec0
15 changed files with 467 additions and 68 deletions
--- a/docling/cli/models.py
+++ b/docling/cli/models.py
@@ -0,0 +1,105 @@
+import logging
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+from rich.console import Console
+from rich.logging import RichHandler
+
+from docling.datamodel.settings import settings
+from docling.utils.model_downloader import download_models
+
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+
+console = Console()  # stdout console, used for the post-download usage hints
+err_console = Console(stderr=True)  # stderr console; unused in this file
+
+# Typer application that groups the model-management commands.
+app = typer.Typer(
+    name="Docling models helper",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+
+class _AvailableModels(str, Enum):  # values accepted by the `models` CLI argument
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    EASYOCR = "easyocr"
+
+
+@app.command("download")
+def download(
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where all the models are downloaded.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced")
+    ] = False,
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help="Models to download (default behavior: all will be downloaded)",
+        ),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        # Route INFO logs through Rich so the downloader's messages are shown.
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    to_download = models or list(_AvailableModels)
+    output_dir = download_models(
+        output_dir=output_dir,
+        force=force,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
+    )
+    # Quiet mode prints only the resulting directory; otherwise show usage hints.
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+        console.print(
+            "\n",
+            "Docling can now be configured for running offline using the local artifacts.\n\n",
+            "Using the CLI:",
+            f"`docling --artifacts-path={output_dir} FILE`",
+            "\n",
+            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
+        )
+
+
+click_app = typer.main.get_command(app)  # Click command built from the Typer app
+
+if __name__ == "__main__":
+    app()
--- a/docling/cli/tools.py
+++ b/docling/cli/tools.py
@@ -0,0 +1,17 @@
+import typer
+
+from docling.cli.models import app as models_app
+
+app = typer.Typer(
+    name="Docling helpers",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+app.add_typer(models_app, name="models")  # model commands live under `models`
+
+click_app = typer.main.get_command(app)  # Click command built from the Typer app
+
+if __name__ == "__main__":
+    app()
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings
    debug: DebugSettings

+    cache_dir: Path = Path.home() / ".cache" / "docling"
+

 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
        Processes the given batch of elements and enriches them with predictions.
    """

+    _model_repo_folder = "CodeFormula"
    elements_batch_size = 5
    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
    expansion_factor = 0.03
@@ -68,7 +69,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
        options: CodeFormulaModelOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -97,9 +98,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            )

            if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
            else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder

            self.code_formula_model = CodeFormulaPredictor(
                artifacts_path=artifacts_path,
@@ -108,13 +109,16 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            )

    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/CodeFormula",
            force_download=force,
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -55,12 +55,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
        Processes a batch of elements and adds classification annotations.
    """

+    _model_repo_folder = "DocumentFigureClassifier"
    images_scale = 2

    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
        options: DocumentPictureClassifierOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -88,9 +89,9 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
            )

            if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
            else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder

            self.document_picture_classifier = DocumentFigureClassifierPredictor(
                artifacts_path=artifacts_path,
@@ -99,13 +100,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
            )

    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/DocumentFigureClassifier",
            force_download=force,
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -1,7 +1,10 @@
 import logging
 import warnings
-from typing import Iterable
+import zipfile
+from pathlib import Path
+from typing import Iterable, List, Optional

+import httpx
 import numpy
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
+from docling.utils.utils import download_url_with_progress

 _log = logging.getLogger(__name__)


 class EasyOcrModel(BaseOcrModel):
+    _model_repo_folder = "EasyOcr"
+
    def __init__(
        self,
        enabled: bool,
+        artifacts_path: Optional[Path],
        options: EasyOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
                )
                use_gpu = self.options.use_gpu

+            download_enabled = self.options.download_enabled
+            model_storage_directory = self.options.model_storage_directory
+            if artifacts_path is not None and model_storage_directory is None:
+                download_enabled = False
+                model_storage_directory = str(artifacts_path / self._model_repo_folder)
+
            self.reader = easyocr.Reader(
                lang_list=self.options.lang,
                gpu=use_gpu,
-                model_storage_directory=self.options.model_storage_directory,
+                model_storage_directory=model_storage_directory,
                recog_network=self.options.recog_network,
-                download_enabled=self.options.download_enabled,
+                download_enabled=download_enabled,
                verbose=False,
            )

+    @staticmethod
+    def download_models(
+        detection_models: List[str] = ["craft"],
+        recognition_models: List[str] = ["english_g2", "latin_g2"],
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        """Download and unzip the requested EasyOCR model archives into `local_dir`.
+
+        Unknown model names are skipped with a warning; returns the target directory.
+        NOTE(review): `force` is accepted but never read here - confirm intent.
+        """
+        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
+        from easyocr.config import detection_models as det_models_dict
+        from easyocr.config import recognition_models as rec_models_dict
+
+        if local_dir is None:
+            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
+        local_dir.mkdir(parents=True, exist_ok=True)
+
+        # Pair every requested name with the EasyOCR table it is looked up in.
+        wanted = [(name, det_models_dict) for name in detection_models]
+        wanted += [(name, rec_models_dict["gen2"]) for name in recognition_models]
+        for model_name, src in wanted:
+            if model_name not in src:
+                _log.warning("Unknown EasyOCR model %r, skipping.", model_name)
+                continue
+            buf = download_url_with_progress(src[model_name]["url"], progress=progress)
+            with zipfile.ZipFile(buf, "r") as zip_ref:
+                zip_ref.extractall(local_dir)
+
+        return local_dir
+
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -1,7 +1,8 @@
 import copy
 import logging
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union

 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)


 class LayoutModel(BasePageModel):
+    _model_repo_folder = "docling-models"
+    _model_path = "model_artifacts/layout"

    TEXT_ELEM_LABELS = [
        DocItemLabel.TEXT,
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
    FORMULA_LABEL = DocItemLabel.FORMULA
    CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

-    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
+    def __init__(
+        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+    ):
        device = decide_device(accelerator_options.device)

+        if artifacts_path is None:
+            artifacts_path = self.download_models() / self._model_path
+        else:
+            # repo-folder layout; will become the default in the future
+            if (artifacts_path / self._model_repo_folder).exists():
+                artifacts_path = (
+                    artifacts_path / self._model_repo_folder / self._model_path
+                )
+            elif (artifacts_path / self._model_path).exists():
+                warnings.warn(
+                    "The usage of artifacts_path containing directly "
+                    f"{self._model_path} is deprecated. Please point "
+                    "the artifacts_path to the parent containing "
+                    f"the {self._model_repo_folder} folder.",
+                    DeprecationWarning,
+                    stacklevel=3,
+                )
+                artifacts_path = artifacts_path / self._model_path
+        # When neither folder exists, artifacts_path is used exactly as given.
        self.layout_predictor = LayoutPredictor(
            artifact_path=str(artifacts_path),
            device=device,
            num_threads=accelerator_options.num_threads,
        )

+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
+    ) -> Path:
+        """Download the ds4sd/docling-models snapshot (v2.1.0) and return its path."""
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        # Silence the huggingface_hub progress bars unless the caller wants them.
+        if not progress:
+            disable_progress_bars()
+        snapshot_dir = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            revision="v2.1.0",
+            local_dir=local_dir,
+            force_download=force,
+        )
+
+        return Path(snapshot_dir)
+
    def draw_clusters_and_cells_side_by_side(
        self, conv_res, page, clusters, mode_prefix: str, show: bool = False
    ):
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -1,6 +1,7 @@
 import copy
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder


 class TableStructureModel(BasePageModel):
+    _model_repo_folder = "docling-models"
+    _model_path = "model_artifacts/tableformer"
+
    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Path,
+        artifacts_path: Optional[Path],
        options: TableStructureOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):

        self.enabled = enabled
        if self.enabled:
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models() / self._model_path
+            else:
+                # will become the default in the future
+                if (artifacts_path / self._model_repo_folder).exists():
+                    artifacts_path = (
+                        artifacts_path / self._model_repo_folder / self._model_path
+                    )
+                elif (artifacts_path / self._model_path).exists():
+                    warnings.warn(
+                        "The usage of artifacts_path containing directly "
+                        f"{self._model_path} is deprecated. Please point "
+                        "the artifacts_path to the parent containing "
+                        f"the {self._model_repo_folder} folder.",
+                        DeprecationWarning,
+                        stacklevel=3,
+                    )
+                    artifacts_path = artifacts_path / self._model_path
+
            if self.mode == TableFormerMode.ACCURATE:
                artifacts_path = artifacts_path / "accurate"
            else:
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
            )
            self.scale = 2.0  # Scale up table input images to 144 dpi

+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
+    ) -> Path:
+        """Download the ds4sd/docling-models snapshot (v2.1.0) and return its path."""
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        snapshot_dir = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            revision="v2.1.0",
+            local_dir=local_dir,
+            force_download=force,
+        )
+        return Path(snapshot_dir)
+
    def draw_table_and_cells(
        self,
        conv_res: ConversionResult,
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -1,5 +1,6 @@
 import logging
 import sys
+import warnings
 from pathlib import Path
 from typing import Optional

@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
 from docling.models.document_picture_classifier import (
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)


 class StandardPdfPipeline(PaginatedPipeline):
-    _layout_model_path = "model_artifacts/layout"
-    _table_model_path = "model_artifacts/tableformer"
+    _layout_model_path = LayoutModel._model_path
+    _table_model_path = TableStructureModel._model_path

    def __init__(self, pipeline_options: PdfPipelineOptions):
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

-        if pipeline_options.artifacts_path is None:
-            self.artifacts_path = self.download_models_hf()
-        else:
-            self.artifacts_path = Path(pipeline_options.artifacts_path)
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()

        self.keep_images = (
            self.pipeline_options.generate_page_images
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):

        self.glm_model = GlmModel(options=GlmOptions())

-        if (ocr_model := self.get_ocr_model()) is None:
+        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
            ocr_model,
            # Layout model
            LayoutModel(
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._layout_model_path,
+                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._table_model_path,
+                artifacts_path=artifacts_path,
                options=pipeline_options.table_structure_options,
                accelerator_options=pipeline_options.accelerator_options,
            ),
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            CodeFormulaModel(
                enabled=pipeline_options.do_code_enrichment
                or pipeline_options.do_formula_enrichment,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=pipeline_options.do_code_enrichment,
                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            # Document Picture Classifier
            DocumentPictureClassifier(
                enabled=pipeline_options.do_picture_classification,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                options=DocumentPictureClassifierOptions(),
                accelerator_options=pipeline_options.accelerator_options,
            ),
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.1.0",
+        """Deprecated shim: warns, then delegates to `download_models` (no progress)."""
+        warnings.warn(
+            "The usage of StandardPdfPipeline.download_models_hf() is deprecated, "
+            "use instead the utility `docling-tools models download`, or "
+            "the upstream method docling.utils.model_downloader.download_models()",
+            DeprecationWarning,
+            stacklevel=3,
        )

-        return Path(download_path)
+        return download_models(output_dir=local_dir, force=force, progress=False)

-    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+    def get_ocr_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[BaseOcrModel]:
        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
            return EasyOcrModel(
                enabled=self.pipeline_options.do_ocr,
+                artifacts_path=artifacts_path,
                options=self.pipeline_options.ocr_options,
                accelerator_options=self.pipeline_options.accelerator_options,
            )
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+from docling.datamodel.settings import settings
+from docling.models.code_formula_model import CodeFormulaModel
+from docling.models.document_picture_classifier import DocumentPictureClassifier
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.layout_model import LayoutModel
+from docling.models.table_structure_model import TableStructureModel
+
+_log = logging.getLogger(__name__)
+
+
+def download_models(
+    output_dir: Optional[Path] = None,
+    *,
+    force: bool = False,
+    progress: bool = False,
+    with_layout: bool = True,
+    with_tableformer: bool = True,
+    with_code_formula: bool = True,
+    with_picture_classifier: bool = True,
+    with_easyocr: bool = True,
+) -> Path:
+    """Download the selected model artifacts into `output_dir`.
+
+    Each model is stored in the sub-folder named by its model class's
+    `_model_repo_folder` attribute.
+
+    Args:
+        output_dir: Destination; defaults to `settings.cache_dir / "models"`.
+        force: Passed through to each model's `download_models`.
+        progress: Passed through to each model's `download_models`.
+        with_layout: Whether to download the layout model.
+        with_tableformer: Whether to download the tableformer model.
+        with_code_formula: Whether to download the code formula model.
+        with_picture_classifier: Whether to download the picture classifier.
+        with_easyocr: Whether to download the easyocr models.
+
+    Returns:
+        The directory the models were downloaded into.
+    """
+    if output_dir is None:
+        output_dir = settings.cache_dir / "models"
+
+    # Make sure the folder exists
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # (enabled flag, log label, model class), listed in download order.
+    selection = [
+        (with_layout, "layout model", LayoutModel),
+        (with_tableformer, "tableformer model", TableStructureModel),
+        (
+            with_picture_classifier,
+            "picture classifier model",
+            DocumentPictureClassifier,
+        ),
+        (with_code_formula, "code formula model", CodeFormulaModel),
+        (with_easyocr, "easyocr models", EasyOcrModel),
+    ]
+    for enabled, label, model_cls in selection:
+        if not enabled:
+            continue
+        _log.info(f"Downloading {label}...")
+        model_cls.download_models(
+            local_dir=output_dir / model_cls._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    return output_dir
--- a/docling/utils/utils.py
+++ b/docling/utils/utils.py
@@ -4,6 +4,9 @@ from itertools import islice
 from pathlib import Path
 from typing import List, Union

+import requests
+from tqdm import tqdm
+

 def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable."""
@@ -39,3 +42,24 @@ def create_hash(string: str):
    hasher.update(string.encode("utf-8"))

    return hasher.hexdigest()
+
+
+def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
+    """Download `url` into memory and return the buffer rewound to position 0."""
+    buf = BytesIO()
+    with requests.get(url, stream=True, allow_redirects=True) as response:
+        # Fail on HTTP errors instead of handing an error page to the caller.
+        response.raise_for_status()
+        total_size = int(response.headers.get("content-length", 0))
+        with tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            disable=(not progress),
+        ) as progress_bar:
+            for chunk in response.iter_content(10 * 1024):
+                buf.write(chunk)
+                progress_bar.update(len(chunk))
+    buf.seek(0)
+    return buf