simplify downloading specific model(s)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2025-02-06 13:46:32 +01:00
parent 5692cdb19d
commit 3af9b9d34e
4 changed files with 72 additions and 90 deletions

View File

@ -1,26 +1,19 @@
import logging
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated
from typing import Annotated, Optional
import typer
from rich.console import Console
from rich.logging import RichHandler
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.utils.models_downloader import download_all
from docling.utils.models_downloader import download_models
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
_log = logging.getLogger(__name__)
from rich.console import Console
from rich.logging import RichHandler
console = Console()
err_console = Console(stderr=True)
@ -33,6 +26,14 @@ app = typer.Typer(
)
# Names of the models that can be selected individually through the `models`
# argument of the `download` command. Each member is checked for membership in
# the requested list to decide the matching `with_*` flag of `download_models`.
# The string values are the tokens given on the command line.
class _AvailableModels(str, Enum):
LAYOUT = "layout"
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
EASYOCR = "easyocr"
@app.command("download")
def download(
output_dir: Annotated[
@ -43,51 +44,27 @@ def download(
"--output-dir",
help="The directory where all the models are downloaded.",
),
] = settings.cache_dir
/ "models",
] = (settings.cache_dir / "models"),
force: Annotated[
bool, typer.Option(..., help="If true, the download will be forced")
] = False,
quite: Annotated[
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: all will be downloaded)",
),
] = None,
quiet: Annotated[
bool,
typer.Option(
...,
"-q",
help="No extra output is generated, the CLI print only the directory with the cached models.",
"--quiet",
help="No extra output is generated, the CLI prints only the directory with the cached models.",
),
] = False,
layout: Annotated[
bool,
typer.Option(..., help="If true, the layout model weights are downloaded."),
] = True,
tableformer: Annotated[
bool,
typer.Option(
..., help="If true, the tableformer model weights are downloaded."
),
] = True,
code_formula: Annotated[
bool,
typer.Option(
..., help="If true, the code formula model weights are downloaded."
),
] = True,
picture_classifier: Annotated[
bool,
typer.Option(
..., help="If true, the picture classifier model weights are downloaded."
),
] = True,
easyocr: Annotated[
bool,
typer.Option(..., help="If true, the easyocr model weights are downloaded."),
] = True,
rapidocr: Annotated[
bool,
typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
] = True,
):
if not quite:
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
level=logging.INFO,
@ -95,25 +72,22 @@ def download(
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
output_dir = download_all(
to_download = models or [m for m in _AvailableModels]
output_dir = download_models(
output_dir=output_dir,
force=force,
progress=(not quite),
layout=layout,
tableformer=tableformer,
code_formula=code_formula,
picture_classifier=picture_classifier,
easyocr=easyocr,
rapidocr=rapidocr,
progress=(not quiet),
with_layout=_AvailableModels.LAYOUT in to_download,
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)
if quite:
if quiet:
typer.echo(output_dir)
else:
typer.secho(
f"\nAll models downloaded in the directory {output_dir}.", fg="green"
)
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
console.print(
"\n",

View File

@ -39,7 +39,7 @@ from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.models_downloader import download_all
from docling.utils.models_downloader import download_models
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
@ -135,7 +135,7 @@ class StandardPdfPipeline(PaginatedPipeline):
stacklevel=3,
)
output_dir = download_all(output_dir=local_dir, force=force, progress=False)
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
return output_dir
def get_ocr_model(

View File

@ -7,23 +7,21 @@ from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
_log = logging.getLogger(__name__)
def download_all(
def download_models(
output_dir: Optional[Path] = None,
*,
force: bool = False,
progress: bool = False,
layout: bool = True,
tableformer: bool = True,
code_formula: bool = True,
picture_classifier: bool = True,
easyocr: bool = True,
rapidocr: bool = True,
with_layout: bool = True,
with_tableformer: bool = True,
with_code_formula: bool = True,
with_picture_classifier: bool = True,
with_easyocr: bool = True,
):
if output_dir is None:
output_dir = settings.cache_dir / "models"
@ -31,7 +29,7 @@ def download_all(
# Make sure the folder exists
output_dir.mkdir(exist_ok=True, parents=True)
if layout:
if with_layout:
_log.info(f"Downloading layout model...")
LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder,
@ -39,7 +37,7 @@ def download_all(
progress=progress,
)
if tableformer:
if with_tableformer:
_log.info(f"Downloading tableformer model...")
TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder,
@ -47,7 +45,7 @@ def download_all(
progress=progress,
)
if picture_classifier:
if with_picture_classifier:
_log.info(f"Downloading picture classifier model...")
DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
@ -55,7 +53,7 @@ def download_all(
progress=progress,
)
if code_formula:
if with_code_formula:
_log.info(f"Downloading code formula model...")
CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
@ -63,7 +61,7 @@ def download_all(
progress=progress,
)
if easyocr:
if with_easyocr:
_log.info(f"Downloading easyocr models...")
EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder,

View File

@ -26,17 +26,36 @@ To see all available options (export formats etc.) run `docling --help`. More de
### Advanced options
#### Provide specific artifacts path (offline mode)
#### Model prefetching and offline usage
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
By default, models are downloaded automatically upon first usage. If you would prefer
to explicitly prefetch them for offline use (e.g. in air-gapped environments), you can do
that as follows:
**Step 1: Prefetch the models**
Use the `docling-tools models download` utility:
```sh
$ docling-tools models download
Downloading layout model...
Downloading tableformer model...
Downloading picture classifier model...
Downloading code formula model...
Downloading easyocr models...
Models downloaded into: $HOME/.cache/docling/models.
```
Alternatively, models can be programmatically downloaded using `docling.utils.models_downloader.download_models()`.
**Step 2: Use the prefetched models**
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# download all models with `docling-tools models download`
artifacts_path = "/local/path/to/artifacts"
artifacts_path = "/local/path/to/models"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
@ -46,21 +65,12 @@ doc_converter = DocumentConverter(
)
```
To download all the artifacts needed to run offline, Docling provides the `docling-tools models download` utility.
Or using the CLI:
```sh
$ docling-tools models download
Downloading layout model...
Downloading tableformer model...
Downloading picture classifier model...
Downloading code formula model...
Downloading easyocr models...
All models downloaded in the directory $HOME/.cache/docling/models.
docling --artifacts-path="/local/path/to/models" FILE
```
Alternatively, the download of all models can be triggered also with `docling.utils.models_downloader.download_all()`.
#### Adjust pipeline features
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways