mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
simplify downloading specific model(s)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
5692cdb19d
commit
5131e7ff21
@ -1,26 +1,19 @@
|
||||
import logging
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
from typing import Annotated, Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.code_formula_model import CodeFormulaModel
|
||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.utils.models_downloader import download_all
|
||||
from docling.utils.models_downloader import download_models
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
|
||||
console = Console()
|
||||
err_console = Console(stderr=True)
|
||||
|
||||
@ -33,6 +26,14 @@ app = typer.Typer(
|
||||
)
|
||||
|
||||
|
||||
class _AvailableModels(str, Enum):
|
||||
LAYOUT = "layout"
|
||||
TABLEFORMER = "tableformer"
|
||||
CODE_FORMULA = "code_formula"
|
||||
PICTURE_CLASSIFIER = "picture_classifier"
|
||||
EASYOCR = "easyocr"
|
||||
|
||||
|
||||
@app.command("download")
|
||||
def download(
|
||||
output_dir: Annotated[
|
||||
@ -43,51 +44,27 @@ def download(
|
||||
"--output-dir",
|
||||
help="The directory where all the models are downloaded.",
|
||||
),
|
||||
] = settings.cache_dir
|
||||
/ "models",
|
||||
] = (settings.cache_dir / "models"),
|
||||
force: Annotated[
|
||||
bool, typer.Option(..., help="If true, the download will be forced")
|
||||
] = False,
|
||||
quite: Annotated[
|
||||
models: Annotated[
|
||||
Optional[list[_AvailableModels]],
|
||||
typer.Argument(
|
||||
help=f"Models to download (default behavior: all will be downloaded)",
|
||||
),
|
||||
] = None,
|
||||
quiet: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
"-q",
|
||||
help="No extra output is generated, the CLI print only the directory with the cached models.",
|
||||
"--quiet",
|
||||
help="No extra output is generated, the CLI prints only the directory with the cached models.",
|
||||
),
|
||||
] = False,
|
||||
layout: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="If true, the layout model weights are downloaded."),
|
||||
] = True,
|
||||
tableformer: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="If true, the tableformer model weights are downloaded."
|
||||
),
|
||||
] = True,
|
||||
code_formula: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="If true, the code formula model weights are downloaded."
|
||||
),
|
||||
] = True,
|
||||
picture_classifier: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="If true, the picture classifier model weights are downloaded."
|
||||
),
|
||||
] = True,
|
||||
easyocr: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="If true, the easyocr model weights are downloaded."),
|
||||
] = True,
|
||||
rapidocr: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
|
||||
] = True,
|
||||
):
|
||||
if not quite:
|
||||
if not quiet:
|
||||
FORMAT = "%(message)s"
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@ -95,25 +72,22 @@ def download(
|
||||
datefmt="[%X]",
|
||||
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
||||
)
|
||||
|
||||
output_dir = download_all(
|
||||
to_download = models or [m for m in _AvailableModels]
|
||||
output_dir = download_models(
|
||||
output_dir=output_dir,
|
||||
force=force,
|
||||
progress=(not quite),
|
||||
layout=layout,
|
||||
tableformer=tableformer,
|
||||
code_formula=code_formula,
|
||||
picture_classifier=picture_classifier,
|
||||
easyocr=easyocr,
|
||||
rapidocr=rapidocr,
|
||||
progress=(not quiet),
|
||||
with_layout=_AvailableModels.LAYOUT in to_download,
|
||||
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
|
||||
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
||||
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
||||
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
||||
)
|
||||
|
||||
if quite:
|
||||
if quiet:
|
||||
typer.echo(output_dir)
|
||||
else:
|
||||
typer.secho(
|
||||
f"\nAll models downloaded in the directory {output_dir}.", fg="green"
|
||||
)
|
||||
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
|
||||
|
||||
console.print(
|
||||
"\n",
|
||||
|
@ -39,7 +39,7 @@ from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.models_downloader import download_all
|
||||
from docling.utils.models_downloader import download_models
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -135,7 +135,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
output_dir = download_all(output_dir=local_dir, force=force, progress=False)
|
||||
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
||||
return output_dir
|
||||
|
||||
def get_ocr_model(
|
||||
|
@ -7,23 +7,21 @@ from docling.models.code_formula_model import CodeFormulaModel
|
||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def download_all(
|
||||
def download_models(
|
||||
output_dir: Optional[Path] = None,
|
||||
*,
|
||||
force: bool = False,
|
||||
progress: bool = False,
|
||||
layout: bool = True,
|
||||
tableformer: bool = True,
|
||||
code_formula: bool = True,
|
||||
picture_classifier: bool = True,
|
||||
easyocr: bool = True,
|
||||
rapidocr: bool = True,
|
||||
with_layout: bool = True,
|
||||
with_tableformer: bool = True,
|
||||
with_code_formula: bool = True,
|
||||
with_picture_classifier: bool = True,
|
||||
with_easyocr: bool = True,
|
||||
):
|
||||
if output_dir is None:
|
||||
output_dir = settings.cache_dir / "models"
|
||||
@ -31,7 +29,7 @@ def download_all(
|
||||
# Make sure the folder exists
|
||||
output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
if layout:
|
||||
if with_layout:
|
||||
_log.info(f"Downloading layout model...")
|
||||
LayoutModel.download_models(
|
||||
local_dir=output_dir / LayoutModel._model_repo_folder,
|
||||
@ -39,7 +37,7 @@ def download_all(
|
||||
progress=progress,
|
||||
)
|
||||
|
||||
if tableformer:
|
||||
if with_tableformer:
|
||||
_log.info(f"Downloading tableformer model...")
|
||||
TableStructureModel.download_models(
|
||||
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
||||
@ -47,7 +45,7 @@ def download_all(
|
||||
progress=progress,
|
||||
)
|
||||
|
||||
if picture_classifier:
|
||||
if with_picture_classifier:
|
||||
_log.info(f"Downloading picture classifier model...")
|
||||
DocumentPictureClassifier.download_models(
|
||||
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
||||
@ -55,7 +53,7 @@ def download_all(
|
||||
progress=progress,
|
||||
)
|
||||
|
||||
if code_formula:
|
||||
if with_code_formula:
|
||||
_log.info(f"Downloading code formula model...")
|
||||
CodeFormulaModel.download_models(
|
||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||
@ -63,7 +61,7 @@ def download_all(
|
||||
progress=progress,
|
||||
)
|
||||
|
||||
if easyocr:
|
||||
if with_easyocr:
|
||||
_log.info(f"Downloading easyocr models...")
|
||||
EasyOcrModel.download_models(
|
||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||
|
@ -26,17 +26,36 @@ To see all available options (export formats etc.) run `docling --help`. More de
|
||||
|
||||
### Advanced options
|
||||
|
||||
#### Provide specific artifacts path (offline mode)
|
||||
#### Model prefetching and offline usage
|
||||
|
||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
||||
By default, models are downloaded automatically upon first usage. If you would prefer
|
||||
to explicitly prefetch them for offline use (e.g. in air-gapped environments), you can do
|
||||
that as follows:
|
||||
|
||||
**Step 1: Prefetch the models**
|
||||
|
||||
Use the `docling-tools models download` utility:
|
||||
|
||||
```sh
|
||||
$ docling-tools models download
|
||||
Downloading layout model...
|
||||
Downloading tableformer model...
|
||||
Downloading picture classifier model...
|
||||
Downloading code formula model...
|
||||
Downloading easyocr models...
|
||||
Models downloaded into: $HOME/.cache/docling/models.
|
||||
```
|
||||
|
||||
Alternatively, models can be programmatically downloaded using `docling.utils.models_downloader.download_models()`.
|
||||
|
||||
**Step 2: Use the prefetched models**
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
# download all models with `docling-tools models download`
|
||||
artifacts_path = "/local/path/to/artifacts"
|
||||
artifacts_path = "/local/path/to/models"
|
||||
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||
doc_converter = DocumentConverter(
|
||||
@ -46,21 +65,12 @@ doc_converter = DocumentConverter(
|
||||
)
|
||||
```
|
||||
|
||||
To download all the artifacts needed to run offline, Docling provides the `docling-tools models download` utility.
|
||||
Or using the CLI:
|
||||
|
||||
```sh
|
||||
$ docling-tools models download
|
||||
Downloading layout model...
|
||||
Downloading tableformer model...
|
||||
Downloading picture classifier model...
|
||||
Downloading code formula model...
|
||||
Downloading easyocr models...
|
||||
All models downloaded in the directory $HOME/.cache/docling/models.
|
||||
docling --artifacts-path="/local/path/to/models" FILE
|
||||
```
|
||||
|
||||
Alternatively, the download of all models can be triggered also with `docling.utils.models_downloader.download_all()`.
|
||||
|
||||
|
||||
#### Adjust pipeline features
|
||||
|
||||
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
|
||||
|
Loading…
Reference in New Issue
Block a user