mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
simplify downloading specific model(s)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
5692cdb19d
commit
3af9b9d34e
@ -1,26 +1,19 @@
|
|||||||
import logging
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated
|
from typing import Annotated, Optional
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.logging import RichHandler
|
||||||
|
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.code_formula_model import CodeFormulaModel
|
from docling.utils.models_downloader import download_models
|
||||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
|
||||||
from docling.models.layout_model import LayoutModel
|
|
||||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
|
||||||
from docling.utils.models_downloader import download_all
|
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
from rich.console import Console
|
|
||||||
from rich.logging import RichHandler
|
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
err_console = Console(stderr=True)
|
err_console = Console(stderr=True)
|
||||||
|
|
||||||
@ -33,6 +26,14 @@ app = typer.Typer(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _AvailableModels(str, Enum):
|
||||||
|
LAYOUT = "layout"
|
||||||
|
TABLEFORMER = "tableformer"
|
||||||
|
CODE_FORMULA = "code_formula"
|
||||||
|
PICTURE_CLASSIFIER = "picture_classifier"
|
||||||
|
EASYOCR = "easyocr"
|
||||||
|
|
||||||
|
|
||||||
@app.command("download")
|
@app.command("download")
|
||||||
def download(
|
def download(
|
||||||
output_dir: Annotated[
|
output_dir: Annotated[
|
||||||
@ -43,51 +44,27 @@ def download(
|
|||||||
"--output-dir",
|
"--output-dir",
|
||||||
help="The directory where all the models are downloaded.",
|
help="The directory where all the models are downloaded.",
|
||||||
),
|
),
|
||||||
] = settings.cache_dir
|
] = (settings.cache_dir / "models"),
|
||||||
/ "models",
|
|
||||||
force: Annotated[
|
force: Annotated[
|
||||||
bool, typer.Option(..., help="If true, the download will be forced")
|
bool, typer.Option(..., help="If true, the download will be forced")
|
||||||
] = False,
|
] = False,
|
||||||
quite: Annotated[
|
models: Annotated[
|
||||||
|
Optional[list[_AvailableModels]],
|
||||||
|
typer.Argument(
|
||||||
|
help=f"Models to download (default behavior: all will be downloaded)",
|
||||||
|
),
|
||||||
|
] = None,
|
||||||
|
quiet: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
...,
|
...,
|
||||||
"-q",
|
"-q",
|
||||||
help="No extra output is generated, the CLI print only the directory with the cached models.",
|
"--quiet",
|
||||||
|
help="No extra output is generated, the CLI prints only the directory with the cached models.",
|
||||||
),
|
),
|
||||||
] = False,
|
] = False,
|
||||||
layout: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(..., help="If true, the layout model weights are downloaded."),
|
|
||||||
] = True,
|
|
||||||
tableformer: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(
|
|
||||||
..., help="If true, the tableformer model weights are downloaded."
|
|
||||||
),
|
|
||||||
] = True,
|
|
||||||
code_formula: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(
|
|
||||||
..., help="If true, the code formula model weights are downloaded."
|
|
||||||
),
|
|
||||||
] = True,
|
|
||||||
picture_classifier: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(
|
|
||||||
..., help="If true, the picture classifier model weights are downloaded."
|
|
||||||
),
|
|
||||||
] = True,
|
|
||||||
easyocr: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(..., help="If true, the easyocr model weights are downloaded."),
|
|
||||||
] = True,
|
|
||||||
rapidocr: Annotated[
|
|
||||||
bool,
|
|
||||||
typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
|
|
||||||
] = True,
|
|
||||||
):
|
):
|
||||||
if not quite:
|
if not quiet:
|
||||||
FORMAT = "%(message)s"
|
FORMAT = "%(message)s"
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@ -95,25 +72,22 @@ def download(
|
|||||||
datefmt="[%X]",
|
datefmt="[%X]",
|
||||||
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
||||||
)
|
)
|
||||||
|
to_download = models or [m for m in _AvailableModels]
|
||||||
output_dir = download_all(
|
output_dir = download_models(
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
force=force,
|
force=force,
|
||||||
progress=(not quite),
|
progress=(not quiet),
|
||||||
layout=layout,
|
with_layout=_AvailableModels.LAYOUT in to_download,
|
||||||
tableformer=tableformer,
|
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
|
||||||
code_formula=code_formula,
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
||||||
picture_classifier=picture_classifier,
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
||||||
easyocr=easyocr,
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
||||||
rapidocr=rapidocr,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if quite:
|
if quiet:
|
||||||
typer.echo(output_dir)
|
typer.echo(output_dir)
|
||||||
else:
|
else:
|
||||||
typer.secho(
|
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
|
||||||
f"\nAll models downloaded in the directory {output_dir}.", fg="green"
|
|
||||||
)
|
|
||||||
|
|
||||||
console.print(
|
console.print(
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -39,7 +39,7 @@ from docling.models.table_structure_model import TableStructureModel
|
|||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.models_downloader import download_all
|
from docling.utils.models_downloader import download_models
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -135,7 +135,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
stacklevel=3,
|
stacklevel=3,
|
||||||
)
|
)
|
||||||
|
|
||||||
output_dir = download_all(output_dir=local_dir, force=force, progress=False)
|
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
||||||
return output_dir
|
return output_dir
|
||||||
|
|
||||||
def get_ocr_model(
|
def get_ocr_model(
|
||||||
|
@ -7,23 +7,21 @@ from docling.models.code_formula_model import CodeFormulaModel
|
|||||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def download_all(
|
def download_models(
|
||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
*,
|
*,
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
progress: bool = False,
|
progress: bool = False,
|
||||||
layout: bool = True,
|
with_layout: bool = True,
|
||||||
tableformer: bool = True,
|
with_tableformer: bool = True,
|
||||||
code_formula: bool = True,
|
with_code_formula: bool = True,
|
||||||
picture_classifier: bool = True,
|
with_picture_classifier: bool = True,
|
||||||
easyocr: bool = True,
|
with_easyocr: bool = True,
|
||||||
rapidocr: bool = True,
|
|
||||||
):
|
):
|
||||||
if output_dir is None:
|
if output_dir is None:
|
||||||
output_dir = settings.cache_dir / "models"
|
output_dir = settings.cache_dir / "models"
|
||||||
@ -31,7 +29,7 @@ def download_all(
|
|||||||
# Make sure the folder exists
|
# Make sure the folder exists
|
||||||
output_dir.mkdir(exist_ok=True, parents=True)
|
output_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
if layout:
|
if with_layout:
|
||||||
_log.info(f"Downloading layout model...")
|
_log.info(f"Downloading layout model...")
|
||||||
LayoutModel.download_models(
|
LayoutModel.download_models(
|
||||||
local_dir=output_dir / LayoutModel._model_repo_folder,
|
local_dir=output_dir / LayoutModel._model_repo_folder,
|
||||||
@ -39,7 +37,7 @@ def download_all(
|
|||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if tableformer:
|
if with_tableformer:
|
||||||
_log.info(f"Downloading tableformer model...")
|
_log.info(f"Downloading tableformer model...")
|
||||||
TableStructureModel.download_models(
|
TableStructureModel.download_models(
|
||||||
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
||||||
@ -47,7 +45,7 @@ def download_all(
|
|||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if picture_classifier:
|
if with_picture_classifier:
|
||||||
_log.info(f"Downloading picture classifier model...")
|
_log.info(f"Downloading picture classifier model...")
|
||||||
DocumentPictureClassifier.download_models(
|
DocumentPictureClassifier.download_models(
|
||||||
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
||||||
@ -55,7 +53,7 @@ def download_all(
|
|||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if code_formula:
|
if with_code_formula:
|
||||||
_log.info(f"Downloading code formula model...")
|
_log.info(f"Downloading code formula model...")
|
||||||
CodeFormulaModel.download_models(
|
CodeFormulaModel.download_models(
|
||||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||||
@ -63,7 +61,7 @@ def download_all(
|
|||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if easyocr:
|
if with_easyocr:
|
||||||
_log.info(f"Downloading easyocr models...")
|
_log.info(f"Downloading easyocr models...")
|
||||||
EasyOcrModel.download_models(
|
EasyOcrModel.download_models(
|
||||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||||
|
@ -26,17 +26,36 @@ To see all available options (export formats etc.) run `docling --help`. More de
|
|||||||
|
|
||||||
### Advanced options
|
### Advanced options
|
||||||
|
|
||||||
#### Provide specific artifacts path (offline mode)
|
#### Model prefetching and offline usage
|
||||||
|
|
||||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
By default, models are downloaded automatically upon first usage. If you would prefer
|
||||||
|
to explicitly prefetch them for offline use (e.g. in air-gapped environments), you can do
|
||||||
|
that as follows:
|
||||||
|
|
||||||
|
**Step 1: Prefetch the models**
|
||||||
|
|
||||||
|
Use the `docling-tools models download` utility:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ docling-tools models download
|
||||||
|
Downloading layout model...
|
||||||
|
Downloading tableformer model...
|
||||||
|
Downloading picture classifier model...
|
||||||
|
Downloading code formula model...
|
||||||
|
Downloading easyocr models...
|
||||||
|
Models downloaded into: $HOME/.cache/docling/models.
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, models can be programmatically downloaded using `docling.utils.models_downloader.download_models()`.
|
||||||
|
|
||||||
|
**Step 2: Use the prefetched models**
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
|
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
# download all models with `docling-tools models download`
|
artifacts_path = "/local/path/to/models"
|
||||||
artifacts_path = "/local/path/to/artifacts"
|
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
@ -46,21 +65,12 @@ doc_converter = DocumentConverter(
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
To download all the artifacts needed to run offline, Docling provides the `docling-tools models download` utility.
|
Or using the CLI:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
$ docling-tools models download
|
docling --artifacts-path="/local/path/to/models" FILE
|
||||||
Downloading layout model...
|
|
||||||
Downloading tableformer model...
|
|
||||||
Downloading picture classifier model...
|
|
||||||
Downloading code formula model...
|
|
||||||
Downloading easyocr models...
|
|
||||||
All models downloaded in the directory $HOME/.cache/docling/models.
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Alternatively, the download of all models can be triggered also with `docling.utils.models_downloader.download_all()`.
|
|
||||||
|
|
||||||
|
|
||||||
#### Adjust pipeline features
|
#### Adjust pipeline features
|
||||||
|
|
||||||
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
|
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
|
||||||
|
Loading…
Reference in New Issue
Block a user