mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat: new artifacts path and CLI utility (#876)
* fix artifacts path Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docling-models utility Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing formatting Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename utility to docling-tools Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename download methods and deprecation warnings Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * propagate artifacts path usage for ocr models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move function to utils Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update docs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * simplify downloading specific model(s) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * minor refactor Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
105
docling/cli/models.py
Normal file
105
docling/cli/models.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import logging
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.model_downloader import download_models
|
||||
|
||||
# Silence noisy third-party warnings before any CLI output is produced.
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings("ignore", category=FutureWarning, module="easyocr")

# One rich console for regular output and one bound to stderr.
console = Console()
err_console = Console(stderr=True)


# Typer application that hosts the model-related commands defined in this module.
app = typer.Typer(
    name="Docling models helper",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)
|
||||
|
||||
|
||||
class _AvailableModels(str, Enum):
|
||||
LAYOUT = "layout"
|
||||
TABLEFORMER = "tableformer"
|
||||
CODE_FORMULA = "code_formula"
|
||||
PICTURE_CLASSIFIER = "picture_classifier"
|
||||
EASYOCR = "easyocr"
|
||||
|
||||
|
||||
@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = (settings.cache_dir / "models"),
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    models: Annotated[
        Optional[list[_AvailableModels]],
        typer.Argument(
            help="Models to download (default behavior: all will be downloaded)",
        ),
    ] = None,
    quiet: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            "--quiet",
            help="No extra output is generated, the CLI prints only the directory with the cached models.",
        ),
    ] = False,
):
    """Download the selected Docling models and report where they were stored."""
    if not quiet:
        # Send log records through rich so INFO messages are rendered in blue.
        logging.basicConfig(
            level=logging.INFO,
            format="[blue]%(message)s[/blue]",
            datefmt="[%X]",
            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
        )

    # No explicit selection (None or an empty list) means every known model.
    to_download = models or list(_AvailableModels)
    output_dir = download_models(
        output_dir=output_dir,
        force=force,
        progress=(not quiet),
        with_layout=_AvailableModels.LAYOUT in to_download,
        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
        with_easyocr=_AvailableModels.EASYOCR in to_download,
    )

    if quiet:
        # Quiet mode prints only the resulting directory.
        typer.echo(output_dir)
    else:
        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")

        console.print(
            "\n",
            "Docling can now be configured for running offline using the local artifacts.\n\n",
            "Using the CLI:",
            f"`docling --artifacts-path={output_dir} FILE`",
            "\n",
            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
        )
|
||||
|
||||
|
||||
# Click command object built from the Typer application above.
click_app = typer.main.get_command(app)


# Running this module as a script starts the Typer application.
if __name__ == "__main__":
    app()
|
||||
17
docling/cli/tools.py
Normal file
17
docling/cli/tools.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import typer
|
||||
|
||||
from docling.cli.models import app as models_app
|
||||
|
||||
# Top-level Typer application that groups the Docling helper commands.
app = typer.Typer(
    name="Docling helpers",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)

# Mount the model commands under the "models" sub-command.
app.add_typer(models_app, name="models")

# Click command object built from the Typer application.
click_app = typer.main.get_command(app)


# Running this module as a script starts the Typer application.
if __name__ == "__main__":
    app()
|
||||
@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
|
||||
perf: BatchConcurrencySettings
|
||||
debug: DebugSettings
|
||||
|
||||
cache_dir: Path = Path.home() / ".cache" / "docling"
|
||||
|
||||
|
||||
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
||||
|
||||
@@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
Processes the given batch of elements and enriches them with predictions.
|
||||
"""
|
||||
|
||||
_model_repo_folder = "CodeFormula"
|
||||
elements_batch_size = 5
|
||||
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
||||
expansion_factor = 0.03
|
||||
@@ -68,7 +69,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Optional[Union[Path, str]],
|
||||
artifacts_path: Optional[Path],
|
||||
options: CodeFormulaModelOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
@@ -97,9 +98,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
)
|
||||
|
||||
if artifacts_path is None:
|
||||
artifacts_path = self.download_models_hf()
|
||||
artifacts_path = self.download_models()
|
||||
else:
|
||||
artifacts_path = Path(artifacts_path)
|
||||
artifacts_path = artifacts_path / self._model_repo_folder
|
||||
|
||||
self.code_formula_model = CodeFormulaPredictor(
|
||||
artifacts_path=artifacts_path,
|
||||
@@ -108,13 +109,16 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
def download_models(
|
||||
local_dir: Optional[Path] = None,
|
||||
force: bool = False,
|
||||
progress: bool = False,
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/CodeFormula",
|
||||
force_download=force,
|
||||
|
||||
@@ -55,12 +55,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
||||
Processes a batch of elements and adds classification annotations.
|
||||
"""
|
||||
|
||||
_model_repo_folder = "DocumentFigureClassifier"
|
||||
images_scale = 2
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Optional[Union[Path, str]],
|
||||
artifacts_path: Optional[Path],
|
||||
options: DocumentPictureClassifierOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
@@ -88,9 +89,9 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
||||
)
|
||||
|
||||
if artifacts_path is None:
|
||||
artifacts_path = self.download_models_hf()
|
||||
artifacts_path = self.download_models()
|
||||
else:
|
||||
artifacts_path = Path(artifacts_path)
|
||||
artifacts_path = artifacts_path / self._model_repo_folder
|
||||
|
||||
self.document_picture_classifier = DocumentFigureClassifierPredictor(
|
||||
artifacts_path=artifacts_path,
|
||||
@@ -99,13 +100,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
def download_models(
|
||||
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/DocumentFigureClassifier",
|
||||
force_download=force,
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import logging
|
||||
import warnings
|
||||
from typing import Iterable
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import httpx
|
||||
import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docling.utils.utils import download_url_with_progress
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
_model_repo_folder = "EasyOcr"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Optional[Path],
|
||||
options: EasyOcrOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
|
||||
)
|
||||
use_gpu = self.options.use_gpu
|
||||
|
||||
download_enabled = self.options.download_enabled
|
||||
model_storage_directory = self.options.model_storage_directory
|
||||
if artifacts_path is not None and model_storage_directory is None:
|
||||
download_enabled = False
|
||||
model_storage_directory = str(artifacts_path / self._model_repo_folder)
|
||||
|
||||
self.reader = easyocr.Reader(
|
||||
lang_list=self.options.lang,
|
||||
gpu=use_gpu,
|
||||
model_storage_directory=self.options.model_storage_directory,
|
||||
model_storage_directory=model_storage_directory,
|
||||
recog_network=self.options.recog_network,
|
||||
download_enabled=self.options.download_enabled,
|
||||
download_enabled=download_enabled,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
@staticmethod
def download_models(
    detection_models: Optional[List[str]] = None,
    recognition_models: Optional[List[str]] = None,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download EasyOCR model archives and unpack them into ``local_dir``.

    Args:
        detection_models: Names looked up in ``easyocr.config.detection_models``.
            ``None`` selects ``["craft"]``.
        recognition_models: Names looked up in the ``"gen2"`` section of
            ``easyocr.config.recognition_models``. ``None`` selects
            ``["english_g2", "latin_g2"]``.
        local_dir: Destination directory. ``None`` selects
            ``settings.cache_dir / "models" / EasyOcrModel._model_repo_folder``.
        force: Accepted so the signature matches the other model downloaders;
            this implementation does not read it and always fetches the archives.
        progress: Forwarded to ``download_url_with_progress``.

    Returns:
        The directory the archives were extracted into.
    """
    # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
    from easyocr.config import detection_models as det_models_dict
    from easyocr.config import recognition_models as rec_models_dict

    # Build the defaults per call so no list object is shared between calls.
    if detection_models is None:
        detection_models = ["craft"]
    if recognition_models is None:
        recognition_models = ["english_g2", "latin_g2"]

    if local_dir is None:
        local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Collect models to download; unknown names are skipped, but reported.
    download_list = []
    for model_name in detection_models:
        if model_name in det_models_dict:
            download_list.append(det_models_dict[model_name])
        else:
            _log.warning("Unknown EasyOCR detection model skipped: %s", model_name)

    gen2_models = rec_models_dict["gen2"]
    for model_name in recognition_models:
        if model_name in gen2_models:
            download_list.append(gen2_models[model_name])
        else:
            _log.warning("Unknown EasyOCR recognition model skipped: %s", model_name)

    # Download models
    for model_details in download_list:
        buf = download_url_with_progress(model_details["url"], progress=progress)
        with zipfile.ZipFile(buf, "r") as zip_ref:
            zip_ref.extractall(local_dir)

    return local_dir
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import copy
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from typing import Iterable, Optional, Union
|
||||
|
||||
from docling_core.types.doc import DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LayoutModel(BasePageModel):
|
||||
_model_repo_folder = "docling-models"
|
||||
_model_path = "model_artifacts/layout"
|
||||
|
||||
TEXT_ELEM_LABELS = [
|
||||
DocItemLabel.TEXT,
|
||||
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
|
||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
||||
|
||||
def __init__(
    self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
):
    """Resolve the layout model directory and create the layout predictor.

    With ``artifacts_path=None`` the models are downloaded first. Otherwise the
    path is probed for the repository folder layout, then for the deprecated
    layout that contains the model path directly; if neither exists the path
    is handed to the predictor unchanged.
    """
    device = decide_device(accelerator_options.device)

    if artifacts_path is None:
        model_dir = self.download_models() / self._model_path
    else:
        repo_root = artifacts_path / self._model_repo_folder
        legacy_dir = artifacts_path / self._model_path
        if repo_root.exists():
            # will become the default in the future
            model_dir = repo_root / self._model_path
        elif legacy_dir.exists():
            warnings.warn(
                "The usage of artifacts_path containing directly "
                f"{self._model_path} is deprecated. Please point "
                "the artifacts_path to the parent containing "
                f"the {self._model_repo_folder} folder.",
                DeprecationWarning,
                stacklevel=3,
            )
            model_dir = legacy_dir
        else:
            model_dir = artifacts_path

    self.layout_predictor = LayoutPredictor(
        artifact_path=str(model_dir),
        device=device,
        num_threads=accelerator_options.num_threads,
    )
|
||||
|
||||
@staticmethod
def download_models(
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Fetch revision ``v2.1.0`` of ``ds4sd/docling-models`` from the Hugging Face hub.

    Args:
        local_dir: Passed to ``snapshot_download`` as ``local_dir``.
        force: Passed to ``snapshot_download`` as ``force_download``.
        progress: When false, the hub progress bars are disabled before downloading.

    Returns:
        The path reported by ``snapshot_download``, as a ``Path``.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars

    if not progress:
        disable_progress_bars()

    return Path(
        snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
    )
|
||||
|
||||
def draw_clusters_and_cells_side_by_side(
|
||||
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
||||
):
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import copy
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from typing import Iterable, Optional, Union
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
_model_repo_folder = "docling-models"
|
||||
_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Path,
|
||||
artifacts_path: Optional[Path],
|
||||
options: TableStructureOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
|
||||
|
||||
self.enabled = enabled
|
||||
if self.enabled:
|
||||
|
||||
if artifacts_path is None:
|
||||
artifacts_path = self.download_models() / self._model_path
|
||||
else:
|
||||
# will become the default in the future
|
||||
if (artifacts_path / self._model_repo_folder).exists():
|
||||
artifacts_path = (
|
||||
artifacts_path / self._model_repo_folder / self._model_path
|
||||
)
|
||||
elif (artifacts_path / self._model_path).exists():
|
||||
warnings.warn(
|
||||
"The usage of artifacts_path containing directly "
|
||||
f"{self._model_path} is deprecated. Please point "
|
||||
"the artifacts_path to the parent containing "
|
||||
f"the {self._model_repo_folder} folder.",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
artifacts_path = artifacts_path / self._model_path
|
||||
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "accurate"
|
||||
else:
|
||||
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
|
||||
)
|
||||
self.scale = 2.0 # Scale up table input images to 144 dpi
|
||||
|
||||
@staticmethod
def download_models(
    local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
    """Download the ``ds4sd/docling-models`` snapshot (revision ``v2.1.0``).

    ``local_dir`` and ``force`` are forwarded to ``snapshot_download`` as
    ``local_dir`` and ``force_download``. Unless ``progress`` is true, the
    hub progress bars are disabled first. Returns the snapshot location as a
    ``Path``.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars

    show_progress = progress
    if not show_progress:
        disable_progress_bars()

    snapshot_location = snapshot_download(
        repo_id="ds4sd/docling-models",
        force_download=force,
        local_dir=local_dir,
        revision="v2.1.0",
    )
    return Path(snapshot_location)
|
||||
|
||||
def draw_table_and_cells(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
||||
from docling.models.document_picture_classifier import (
|
||||
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.model_downloader import download_models
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StandardPdfPipeline(PaginatedPipeline):
|
||||
_layout_model_path = "model_artifacts/layout"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
_layout_model_path = LayoutModel._model_path
|
||||
_table_model_path = TableStructureModel._model_path
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.pipeline_options: PdfPipelineOptions
|
||||
|
||||
if pipeline_options.artifacts_path is None:
|
||||
self.artifacts_path = self.download_models_hf()
|
||||
else:
|
||||
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
||||
artifacts_path: Optional[Path] = None
|
||||
if pipeline_options.artifacts_path is not None:
|
||||
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
||||
|
||||
self.keep_images = (
|
||||
self.pipeline_options.generate_page_images
|
||||
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
|
||||
self.glm_model = GlmModel(options=GlmOptions())
|
||||
|
||||
if (ocr_model := self.get_ocr_model()) is None:
|
||||
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
ocr_model,
|
||||
# Layout model
|
||||
LayoutModel(
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._layout_model_path,
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
),
|
||||
# Table structure model
|
||||
TableStructureModel(
|
||||
enabled=pipeline_options.do_table_structure,
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._table_model_path,
|
||||
artifacts_path=artifacts_path,
|
||||
options=pipeline_options.table_structure_options,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
),
|
||||
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
CodeFormulaModel(
|
||||
enabled=pipeline_options.do_code_enrichment
|
||||
or pipeline_options.do_formula_enrichment,
|
||||
artifacts_path=pipeline_options.artifacts_path,
|
||||
artifacts_path=artifacts_path,
|
||||
options=CodeFormulaModelOptions(
|
||||
do_code_enrichment=pipeline_options.do_code_enrichment,
|
||||
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
||||
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
# Document Picture Classifier
|
||||
DocumentPictureClassifier(
|
||||
enabled=pipeline_options.do_picture_classification,
|
||||
artifacts_path=pipeline_options.artifacts_path,
|
||||
artifacts_path=artifacts_path,
|
||||
options=DocumentPictureClassifierOptions(),
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
),
|
||||
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
def download_models_hf(
    local_dir: Optional[Path] = None, force: bool = False
) -> Path:
    """Deprecated entry point that delegates to ``download_models``.

    Emits a ``DeprecationWarning`` and then calls
    ``docling.utils.model_downloader.download_models`` with progress output
    disabled.

    Args:
        local_dir: Forwarded as ``output_dir``.
        force: Forwarded as ``force``.

    Returns:
        The directory returned by ``download_models``.
    """
    # The message names the helper that is actually imported by this module.
    warnings.warn(
        "The usage of StandardPdfPipeline.download_models_hf() is deprecated, "
        "use instead the utility `docling-tools models download`, or "
        "the upstream method docling.utils.model_downloader.download_models()",
        DeprecationWarning,
        stacklevel=3,
    )

    return download_models(output_dir=local_dir, force=force, progress=False)
|
||||
|
||||
def get_ocr_model(self) -> Optional[BaseOcrModel]:
|
||||
def get_ocr_model(
|
||||
self, artifacts_path: Optional[Path] = None
|
||||
) -> Optional[BaseOcrModel]:
|
||||
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
||||
return EasyOcrModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
artifacts_path=artifacts_path,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
accelerator_options=self.pipeline_options.accelerator_options,
|
||||
)
|
||||
|
||||
72
docling/utils/model_downloader.py
Normal file
72
docling/utils/model_downloader.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.code_formula_model import CodeFormulaModel
|
||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
    with_layout: bool = True,
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_easyocr: bool = True,
) -> Path:
    """Download the selected model artifacts below a common directory.

    Each model class downloads into ``output_dir / <its _model_repo_folder>``.

    Args:
        output_dir: Root directory for the downloads. ``None`` selects
            ``settings.cache_dir / "models"``. The directory is created if missing.
        force: Forwarded to every model's ``download_models``.
        progress: Forwarded to every model's ``download_models``.
        with_layout: Download the layout model.
        with_tableformer: Download the tableformer model.
        with_code_formula: Download the code formula model.
        with_picture_classifier: Download the picture classifier model.
        with_easyocr: Download the EasyOCR models.

    Returns:
        The root directory that was used for the downloads.
    """
    if output_dir is None:
        output_dir = settings.cache_dir / "models"

    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    if with_layout:
        _log.info("Downloading layout model...")
        LayoutModel.download_models(
            local_dir=output_dir / LayoutModel._model_repo_folder,
            force=force,
            progress=progress,
        )

    if with_tableformer:
        _log.info("Downloading tableformer model...")
        TableStructureModel.download_models(
            local_dir=output_dir / TableStructureModel._model_repo_folder,
            force=force,
            progress=progress,
        )

    if with_picture_classifier:
        _log.info("Downloading picture classifier model...")
        DocumentPictureClassifier.download_models(
            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
            force=force,
            progress=progress,
        )

    if with_code_formula:
        _log.info("Downloading code formula model...")
        CodeFormulaModel.download_models(
            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
            force=force,
            progress=progress,
        )

    if with_easyocr:
        _log.info("Downloading easyocr models...")
        EasyOcrModel.download_models(
            local_dir=output_dir / EasyOcrModel._model_repo_folder,
            force=force,
            progress=progress,
        )

    return output_dir
|
||||
@@ -4,6 +4,9 @@ from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def chunkify(iterator, chunk_size):
|
||||
"""Yield successive chunks of chunk_size from the iterable."""
|
||||
@@ -39,3 +42,24 @@ def create_hash(string: str):
|
||||
hasher.update(string.encode("utf-8"))
|
||||
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Stream the content at ``url`` into an in-memory buffer.

    Args:
        url: Address to fetch; redirects are followed.
        progress: When true, a tqdm byte counter is displayed while streaming.

    Returns:
        A ``BytesIO`` holding the response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail here on HTTP errors rather than buffering an error page as the payload.
        response.raise_for_status()
        # A missing content-length header leaves the total at 0.
        total_size = int(response.headers.get("content-length", 0))
        # The context manager closes the bar even if streaming raises.
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    # Rewind so the caller can read the buffer from the start.
    buf.seek(0)
    return buf
|
||||
|
||||
Reference in New Issue
Block a user