remove hf_vlm_model and add extra_generation_config

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-06-02 12:58:32 +02:00
parent c0847c97a7
commit 8006683007
14 changed files with 94 additions and 114 deletions

View File

@ -46,7 +46,8 @@ class InlineVlmOptions(BaseVlmOptions):
scale: float = 2.0 scale: float = 2.0
temperature: float = 0.0 temperature: float = 0.0
stop_strings: list[str] = [] stop_strings: List[str] = []
extra_generation_config: Dict[str, Any] = {}
use_kv_cache: bool = True use_kv_cache: bool = True
max_new_tokens: int = 4096 max_new_tokens: int = 4096

View File

@ -97,6 +97,7 @@ PHI4_TRANSFORMERS = InlineVlmOptions(
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA], supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
scale=2.0, scale=2.0,
temperature=0.0, temperature=0.0,
extra_generation_config=dict(num_logits_to_keep=0),
) )
# Qwen # Qwen

View File

@ -19,6 +19,7 @@ from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ItemAndImageEnrichmentElement from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.models.base_model import BaseItemAndImageEnrichmentModel from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
force: bool = False, force: bool = False,
progress: bool = False, progress: bool = False,
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download return download_hf_model(
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/CodeFormula", repo_id="ds4sd/CodeFormula",
force_download=force,
local_dir=local_dir,
revision="v1.0.2", revision="v1.0.2",
local_dir=local_dir,
force=force,
progress=progress,
) )
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
""" """
Determines if a given element in a document can be processed by the model. Determines if a given element in a document can be processed by the model.

View File

@ -15,6 +15,7 @@ from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.models.base_model import BaseEnrichmentModel from docling.models.base_model import BaseEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
def download_models( def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download return download_hf_model(
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/DocumentFigureClassifier", repo_id="ds4sd/DocumentFigureClassifier",
force_download=force,
local_dir=local_dir,
revision="v1.0.1", revision="v1.0.1",
local_dir=local_dir,
force=force,
progress=progress,
) )
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
""" """
Determines if the given element can be processed by the classifier. Determines if the given element can be processed by the classifier.

View File

@ -1,28 +0,0 @@
import logging
from pathlib import Path
from typing import Optional
_log = logging.getLogger(__name__)
class HuggingFaceVlmModel:
    """Namespace class exposing a static helper that downloads a Hub repository."""

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download a snapshot of ``repo_id`` and return where it was stored.

        Args:
            repo_id: Repository identifier handed to ``snapshot_download``.
            local_dir: Forwarded as ``local_dir``; ``None`` is passed through
                unchanged.
            force: Forwarded as ``force_download``.
            progress: When falsy, ``disable_progress_bars()`` is called before
                the download starts. This function does not revert that call.

        Returns:
            The value returned by ``snapshot_download`` wrapped in a ``Path``.
        """
        # huggingface_hub is imported inside the function body, so it is only
        # needed once a download is actually requested.
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        show_bars = bool(progress)
        if not show_bars:
            disable_progress_bars()

        # No revision is passed to snapshot_download.
        snapshot_kwargs = {
            "repo_id": repo_id,
            "force_download": force,
            "local_dir": local_dir,
        }
        return Path(snapshot_download(**snapshot_kwargs))

View File

@ -15,6 +15,7 @@ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
force: bool = False, force: bool = False,
progress: bool = False, progress: bool = False,
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download return download_hf_model(
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models", repo_id="ds4sd/docling-models",
force_download=force, revision="v2.2.0",
local_dir=local_dir, local_dir=local_dir,
revision="v2.1.0", force=force,
progress=progress,
) )
return Path(download_path)
def draw_clusters_and_cells_side_by_side( def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False self, conv_res, page, clusters, mode_prefix: str, show: bool = False
): ):

View File

@ -10,10 +10,15 @@ from docling.datamodel.pipeline_options import (
PictureDescriptionVlmOptions, PictureDescriptionVlmOptions,
) )
from docling.models.picture_description_base_model import PictureDescriptionBaseModel from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel): class PictureDescriptionVlmModel(
PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
):
@classmethod @classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]: def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionVlmOptions return PictureDescriptionVlmOptions
@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self.provenance = f"{self.options.repo_id}" self.provenance = f"{self.options.repo_id}"
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
)
return Path(download_path)
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig from transformers import GenerationConfig

View File

@ -22,6 +22,7 @@ from docling.datamodel.pipeline_options import (
) )
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -89,20 +90,14 @@ class TableStructureModel(BasePageModel):
def download_models( def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download return download_hf_model(
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models", repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.2.0", revision="v2.2.0",
local_dir=local_dir,
force=force,
progress=progress,
) )
return Path(download_path)
def draw_table_and_cells( def draw_table_and_cells(
self, self,
conv_res: ConversionResult, conv_res: ConversionResult,

View File

View File

@ -0,0 +1,40 @@
import logging
from pathlib import Path
from typing import Optional
_log = logging.getLogger(__name__)
def download_hf_model(
    repo_id: str,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
    revision: Optional[str] = None,
) -> Path:
    """Download a snapshot of a Hugging Face Hub repository.

    Args:
        repo_id: Repository identifier handed to ``snapshot_download``.
        local_dir: Forwarded as ``local_dir``; ``None`` is passed through
            unchanged.
        force: Forwarded as ``force_download``.
        progress: When falsy, ``disable_progress_bars()`` is called before the
            download starts. This function does not revert that call.
        revision: Forwarded as ``revision``; ``None`` is passed through
            unchanged.

    Returns:
        The value returned by ``snapshot_download`` wrapped in a ``Path``.
    """
    # huggingface_hub is imported inside the function body, so it is only
    # needed once a download is actually requested.
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars

    show_bars = bool(progress)
    if not show_bars:
        disable_progress_bars()

    snapshot_kwargs = {
        "repo_id": repo_id,
        "force_download": force,
        "local_dir": local_dir,
        "revision": revision,
    }
    return Path(snapshot_download(**snapshot_kwargs))
class HuggingFaceModelDownloadMixin:
    """Mixin that gives a model class a static ``download_models`` helper.

    The helper is a thin wrapper around ``download_hf_model``.
    """

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
        revision: Optional[str] = None,
    ) -> Path:
        """Download a Hub repository via ``download_hf_model``.

        Args:
            repo_id: Repository identifier to download.
            local_dir: Forwarded unchanged as ``local_dir``.
            force: Forwarded unchanged as ``force``.
            progress: Forwarded unchanged as ``progress``.
            revision: Forwarded unchanged as ``revision``. Defaults to
                ``None``, which matches the helper's own default, so existing
                callers that omit it behave as before.

        Returns:
            Whatever ``download_hf_model`` returns (annotated as ``Path``).
        """
        return download_hf_model(
            repo_id=repo_id,
            local_dir=local_dir,
            force=force,
            progress=progress,
            revision=revision,
        )

View File

@ -12,14 +12,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel): class HuggingFaceVlmModel_AutoModelForCausalLM(
BasePageModel, HuggingFaceModelDownloadMixin
):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -62,9 +66,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
repo_cache_folder = vlm_options.repo_id.replace("/", "--") repo_cache_folder = vlm_options.repo_id.replace("/", "--")
if artifacts_path is None: if artifacts_path is None:
artifacts_path = HuggingFaceVlmModel.download_models( artifacts_path = self.download_models(self.vlm_options.repo_id)
self.vlm_options.repo_id
)
elif (artifacts_path / repo_cache_folder).exists(): elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder artifacts_path = artifacts_path / repo_cache_folder
@ -128,7 +130,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
use_cache=self.use_cache, # Enables KV caching which can improve performance use_cache=self.use_cache, # Enables KV caching which can improve performance
temperature=self.temperature, temperature=self.temperature,
generation_config=self.generation_config, generation_config=self.generation_config,
num_logits_to_keep=1, **self.vlm_options.extra_generation_config,
) )
generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :] generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]

View File

@ -11,14 +11,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): class HuggingFaceVlmModel_AutoModelForVision2Seq(
BasePageModel, HuggingFaceModelDownloadMixin
):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -52,10 +56,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
# PARAMETERS: # PARAMETERS:
if artifacts_path is None: if artifacts_path is None:
# artifacts_path = self.download_models(self.vlm_options.repo_id) artifacts_path = self.download_models(self.vlm_options.repo_id)
artifacts_path = HuggingFaceVlmModel.download_models(
self.vlm_options.repo_id
)
elif (artifacts_path / repo_cache_folder).exists(): elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder artifacts_path = artifacts_path / repo_cache_folder

View File

@ -11,13 +11,15 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel): class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -48,12 +50,8 @@ class HuggingFaceMlxModel(BasePageModel):
# PARAMETERS: # PARAMETERS:
if artifacts_path is None: if artifacts_path is None:
_log.debug( artifacts_path = self.download_models(
f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}"
)
artifacts_path = HuggingFaceVlmModel.download_models(
self.vlm_options.repo_id, self.vlm_options.repo_id,
progress=True,
) )
elif (artifacts_path / repo_cache_folder).exists(): elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder artifacts_path = artifacts_path / repo_cache_folder

View File

@ -14,10 +14,10 @@ from docling.datamodel.vlm_model_specs import (
from docling.models.code_formula_model import CodeFormulaModel from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.utils.hf_model_download import download_hf_model
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -77,7 +77,7 @@ def download_models(
if with_smolvlm: if with_smolvlm:
_log.info("Downloading SmolVlm model...") _log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models( download_hf_model(
repo_id=smolvlm_picture_description.repo_id, repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
force=force, force=force,
@ -86,7 +86,7 @@ def download_models(
if with_smoldocling: if with_smoldocling:
_log.info("Downloading SmolDocling model...") _log.info("Downloading SmolDocling model...")
HuggingFaceVlmModel.download_models( download_hf_model(
repo_id=SMOLDOCLING_TRANSFORMERS.repo_id, repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder, local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
force=force, force=force,
@ -95,7 +95,7 @@ def download_models(
if with_smoldocling_mlx: if with_smoldocling_mlx:
_log.info("Downloading SmolDocling MLX model...") _log.info("Downloading SmolDocling MLX model...")
HuggingFaceVlmModel.download_models( download_hf_model(
repo_id=SMOLDOCLING_MLX.repo_id, repo_id=SMOLDOCLING_MLX.repo_id,
local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder, local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
force=force, force=force,
@ -104,7 +104,7 @@ def download_models(
if with_granite_vision: if with_granite_vision:
_log.info("Downloading Granite Vision model...") _log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models( download_hf_model(
repo_id=granite_picture_description.repo_id, repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder, local_dir=output_dir / granite_picture_description.repo_cache_folder,
force=force, force=force,