From 800668300766225daebbeb8557090736e7bde7cb Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Mon, 2 Jun 2025 12:58:32 +0200
Subject: [PATCH] remove hf_vlm_model and add extra_generation_config

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 .../datamodel/pipeline_options_vlm_model.py   |  3 +-
 docling/datamodel/vlm_model_specs.py          |  1 +
 docling/models/code_formula_model.py          | 15 +++----
 docling/models/document_picture_classifier.py | 15 +++----
 docling/models/hf_vlm_model.py                | 28 -------------
 docling/models/layout_model.py                | 15 +++----
 .../models/picture_description_vlm_model.py   | 27 +++----------
 docling/models/table_structure_model.py       | 15 +++----
 docling/models/utils/__init__.py              |  0
 docling/models/utils/hf_model_download.py     | 40 +++++++++++++++++++
 .../hf_transformers_causallm_model.py         | 14 ++++---
 .../hf_transformers_vision2seq_model.py       | 13 +++---
 docling/models/vlm_models_inline/mlx_model.py | 12 +++---
 docling/utils/model_downloader.py             | 10 ++---
 14 files changed, 94 insertions(+), 114 deletions(-)
 delete mode 100644 docling/models/hf_vlm_model.py
 create mode 100644 docling/models/utils/__init__.py
 create mode 100644 docling/models/utils/hf_model_download.py

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 07185107..d4510318 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -46,7 +46,8 @@ class InlineVlmOptions(BaseVlmOptions):
     scale: float = 2.0
 
     temperature: float = 0.0
-    stop_strings: list[str] = []
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
 
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py
index 663cbd73..85ad1b40 100644
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@@ -97,6 +97,7 @@ PHI4_TRANSFORMERS = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
     scale=2.0,
     temperature=0.0,
+    extra_generation_config=dict(num_logits_to_keep=0),
 )
 
 # Qwen
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
index eae0b82a..19a831ab 100644
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -19,6 +19,7 @@ from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/CodeFormula",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.2",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if a given element in a document can be processed by the model.
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 10216306..73a30203 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -15,6 +15,7 @@ from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.models.base_model import BaseEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/DocumentFigureClassifier",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.1",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if the given element can be processed by the classifier.
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
deleted file mode 100644
index 20a1c7dd..00000000
--- a/docling/models/hf_vlm_model.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Optional
-
-_log = logging.getLogger(__name__)
-
-
-class HuggingFaceVlmModel:
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-            # revision="v0.0.1",
-        )
-
-        return Path(download_path)
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index 68c807c9..d8e9c032 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -15,6 +15,7 @@ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
@@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
+            revision="v2.2.0",
             local_dir=local_dir,
-            revision="v2.1.0",
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False
     ):
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 5a79f1b6..230151d6 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -10,10 +10,15 @@ from docling.datamodel.pipeline_options import (
     PictureDescriptionVlmOptions,
 )
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 
 
-class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+class PictureDescriptionVlmModel(
+    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
+):
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
 
             self.provenance = f"{self.options.repo_id}"
 
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-        )
-
-        return Path(download_path)
-
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         from transformers import GenerationConfig
 
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 55a04afb..b90e85d5 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -22,6 +22,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -89,20 +90,14 @@ class TableStructureModel(BasePageModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
             revision="v2.2.0",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_table_and_cells(
         self,
         conv_res: ConversionResult,
diff --git a/docling/models/utils/__init__.py b/docling/models/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docling/models/utils/hf_model_download.py b/docling/models/utils/hf_model_download.py
new file mode 100644
index 00000000..3595166a
--- /dev/null
+++ b/docling/models/utils/hf_model_download.py
@@ -0,0 +1,72 @@
+"""Helpers for downloading model snapshots from the Hugging Face Hub."""
+
+import logging
+from pathlib import Path
+from typing import Optional
+
+_log = logging.getLogger(__name__)
+
+
+def download_hf_model(
+    repo_id: str,
+    local_dir: Optional[Path] = None,
+    force: bool = False,
+    progress: bool = False,
+    revision: Optional[str] = None,
+) -> Path:
+    """Download a snapshot of a Hugging Face Hub repository.
+
+    Args:
+        repo_id: Identifier of the repository on the Hub.
+        local_dir: Directory passed to ``snapshot_download`` as ``local_dir``.
+        force: Passed to ``snapshot_download`` as ``force_download``.
+        progress: When false, ``disable_progress_bars()`` is called before
+            downloading.
+        revision: Revision passed to ``snapshot_download``; ``None`` leaves
+            the choice to ``huggingface_hub``.
+
+    Returns:
+        The path returned by ``snapshot_download``, wrapped in ``Path``.
+    """
+    # Imported lazily so that importing this module does not import
+    # huggingface_hub.
+    from huggingface_hub import snapshot_download
+    from huggingface_hub.utils import disable_progress_bars
+
+    if not progress:
+        # NOTE(review): nothing here re-enables the bars afterwards, so this
+        # presumably stays in effect for later downloads -- confirm.
+        disable_progress_bars()
+    download_path = snapshot_download(
+        repo_id=repo_id,
+        force_download=force,
+        local_dir=local_dir,
+        revision=revision,
+    )
+
+    return Path(download_path)
+
+
+class HuggingFaceModelDownloadMixin:
+    """Mixin that gives a model class a ``download_models`` static method."""
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+        revision: Optional[str] = None,
+    ) -> Path:
+        """Download ``repo_id`` by delegating to ``download_hf_model``.
+
+        ``revision`` is optional and defaults to ``None``, so existing callers
+        are unaffected.
+        """
+        return download_hf_model(
+            repo_id=repo_id,
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
+            revision=revision,
+        )
diff --git a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
index d3d6a93d..aef23b79 100644
--- a/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_causallm_model.py
@@ -12,14 +12,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
+class HuggingFaceVlmModel_AutoModelForCausalLM(
+    BasePageModel, HuggingFaceModelDownloadMixin
+):
     def __init__(
         self,
         enabled: bool,
@@ -62,9 +66,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
             if artifacts_path is None:
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
@@ -128,7 +130,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         use_cache=self.use_cache,  # Enables KV caching which can improve performance
                         temperature=self.temperature,
                         generation_config=self.generation_config,
-                        num_logits_to_keep=1,
+                        **self.vlm_options.extra_generation_config,
                     )
                     generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
 
diff --git a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
index 6a54d241..f92b9292 100644
--- a/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_vision2seq_model.py
@@ -11,14 +11,18 @@ from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
+class HuggingFaceVlmModel_AutoModelForVision2Seq(
+    BasePageModel, HuggingFaceModelDownloadMixin
+):
     def __init__(
         self,
         enabled: bool,
@@ -52,10 +56,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
 
             # PARAMETERS:
             if artifacts_path is None:
-                # artifacts_path = self.download_models(self.vlm_options.repo_id)
-                artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
-                )
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index 0949665e..d8b90407 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -11,13 +11,15 @@ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToke
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceMlxModel(BasePageModel):
+class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -48,12 +50,8 @@ class HuggingFaceMlxModel(BasePageModel):
 
             # PARAMETERS:
             if artifacts_path is None:
-                _log.debug(
-                    f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}"
-                )
-                artifacts_path = HuggingFaceVlmModel.download_models(
+                artifacts_path = self.download_models(
                     self.vlm_options.repo_id,
-                    progress=True,
                 )
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index f8237fbc..55383c03 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -14,10 +14,10 @@ from docling.datamodel.vlm_model_specs import (
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.models.layout_model import LayoutModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.utils.hf_model_download import download_hf_model
 
 _log = logging.getLogger(__name__)
 
@@ -77,7 +77,7 @@ def download_models(
 
     if with_smolvlm:
         _log.info("Downloading SmolVlm model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
             force=force,
@@ -86,7 +86,7 @@ def download_models(
 
     if with_smoldocling:
         _log.info("Downloading SmolDocling model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
             repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
             local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
             force=force,
@@ -95,7 +95,7 @@ def download_models(
 
     if with_smoldocling_mlx:
         _log.info("Downloading SmolDocling MLX model...")
-        HuggingFaceVlmModel.download_models(
+        download_hf_model(
             repo_id=SMOLDOCLING_MLX.repo_id,
             local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
             force=force,
@@ -104,7 +104,7 @@ def download_models(
 
     if with_granite_vision:
         _log.info("Downloading Granite Vision model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
             force=force,