diff --git a/docling/cli/main.py b/docling/cli/main.py index 98a4c8d7..cd2f040b 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -29,6 +29,13 @@ from docling.datamodel.base_models import ( OutputFormat, ) from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_model_specializations import ( + VlmModelType, + granite_vision_vlm_conversion_options, + granite_vision_vlm_ollama_conversion_options, + smoldocling_vlm_conversion_options, + smoldocling_vlm_mlx_conversion_options, +) from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, @@ -39,12 +46,7 @@ from docling.datamodel.pipeline_options import ( PdfPipeline, PdfPipelineOptions, TableFormerMode, - VlmModelType, VlmPipelineOptions, - granite_vision_vlm_conversion_options, - granite_vision_vlm_ollama_conversion_options, - smoldocling_vlm_conversion_options, - smoldocling_vlm_mlx_conversion_options, ) from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 0d28791e..bcd7f237 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,10 +1,6 @@ from enum import Enum from typing import TYPE_CHECKING, Dict, List, Optional, Union -from docling_core.types.io import ( - DocumentStream, -) - from docling_core.types.doc import ( BoundingBox, DocItemLabel, @@ -14,6 +10,9 @@ from docling_core.types.doc import ( TableCell, ) from docling_core.types.doc.page import SegmentedPdfPage, TextCell +from docling_core.types.io import ( + DocumentStream, +) # DO NOT REMOVE; explicitly exposed from this location from PIL.Image import Image @@ -148,11 +147,13 @@ class BasePageElement(BaseModel): class LayoutPrediction(BaseModel): clusters: List[Cluster] = [] + class VlmPredictionToken(BaseModel): text: str = "" token: int = -1 logprob: float = -1 - + + class VlmPrediction(BaseModel): text: str = "" generated_tokens: list[VlmPredictionToken] = [] diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 4aaa8ec5..6cf559e7 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -16,6 +16,12 @@ from pydantic import ( from pydantic_settings import BaseSettings, SettingsConfigDict from typing_extensions import deprecated +from docling.datamodel.pipeline_model_specializations import ( + ApiVlmOptions, + HuggingFaceVlmOptions, + smoldocling_vlm_conversion_options, +) + _log = logging.getLogger(__name__) @@ -121,24 +127,22 @@ class RapidOcrOptions(OcrOptions): lang: List[str] = [ "english", "chinese", - ] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything. - # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/ + ] + # However, language as a parameter is not supported by rapidocr yet + # and hence changing this options doesn't affect anything. 
+ + # For more details on supported languages by RapidOCR visit + # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/ + + # For more details on the following options visit + # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/ - # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/ text_score: float = 0.5 # same default as rapidocr use_det: Optional[bool] = None # same default as rapidocr use_cls: Optional[bool] = None # same default as rapidocr use_rec: Optional[bool] = None # same default as rapidocr - # class Device(Enum): - # CPU = "CPU" - # CUDA = "CUDA" - # DIRECTML = "DIRECTML" - # AUTO = "AUTO" - - # device: Device = Device.AUTO # Default value is AUTO - print_verbose: bool = False # same default as rapidocr det_model_path: Optional[str] = None # same default as rapidocr @@ -243,110 +247,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): return self.repo_id.replace("/", "--") +# SmolVLM smolvlm_picture_description = PictureDescriptionVlmOptions( repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" ) -# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") + +# GraniteVision granite_picture_description = PictureDescriptionVlmOptions( repo_id="ibm-granite/granite-vision-3.1-2b-preview", prompt="What is shown in this image?", ) -class BaseVlmOptions(BaseModel): - kind: str - prompt: str - - -class ResponseFormat(str, Enum): - DOCTAGS = "doctags" - MARKDOWN = "markdown" - HTML = "html" - - -class InferenceFramework(str, Enum): - MLX = "mlx" - TRANSFORMERS = "transformers" - OPENAI = "openai" - TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq" - TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM" - TRANSFORMERS_LlavaForConditionalGeneration = ( - "transformers-LlavaForConditionalGeneration" - ) - - -class HuggingFaceVlmOptions(BaseVlmOptions): - kind: Literal["hf_model_options"] = "hf_model_options" - - repo_id: str - load_in_8bit: bool = True - llm_int8_threshold: float = 6.0 - quantized: bool = False - - inference_framework: InferenceFramework - response_format: ResponseFormat - - scale: float = 2.0 - - use_kv_cache: bool = True - max_new_tokens: int = 4096 - - @property - def repo_cache_folder(self) -> str: - return self.repo_id.replace("/", "--") - - -class ApiVlmOptions(BaseVlmOptions): - kind: Literal["api_model_options"] = "api_model_options" - - url: AnyUrl = AnyUrl( - "http://localhost:11434/v1/chat/completions" - ) # Default to ollama - headers: Dict[str, str] = {} - params: Dict[str, Any] = {} - scale: float = 2.0 - timeout: float = 60 - response_format: ResponseFormat - - -smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions( - repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", - prompt="Convert this page to docling.", - response_format=ResponseFormat.DOCTAGS, - inference_framework=InferenceFramework.MLX, -) - - -smoldocling_vlm_conversion_options = HuggingFaceVlmOptions( - repo_id="ds4sd/SmolDocling-256M-preview", - prompt="Convert this page to docling.", - response_format=ResponseFormat.DOCTAGS, - inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq, -) - -granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( - repo_id="ibm-granite/granite-vision-3.1-2b-preview", - prompt="OCR the full page to markdown.", - response_format=ResponseFormat.MARKDOWN, - 
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
-)
-
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
-    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
-    params={"model": "granite3.2-vision:2b"},
-    prompt="OCR the full page to markdown.",
-    scale=1.0,
-    timeout=120,
-    response_format=ResponseFormat.MARKDOWN,
-)
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 2abe37be..bc9a9317 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -29,7 +29,7 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         self.max_tokens = vlm_options.max_new_tokens
-
+
         if self.enabled:
             try:
                 from mlx_vlm import generate, load  # type: ignore
@@ -60,7 +60,7 @@ class HuggingFaceMlxModel(BasePageModel):
             self.param_question = vlm_options.prompt
 
             ## Load the model
-            self.vlm_model, self.processor = load(artifacts_path) 
+            self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)
 
     def __call__(
@@ -94,8 +94,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     _log.debug("start generating ...")
 
                     # Call model to generate:
-                    tokens:list[VlmPredictionToken] = []
-
+                    tokens: list[VlmPredictionToken] = []
+
                     output = ""
                     for token in self.stream_generate(
                         self.vlm_model,
@@ -105,25 +105,40 @@ class HuggingFaceMlxModel(BasePageModel):
                         max_tokens=4096,
                         verbose=False,
                     ):
-                        if len(token.logprobs.shape)==1:
-                            tokens.append(VlmPredictionToken(text=token.text,
-                                                             token=token.token,
-                                                             logprob=token.logprobs[token.token]))
-                        elif len(token.logprobs.shape)==2 and token.logprobs.shape[0]==1:
-                            tokens.append(VlmPredictionToken(text=token.text,
-                                                             token=token.token,
-                                                             logprob=token.logprobs[0, token.token]))
-
-                        output += token.text
+                        if len(token.logprobs.shape) == 1:
+                            tokens.append(
+                                VlmPredictionToken(
+                                    text=token.text,
+                                    token=token.token,
+                                    logprob=token.logprobs[token.token],
+                                )
+                            )
+                        elif (
+                            len(token.logprobs.shape) == 2
+                            and token.logprobs.shape[0] == 1
+                        ):
+                            tokens.append(
+                                VlmPredictionToken(
+                                    text=token.text,
+                                    token=token.token,
+                                    logprob=token.logprobs[0, token.token],
+                                )
+                            )
+
+                        output += token.text
                         if "</doctag>" in token.text:
                             break
 
                     generation_time = time.time() - start_time
                     page_tags = output
-
-                    _log.debug(f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens)/generation_time} tokens/sec).")
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags,
-                                                                  generation_time=generation_time,
-                                                                  generated_tokens=tokens)
+
+                    _log.debug(
+                        f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
+ ) + page.predictions.vlm_response = VlmPrediction( + text=page_tags, + generation_time=generation_time, + generated_tokens=tokens, + ) yield page diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py index 3299205d..8b4022d3 100644 --- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py +++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py @@ -43,17 +43,18 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel): self.device = decide_device(accelerator_options.device) - if self.device=="mlx": - _log.warning(f"Mapping mlx to cpu for AutoModelForCausalLM") - self.device = cpu - + if self.device == "mlx": + _log.warning( + "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!" + ) + self.device = "cpu" + self.use_cache = vlm_options.use_kv_cache self.max_new_tokens = vlm_options.max_new_tokens _log.debug(f"Available device for VLM: {self.device}") repo_cache_folder = vlm_options.repo_id.replace("/", "--") - # PARAMETERS: if artifacts_path is None: artifacts_path = HuggingFaceVlmModel.download_models( self.vlm_options.repo_id @@ -117,8 +118,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel): with TimeRecorder(conv_res, "vlm"): assert page.size is not None - hi_res_image = page.get_image(scale=self.vlm_options.scale) # 144dpi - # hi_res_image = page.get_image(scale=1.0) # 72dpi + hi_res_image = page.get_image(scale=self.vlm_options.scale) if hi_res_image is not None: im_width, im_height = hi_res_image.size @@ -157,14 +157,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel): _log.debug( f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." ) - - # inference_time = time.time() - start_time - # tokens_per_second = num_tokens / generation_time - # print("") - # print(f"Page Inference Time: {inference_time:.2f} seconds") - # print(f"Total tokens on page: {num_tokens:.2f}") - # print(f"Tokens/sec: {tokens_per_second:.2f}") - # print("") page.predictions.vlm_response = VlmPrediction(text=response) yield page @@ -172,11 +164,12 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel): def formulate_prompt(self) -> str: """Formulate a prompt for the VLM.""" if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": + # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally + user_prompt = "<|user|>" assistant_prompt = "<|assistant|>" prompt_suffix = "<|end|>" - # prompt = f"{user_prompt}<|image_1|>Convert this image into MarkDown and only return the bare MarkDown!{prompt_suffix}{assistant_prompt}" prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}" _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}") diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py index b4313a5f..6b9f352b 100644 --- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py +++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py @@ -38,10 +38,8 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): BitsAndBytesConfig, ) - device = decide_device(accelerator_options.device) - self.device = device - - _log.debug(f"Available device for HuggingFace VLM: {device}") + self.device = decide_device(accelerator_options.device) + _log.debug(f"Available device for HuggingFace VLM: {self.device}") 
repo_cache_folder = vlm_options.repo_id.replace("/", "--") @@ -54,7 +52,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): elif (artifacts_path / repo_cache_folder).exists(): artifacts_path = artifacts_path / repo_cache_folder - self.param_question = vlm_options.prompt # "Perform Layout Analysis." + # self.param_question = vlm_options.prompt # "Perform Layout Analysis." self.param_quantization_config = BitsAndBytesConfig( load_in_8bit=vlm_options.load_in_8bit, # True, llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0 @@ -68,7 +66,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): if not self.param_quantized: self.vlm_model = AutoModelForVision2Seq.from_pretrained( artifacts_path, - device_map=device, + device_map=self.device, torch_dtype=torch.bfloat16, _attn_implementation=( "flash_attention_2" @@ -82,7 +80,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): else: self.vlm_model = AutoModelForVision2Seq.from_pretrained( artifacts_path, - device_map=device, + device_map=self.device, torch_dtype="auto", quantization_config=self.param_quantization_config, _attn_implementation=( @@ -94,29 +92,6 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): # trust_remote_code=True, ) # .to(self.device) - """ - @staticmethod - def download_models( - repo_id: str, - local_dir: Optional[Path] = None, - force: bool = False, - progress: bool = False, - ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( - repo_id=repo_id, - force_download=force, - local_dir=local_dir, - # revision="v0.0.1", - ) - - return Path(download_path) - """ - def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: @@ -128,8 +103,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): with TimeRecorder(conv_res, "vlm"): assert page.size is not None - hi_res_image = page.get_image(scale=2.0) # 144dpi - # hi_res_image = page.get_image(scale=1.0) # 72dpi + hi_res_image = page.get_image(scale=self.vlm_options.scale) if hi_res_image is not None: im_width, im_height = hi_res_image.size @@ -141,22 +115,9 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): if hi_res_image.mode != "RGB": hi_res_image = hi_res_image.convert("RGB") - messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "This is a page from a document.", - }, - {"type": "image"}, - {"type": "text", "text": self.param_question}, - ], - } - ] - prompt = self.processor.apply_chat_template( - messages, add_generation_prompt=False - ) + # Define prompt structure + prompt = self.formulate_prompt() + inputs = self.processor( text=prompt, images=[hi_res_image], return_tensors="pt" ) @@ -180,14 +141,26 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel): _log.debug( f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." 
) - - # inference_time = time.time() - start_time - # tokens_per_second = num_tokens / generation_time - # print("") - # print(f"Page Inference Time: {inference_time:.2f} seconds") - # print(f"Total tokens on page: {num_tokens:.2f}") - # print(f"Tokens/sec: {tokens_per_second:.2f}") - # print("") page.predictions.vlm_response = VlmPrediction(text=page_tags) yield page + + def formulate_prompt(self) -> str: + """Formulate a prompt for the VLM.""" + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "This is a page from a document.", + }, + {"type": "image"}, + {"type": "text", "text": self.vlm_options.prompt}, + ], + } + ] + prompt = self.processor.apply_chat_template( + messages, add_generation_prompt=False + ) + return prompt diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py index 80dfdae2..304f8c0f 100644 --- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py +++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py @@ -39,10 +39,15 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel): ) self.device = decide_device(accelerator_options.device) - self.device = "cpu" # FIXME - self.use_cache = True - self.max_new_tokens = 64 # FIXME + if self.device == "mlx": + _log.warning( + "Mapping mlx to cpu for LlavaForConditionalGeneration, use MLX framework!" + ) + self.device = "cpu" + + self.use_cache = vlm_options.use_kv_cache + self.max_new_tokens = vlm_options.max_new_tokens _log.debug(f"Available device for VLM: {self.device}") repo_cache_folder = vlm_options.repo_id.replace("/", "--") @@ -54,9 +59,6 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel): elif (artifacts_path / repo_cache_folder).exists(): artifacts_path = artifacts_path / repo_cache_folder - model_path = artifacts_path - _log.debug(f"model: {model_path}") - self.processor = AutoProcessor.from_pretrained( artifacts_path, trust_remote_code=self.trust_remote_code, @@ -98,12 +100,11 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel): images = [hi_res_image] # Define prompt structure - # prompt = "[INST]Describe the images.\n[IMG][/INST]" prompt = self.formulate_prompt() inputs = self.processor( text=prompt, images=images, return_tensors="pt" - ).to(self.device) # .to("cuda") + ).to(self.device) # Generate response start_time = time.time() @@ -113,8 +114,6 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel): use_cache=self.use_cache, # Enables KV caching which can improve performance ) - print(generate_ids) - num_tokens = len(generate_ids[0]) generation_time = time.time() - start_time @@ -123,10 +122,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel): skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0] - - page.predictions.vlm_response = VlmPrediction(text=response, - generated_tokens=num_tokens, - generation_time=generation_time) + + page.predictions.vlm_response = VlmPrediction( + text=response, + generated_tokens=num_tokens, + generation_time=generation_time, + ) yield page diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index b90a17dd..39d6bb33 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -13,11 +13,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from 
docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import ( +from docling.datamodel.pipeline_model_specializations import ( ApiVlmOptions, HuggingFaceVlmOptions, InferenceFramework, ResponseFormat, +) +from docling.datamodel.pipeline_options import ( VlmPipelineOptions, ) from docling.datamodel.settings import settings diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index eb20f255..e2b4b194 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -2,10 +2,12 @@ import logging from pathlib import Path from typing import Optional -from docling.datamodel.pipeline_options import ( - granite_picture_description, +from docling.datamodel.pipeline_model_specializations import ( smoldocling_vlm_conversion_options, smoldocling_vlm_mlx_conversion_options, +) +from docling.datamodel.pipeline_options import ( + granite_picture_description, smolvlm_picture_description, ) from docling.datamodel.settings import settings diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py index 4681085f..c2112be4 100644 --- a/docs/examples/minimal_vlm_pipeline.py +++ b/docs/examples/minimal_vlm_pipeline.py @@ -11,10 +11,15 @@ from docling.datamodel.pipeline_options import ( InferenceFramework, ResponseFormat, VlmPipelineOptions, - smoldocling_vlm_mlx_conversion_options, - smoldocling_vlm_conversion_options, granite_vision_vlm_conversion_options, + granite_vision_vlm_mlx_conversion_options, granite_vision_vlm_ollama_conversion_options, + phi_vlm_conversion_options, + pixtral_12b_vlm_conversion_options, + pixtral_12b_vlm_mlx_conversion_options, + qwen25_vl_3b_vlm_mlx_conversion_options, + smoldocling_vlm_conversion_options, + smoldocling_vlm_mlx_conversion_options, ) from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline @@ -28,6 +33,7 @@ sources = [ pipeline_options = VlmPipelineOptions() # If force_backend_text = True, text from backend will be used instead of generated text pipeline_options.force_backend_text = False +pipeline_options.generate_page_images = True ## On GPU systems, enable flash_attention_2 with CUDA: # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA @@ -37,11 +43,13 @@ pipeline_options.force_backend_text = False # pipeline_options.vlm_options = smoldocling_vlm_conversion_options ## Pick a VLM model. 
Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX -pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options +# pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options ## Alternative VLM models: # pipeline_options.vlm_options = granite_vision_vlm_conversion_options +pipeline_options.vlm_options = phi_vlm_conversion_options + """ pixtral_vlm_conversion_options = HuggingFaceVlmOptions( repo_id="mistralai/Pixtral-12B-Base-2409", @@ -105,7 +113,7 @@ converter = DocumentConverter( pipeline_cls=VlmPipeline, pipeline_options=pipeline_options, ), - } + }, ) out_path = Path("scratch") @@ -121,39 +129,44 @@ for source in sources: res = converter.convert(source) print("") - #print(res.document.export_to_markdown()) + # print(res.document.export_to_markdown()) - for i,page in enumerate(res.pages): + model_id = pipeline_options.vlm_options.repo_id.replace("/", "_") + fname = f"{model_id}-{res.input.file.stem}" + + for i, page in enumerate(res.pages): print("") - print(f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:") + print( + f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:" + ) print(page.predictions.vlm_response.text) - print(f" ---------- ") + print(" ---------- ") print("===== Final output of the converted document =======") - - with (out_path / f"{res.input.file.stem}.json").open("w") as fp: + + with (out_path / f"{fname}.json").open("w") as fp: fp.write(json.dumps(res.document.export_to_dict())) res.document.save_as_json( - out_path / f"{res.input.file.stem}.json", + out_path / f"{fname}.json", image_mode=ImageRefMode.PLACEHOLDER, ) - print(f" => produced {out_path / res.input.file.stem}.json") - + print(f" => produced {out_path / fname}.json") + res.document.save_as_markdown( - out_path / f"{res.input.file.stem}.md", + out_path / f"{fname}.md", image_mode=ImageRefMode.PLACEHOLDER, ) - print(f" => produced {out_path / res.input.file.stem}.md") - + print(f" => produced {out_path / fname}.md") + res.document.save_as_html( - out_path / f"{res.input.file.stem}.html", + out_path / f"{fname}.html", image_mode=ImageRefMode.EMBEDDED, labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], - # split_page_view=True, + split_page_view=True, ) - print(f" => produced {out_path / res.input.file.stem}.html") - + print(f" => produced {out_path / fname}.html") + pg_num = res.document.num_pages() print("") inference_time = time.time() - start_time @@ -161,4 +174,3 @@ for source in sources: f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}" ) print("====================================================") -
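
After this refactor, the VLM model presets are imported from docling.datamodel.pipeline_model_specializations, while the pipeline options themselves stay in docling.datamodel.pipeline_options. Below is a minimal usage sketch based on the imports and the example touched in this diff; the input source is a placeholder, not part of the change.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_model_specializations import (
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Pick one of the relocated VLM presets and enable page images for the pipeline.
pipeline_options = VlmPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.vlm_options = smoldocling_vlm_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    },
)

result = converter.convert("https://arxiv.org/pdf/2305.03393v1")  # placeholder source
print(result.document.export_to_markdown())

Existing callers only need to update the import location; the option classes and their fields are unchanged.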