diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 7082da3b..2a7350c8 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -119,16 +119,16 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 # Pixtral
 pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
     scale=2.0,
     temperature=0.0,
 )
 
 pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -138,7 +138,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Phi4
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
-    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
     scale=2.0,
@@ -148,7 +148,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 # Qwen
 qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -158,7 +158,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Gemma-3
 gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -167,7 +167,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
 
 gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-27b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index e0c09c88..99dc32b1 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -116,7 +116,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-                    print(hi_res_image)
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
@@ -127,7 +126,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     inputs = self.processor(
                         text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  # .to(self.device)
+                    ).to(self.device)
 
                     # Generate response
                     start_time = time.time()
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index 69154d77..6de02808 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -40,7 +40,6 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
             self.device = decide_device(accelerator_options.device)
             self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-            _log.debug(f"Available device for HuggingFace VLM: {self.device}")
 
             self.use_cache = vlm_options.use_kv_cache
@@ -73,7 +72,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
             self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
-                torch_dtype=torch.bfloat16,
+                # torch_dtype=torch.bfloat16,
                 _attn_implementation=(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/compare_vlm_models.py
similarity index 51%
rename from docs/examples/minimal_vlm_pipeline.py
rename to docs/examples/compare_vlm_models.py
index 32aecc6d..2d8915de 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/compare_vlm_models.py
@@ -1,3 +1,9 @@
+# Compare VLM models
+# ==================
+#
+# This example runs the VLM pipeline with different vision-language models.
+# Their runtime as well as output quality is compared.
+
 import json
 import time
 from pathlib import Path
@@ -8,9 +14,6 @@
 from tabulate import tabulate
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
-    HuggingFaceVlmOptions,
-    InferenceFramework,
-    ResponseFormat,
     gemma_3_12b_mlx_conversion_options,
     granite_vision_vlm_conversion_options,
     granite_vision_vlm_ollama_conversion_options,
@@ -27,96 +30,24 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
-## Use experimental VlmPipeline
-pipeline_options = VlmPipelineOptions()
-# If force_backend_text = True, text from backend will be used instead of generated text
-pipeline_options.force_backend_text = False
-pipeline_options.generate_page_images = True
-
-## On GPU systems, enable flash_attention_2 with CUDA:
-# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
-# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
-
-## Pick a VLM model. We choose SmolDocling-256M by default
-# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
-
-## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
-# pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
-
-## Alternative VLM models:
-# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
-
-pipeline_options.vlm_options = phi_vlm_conversion_options
-# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mistralai/Pixtral-12B-Base-2409",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mistral-community/pixtral-12b",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="microsoft/Phi-4-multimodal-instruct",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
-)
-pipeline_options.vlm_options = phi_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-    scale=1.0,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-qwen_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-)
-pipeline_options.vlm_options = qwen_vlm_conversion_options
-"""
-
-
-def convert(sources: list[Path], converter):
+def convert(sources: list[Path], converter: DocumentConverter):
+    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
+    framework = pipeline_options.vlm_options.inference_framework
     for source in sources:
-        # start_time = time.time()
         print("================================================")
-        print(f"Processing... {source}")
+        print("Processing...")
+        print(f"Source: {source}")
+        print("---")
+        print(f"Model: {model_id}")
+        print(f"Framework: {framework}")
         print("================================================")
         print("")
 
         res = converter.convert(source)
 
         print("")
-        # print(res.document.export_to_markdown())
 
-        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-        framework = pipeline_options.vlm_options.inference_framework
         fname = f"{res.input.file.stem}-{model_id}-{framework}"
 
         inference_time = 0.0
@@ -161,11 +92,10 @@ def convert(sources: list[Path], converter):
         )
         print("====================================================")
 
-        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
         return [
             source,
             model_id,
-            framework,
+            str(framework),
            pg_num,
             inference_time,
         ]
@@ -173,7 +103,6 @@
 
 if __name__ == "__main__":
     sources = [
-        # "tests/data/2305.03393v1-pg9-img.png",
         "tests/data/pdf/2305.03393v1-pg9.pdf",
     ]
 
@@ -182,9 +111,6 @@ if __name__ == "__main__":
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
-
-    # If force_backend_text = True, text from backend will be used instead of generated text
-    pipeline_options.force_backend_text = False
     pipeline_options.generate_page_images = True
 
     ## On GPU systems, enable flash_attention_2 with CUDA:
@@ -193,14 +119,17 @@ if __name__ == "__main__":
 
     rows = []
     for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options,
-        # granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        ## DocTags / SmolDocling models
+        smoldocling_vlm_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options,
+        ## Markdown models (using MLX framework)
+        # qwen25_vl_3b_vlm_mlx_conversion_options,
         # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
-        gemma_3_12b_mlx_conversion_options,
+        # gemma_3_12b_mlx_conversion_options,
+        ## Markdown models (using Transformers framework)
+        # granite_vision_vlm_conversion_options,
+        phi_vlm_conversion_options,
+        pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
 
@@ -219,11 +148,13 @@ if __name__ == "__main__":
         )
 
         row = convert(sources=sources, converter=converter)
-        print("pipelines: \n", converter._get_initialized_pipelines())
-
         rows.append(row)
 
-        print(tabulate(rows))
+        print(
+            tabulate(
+                rows, headers=["source", "model_id", "framework", "num_pages", "time"]
+            )
+        )
 
         print("see if memory gets released ...")
         time.sleep(10)
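Review note: with the rename, docs/examples/compare_vlm_models.py becomes the single place to benchmark several VLM presets in one run. For a quick manual check of this patch outside the example, here is a minimal sketch that mirrors the converter setup the example uses. The PdfFormatOption/VlmPipeline wiring is not visible in the hunks above, so treat that part as an assumption based on the existing example rather than as part of this change; the test page is the same one the script uses.

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_model_specializations import (
    phi_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Same test page as the example script.
source = Path("tests/data/pdf/2305.03393v1-pg9.pdf")

for vlm_options in (smoldocling_vlm_conversion_options, phi_vlm_conversion_options):
    # Fresh pipeline options per model, mirroring the loop in compare_vlm_models.py.
    pipeline_options = VlmPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.vlm_options = vlm_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    res = converter.convert(source)
    # Print a short markdown preview per model as a sanity check.
    print(vlm_options.repo_id)
    print(res.document.export_to_markdown()[:500])
```

Running this on a CUDA machine also exercises the new `.to(self.device)` call in HuggingFaceVlmModel_AutoModelForCausalLM, which previously left the processor inputs on the CPU.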