From f159075b676a2366041ec187657a433e6d0b52ff Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Wed, 14 May 2025 07:39:20 +0200
Subject: [PATCH] pixtral 12b runs via MLX and native transformers

Signed-off-by: Peter Staar
---
 docling/models/hf_vlm_models/hf_vlm_mlx_model.py | 12 +++++++++++-
 docs/examples/minimal_vlm_pipeline.py            |  6 +++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 4dc90bf7..c6137b18 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -40,22 +40,29 @@ class HuggingFaceMlxModel(BasePageModel):
                 )
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
+            print(f"model init: {repo_cache_folder}")
+
             self.apply_chat_template = apply_chat_template
             self.stream_generate = stream_generate
             # PARAMETERS:
             if artifacts_path is None:
+                print(f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}")
                 # artifacts_path = self.download_models(self.vlm_options.repo_id)
                 artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
+                    self.vlm_options.repo_id, progress=True,
                 )
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
+            print(f"downloaded model: {artifacts_path}")
+
             self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
 
             ## Load the model
+            print("start loading model ...")
             self.vlm_model, self.processor = load(artifacts_path)
+            print("loaded model ...")
             self.config = load_config(artifacts_path)
 
     """
@@ -110,6 +117,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     )
 
                     start_time = time.time()
+                    print("start generating ...")
+
                     # Call model to generate:
                     output = ""
                     for token in self.stream_generate(
@@ -120,6 +129,7 @@ class HuggingFaceMlxModel(BasePageModel):
                         max_tokens=4096,
                         verbose=False,
                     ):
+                        print(token.text, end="", flush=True)
                         output += token.text
                         if "</doctag>" in token.text:
                             break
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 5ab971c3..8d81d4d6 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -49,6 +49,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
 vlm_conversion_options = pixtral_vlm_conversion_options
 """
 
+"""
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="OCR this image and export it in MarkDown.",
@@ -56,6 +57,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
 vlm_conversion_options = pixtral_vlm_conversion_options
+"""
 
 """
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -68,15 +70,13 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 vlm_conversion_options = phi_vlm_conversion_options
 """
 
-"""
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
 )
 vlm_conversion_options = pixtral_vlm_conversion_options
-"""
 
 """
 qwen_vlm_conversion_options = HuggingFaceVlmOptions(
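
Note for reviewers (not part of the patch): the MLX code path that the first hunk instruments reduces to the standalone sketch below. It assumes the `mlx-vlm` package that `HuggingFaceMlxModel` wraps (`load`, `load_config`, `apply_chat_template`, `stream_generate`); the image path `page.png` is a placeholder.

```python
# Minimal sketch of the streaming generation loop, assuming mlx-vlm is installed.
from mlx_vlm import apply_chat_template, load, stream_generate
from mlx_vlm.utils import load_config

model_path = "mlx-community/pixtral-12b-bf16"
model, processor = load(model_path)  # same call the patched __init__ logs around
config = load_config(model_path)

# Build the chat prompt for a single input image.
prompt = apply_chat_template(
    processor, config, "Convert this page to markdown.", num_images=1
)

# Stream tokens and accumulate the text, as HuggingFaceMlxModel.__call__ does.
output = ""
for token in stream_generate(
    model,
    processor,
    prompt,
    ["page.png"],  # placeholder image path
    max_tokens=4096,
    verbose=False,
):
    output += token.text
    if "</doctag>" in token.text:  # early stop used for doctags-style output
        break

print(output)
```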
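Note for reviewers (not part of the patch): to exercise the MLX pixtral options that docs/examples/minimal_vlm_pipeline.py now enables, a minimal end-to-end driver could look like the sketch below. It assumes the docling VLM-pipeline API this example builds on (`VlmPipeline`, `VlmPipelineOptions`, `DocumentConverter`, `PdfFormatOption`); `input.pdf` is a placeholder.

```python
# Minimal driver sketch, assuming the docling VLM pipeline API of this branch.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Same options the example now enables (pixtral-12b via MLX).
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
)

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = pixtral_vlm_conversion_options

# Route PDF conversion through the VLM pipeline with these options.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("input.pdf")  # placeholder input document
print(result.document.export_to_markdown())
```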