From e2c95d09bc513067ad1839666585eebda44258d1 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 15 May 2025 07:32:55 +0200
Subject: [PATCH] need to get Phi4 working again ...

Signed-off-by: Peter Staar
---
 .../hf_vlm_model_AutoModelForCausalLM.py      | 41 ++++++++++---------
 docs/examples/minimal_vlm_pipeline.py         |  8 ++--
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 8b4022d3..504b2a2e 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -73,21 +73,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 artifacts_path,
                 trust_remote_code=self.trust_remote_code,
             )
-            if not self.param_quantized:
-                self.vlm_model = AutoModelForCausalLM.from_pretrained(
-                    artifacts_path,
-                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
-                    _attn_implementation=(
-                        "flash_attention_2"
-                        if self.device.startswith("cuda")
-                        and accelerator_options.cuda_use_flash_attention2
-                        else "eager"
-                    ),
-                    trust_remote_code=self.trust_remote_code,
-                ).to(self.device)
-
-            else:
+            if self.param_quantized:
+                print("using quantized")
                 self.vlm_model = AutoModelForCausalLM.from_pretrained(
                     artifacts_path,
                     device_map=self.device,
@@ -100,7 +87,21 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         else "eager"
                     ),
                     trust_remote_code=self.trust_remote_code,
-                ).to(self.device)
+                )  # .to(self.device)
+            else:
+                print("using original")
+                self.vlm_model = AutoModelForCausalLM.from_pretrained(
+                    artifacts_path,
+                    device_map=self.device,
+                    torch_dtype="auto",  # torch.bfloat16,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                    trust_remote_code=self.trust_remote_code,
+                )  # .to(self.device)
 
         model_path = artifacts_path
 
@@ -118,7 +119,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             with TimeRecorder(conv_res, "vlm"):
                 assert page.size is not None
 
-                hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
+                # hi_res_image.show()
 
                 if hi_res_image is not None:
                     im_width, im_height = hi_res_image.size
@@ -129,6 +131,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
                 # Define prompt structure
                 prompt = self.formulate_prompt()
+                print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
                 inputs = self.processor(
                     text=prompt, images=hi_res_image, return_tensors="pt"
@@ -138,8 +141,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 start_time = time.time()
                 generate_ids = self.vlm_model.generate(
                     **inputs,
-                    max_new_tokens=self.max_new_tokens,
-                    use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                    max_new_tokens=4096,  # self.max_new_tokens,
+                    # use_cache=self.use_cache,  # Enables KV caching which can improve performance
                     generation_config=self.generation_config,
                     num_logits_to_keep=1,
                 )
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index c2112be4..9c04d561 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -6,13 +6,11 @@ from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
+from docling.datamodel.pipeline_model_specializations import (
     HuggingFaceVlmOptions,
     InferenceFramework,
     ResponseFormat,
-    VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
-    granite_vision_vlm_mlx_conversion_options,
     granite_vision_vlm_ollama_conversion_options,
     phi_vlm_conversion_options,
     pixtral_12b_vlm_conversion_options,
@@ -21,6 +19,9 @@ from docling.datamodel.pipeline_options import (
     smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
+from docling.datamodel.pipeline_options import (
+    VlmPipelineOptions,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -49,6 +50,7 @@ pipeline_options.generate_page_images = True
 
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 pipeline_options.vlm_options = phi_vlm_conversion_options
+# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
 
 """
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
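
Reviewer note: the sketch below exercises the two patched loading branches outside the
pipeline. It is a minimal sketch, not the pipeline code itself: the checkpoint id, the
8-bit BitsAndBytesConfig (the patch's actual quantization arguments fall between the
hunks and are not shown), and the chat markup in `prompt` are all assumptions. The parts
that are grounded in the patch are the branch structure, `torch_dtype="auto"`, the dropped
trailing `.to(self.device)` (with `device_map=` the weights are already placed, and a
bitsandbytes-quantized model cannot be moved with `.to()`), and the generate kwargs.

    # Standalone sketch of the patched loading branches. Assumed names are
    # marked; everything else mirrors the diff above.
    import torch
    from PIL import Image
    from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig

    model_id = "microsoft/Phi-4-multimodal-instruct"  # assumed checkpoint id
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    param_quantized = False  # mirrors self.param_quantized in the patch

    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    if param_quantized:
        # Quantized branch: device_map places the weights; a bitsandbytes model
        # must not be moved with .to() afterwards, hence the patch comments it out.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # assumed config
            trust_remote_code=True,
        )
    else:
        # Original branch: torch_dtype="auto" takes the dtype stored in the
        # checkpoint instead of forcing bfloat16, as the patch changes it.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype="auto",
            _attn_implementation="eager",  # "flash_attention_2" only on CUDA, as in the patch
            trust_remote_code=True,
        )

    # Generation mirrors the patched call: fixed 4096-token budget, use_cache
    # left at the model default, num_logits_to_keep=1 as in the diff.
    image = Image.new("RGB", (1024, 1324), "white")  # stand-in for a page image
    prompt = "<|user|>\n<|image_1|>\nConvert this page.<|end|>\n<|assistant|>\n"  # assumed markup
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        num_logits_to_keep=1,
    )
    text = processor.batch_decode(
        generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )[0]
    print(text)

If the checkpoint id or prompt markup differs, only those two strings need to change;
the branch structure and the dropped `.to()` are the parts this patch actually exercises.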