Fix the pipeline for Phi4

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-05-16 15:55:49 +02:00 · 2025-05-16 15:55:49 +02:00 · 661f7c9780
commit 661f7c9780
parent d41b856961
4 changed files with 26 additions and 15 deletions
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@ -44,8 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
    inference_framework: InferenceFramework
    response_format: ResponseFormat

-    scale: float = 2.0
+    scale: float = 2.0 

+    temperature: float = 0.0
+    stop_strings: list[str] = []
+    
    use_kv_cache: bool = True
    max_new_tokens: int = 4096

--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@ -75,7 +75,8 @@ class HuggingFaceMlxModel(BasePageModel):
                    assert page.size is not None

                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
-
+                    hi_res_image.save("./scratch/page.png")
+                    
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@ -43,12 +43,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):

            self.device = decide_device(accelerator_options.device)

-            if self.device == "mlx":
+            if self.device == "mps":
                _log.warning(
                    "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
                )
                self.device = "cpu"

+            print("device: ", self.device)
+                
            self.use_cache = vlm_options.use_kv_cache
            self.max_new_tokens = vlm_options.max_new_tokens

@ -120,31 +122,34 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                    assert page.size is not None

                    hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-                    # hi_res_image.show()
+                    print(hi_res_image)

                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

+                    """
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")
-
+                    """
+                    
                    # Define prompt structure
                    prompt = self.formulate_prompt()
                    print(f"prompt: '{prompt}', size: {im_width}, {im_height}")

                    inputs = self.processor(
                        text=prompt, images=hi_res_image, return_tensors="pt"
-                    ).to(self.device)
+                    ) #.to(self.device)

                    # Generate response
                    start_time = time.time()
                    generate_ids = self.vlm_model.generate(
                        **inputs,
-                        max_new_tokens=4096,  # self.max_new_tokens,
-                        # use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                        max_new_tokens=self.max_new_tokens,
+                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
                        generation_config=self.generation_config,
                        num_logits_to_keep=1,
+                        # temperature=0.0,
                    )
                    generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]

@ -157,10 +162,11 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                        clean_up_tokenization_spaces=False,
                    )[0]

-                    _log.debug(
+                    #_log.debug(
+                    print(
                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                    )
-                    page.predictions.vlm_response = VlmPrediction(text=response)
+                    page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)

                yield page

--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@ -187,11 +187,12 @@ if __name__ == "__main__":
    rows = []
    for vlm_options in [
            # smoldocling_vlm_conversion_options, \
-            smoldocling_vlm_mlx_conversion_options, \
-            granite_vision_vlm_conversion_options, \
-            # phi_vlm_conversion_options, \
-            qwen25_vl_3b_vlm_mlx_conversion_options, \
-            pixtral_12b_vlm_mlx_conversion_options,
+            # smoldocling_vlm_mlx_conversion_options, \
+            # granite_vision_vlm_conversion_options, \
+            phi_vlm_conversion_options, \
+            # qwen25_vl_3b_vlm_mlx_conversion_options, \
+            # pixtral_12b_vlm_mlx_conversion_options,
+            # pixtral_12b_vlm_conversion_options,
    ]:
        pipeline_options.vlm_options = vlm_options