From 661f7c978073dc596cc4c4edfd4a3b40746d4142 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Fri, 16 May 2025 15:55:49 +0200
Subject: [PATCH] fixed the pipeline for Phi4

Signed-off-by: Peter Staar
---
 .../pipeline_model_specializations.py              |  5 ++++-
 .../models/hf_vlm_models/hf_vlm_mlx_model.py       |  3 ++-
 .../hf_vlm_model_AutoModelForCausalLM.py           | 22 ++++++++++++-------
 docs/examples/minimal_vlm_pipeline.py              | 11 +++++-----
 4 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 85aef998..9a80fae4 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -44,8 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
 
-    scale: float = 2.0
+    scale: float = 2.0
+    temperature: float = 0.0
+    stop_strings: list[str] = []
 
+    use_kv_cache: bool = True
     max_new_tokens: int = 4096
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 73144404..0d7b63f9 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -75,7 +75,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
-
+                    hi_res_image.save("./scratch/page.png")
+
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 504b2a2e..449764cb 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -43,12 +43,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
             self.device = decide_device(accelerator_options.device)
 
-            if self.device == "mlx":
+            if self.device == "mps":
                 _log.warning(
                     "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
                 )
                 self.device = "cpu"
 
+            print("device: ", self.device)
+
             self.use_cache = vlm_options.use_kv_cache
             self.max_new_tokens = vlm_options.max_new_tokens
 
@@ -120,31 +122,34 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-                    # hi_res_image.show()
+                    print(hi_res_image)
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
 
+                    """
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-
+                    """
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
                     print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
                     inputs = self.processor(
                         text=prompt, images=hi_res_image, return_tensors="pt"
-                    ).to(self.device)
+                    )  # .to(self.device)
 
                     # Generate response
                     start_time = time.time()
                     generate_ids = self.vlm_model.generate(
                         **inputs,
-                        max_new_tokens=4096,  # self.max_new_tokens,
-                        # use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                        max_new_tokens=self.max_new_tokens,
+                        use_cache=self.use_cache,  # Enables KV caching which can improve performance
                         generation_config=self.generation_config,
                         num_logits_to_keep=1,
+                        # temperature=0.0,
                     )
 
                     generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
@@ -157,10 +162,11 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         clean_up_tokenization_spaces=False,
                     )[0]
 
-                    _log.debug(
+                    # _log.debug(
+                    print(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
 
-                    page.predictions.vlm_response = VlmPrediction(text=response)
+                    page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
 
                 yield page
 
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 1310637d..e240f6ce 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -187,11 +187,12 @@ if __name__ == "__main__":
     rows = []
     for vlm_options in [
         # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options, \
-        granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        qwen25_vl_3b_vlm_mlx_conversion_options, \
-        pixtral_12b_vlm_mlx_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options, \
+        # granite_vision_vlm_conversion_options, \
+        phi_vlm_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # pixtral_12b_vlm_mlx_conversion_options,
+        # pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options