fixed the pipeline for Phi4

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Peter Staar 2025-05-16 15:55:49 +02:00
parent d41b856961
commit 661f7c9780
4 changed files with 26 additions and 15 deletions


@@ -44,8 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
     scale: float = 2.0
+    temperature: float = 0.0
+    stop_strings: list[str] = []
     use_kv_cache: bool = True
     max_new_tokens: int = 4096

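The two new fields add decoding controls to HuggingFaceVlmOptions. A minimal sketch of overriding them on the Phi preset used further down; the import path and the "<|end|>" stop marker are assumptions, not part of this commit:

    from docling.datamodel.pipeline_options import phi_vlm_conversion_options

    # Start from the Phi preset and override the two new decoding fields.
    opts = phi_vlm_conversion_options.model_copy(
        update={
            "temperature": 0.0,           # keep generation deterministic
            "stop_strings": ["<|end|>"],  # hypothetical end-of-answer marker
        }
    )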

@@ -75,7 +75,8 @@ class HuggingFaceMlxModel(BasePageModel):
         assert page.size is not None
         hi_res_image = page.get_image(scale=self.vlm_options.scale)
+        hi_res_image.save("./scratch/page.png")
         if hi_res_image is not None:
             im_width, im_height = hi_res_image.size


@@ -43,12 +43,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         self.device = decide_device(accelerator_options.device)
-        if self.device == "mlx":
+        if self.device == "mps":
             _log.warning(
                 "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
             )
             self.device = "cpu"
+        print("device: ", self.device)
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
@@ -120,31 +122,34 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         assert page.size is not None
         hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-        # hi_res_image.show()
+        print(hi_res_image)
         if hi_res_image is not None:
             im_width, im_height = hi_res_image.size
+        """
         if hi_res_image:
             if hi_res_image.mode != "RGB":
                 hi_res_image = hi_res_image.convert("RGB")
+        """
         # Define prompt structure
         prompt = self.formulate_prompt()
         print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
         inputs = self.processor(
             text=prompt, images=hi_res_image, return_tensors="pt"
-        ).to(self.device)
+        )  # .to(self.device)
         # Generate response
         start_time = time.time()
         generate_ids = self.vlm_model.generate(
             **inputs,
-            max_new_tokens=4096,  # self.max_new_tokens,
-            # use_cache=self.use_cache,  # Enables KV caching which can improve performance
+            max_new_tokens=self.max_new_tokens,
+            use_cache=self.use_cache,  # Enables KV caching which can improve performance
             generation_config=self.generation_config,
             num_logits_to_keep=1,
+            # temperature=0.0,
         )
         generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
@@ -157,10 +162,11 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             clean_up_tokenization_spaces=False,
         )[0]
-        _log.debug(
+        # _log.debug(
+        print(
             f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
         )
-        page.predictions.vlm_response = VlmPrediction(text=response)
+        page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
         yield page

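For context (not in the diff): the new temperature and stop_strings options could be wired into the same generate() call through standard transformers keyword arguments; stop_strings also requires the tokenizer. A sketch, assuming self.processor exposes a tokenizer as AutoProcessor does:

    generate_ids = self.vlm_model.generate(
        **inputs,
        max_new_tokens=self.max_new_tokens,
        use_cache=self.use_cache,
        generation_config=self.generation_config,
        temperature=self.vlm_options.temperature,      # 0.0 ~ greedy decoding
        do_sample=self.vlm_options.temperature > 0.0,  # only sample when temperature > 0
        stop_strings=self.vlm_options.stop_strings or None,
        tokenizer=self.processor.tokenizer,            # needed by transformers when stop_strings is set
    )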

@@ -187,11 +187,12 @@ if __name__ == "__main__":
     rows = []
     for vlm_options in [
         # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options, \
-        granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        qwen25_vl_3b_vlm_mlx_conversion_options, \
-        pixtral_12b_vlm_mlx_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options, \
+        # granite_vision_vlm_conversion_options, \
+        phi_vlm_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # pixtral_12b_vlm_mlx_conversion_options,
+        # pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
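
A short sketch of how the selected Phi preset feeds a VLM pipeline run, assuming the usual docling import paths; the input path is a placeholder:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        VlmPipelineOptions,
        phi_vlm_conversion_options,  # assumed import path for the preset above
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    pipeline_options = VlmPipelineOptions()
    pipeline_options.vlm_options = phi_vlm_conversion_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("page.pdf")  # placeholder input file
    print(result.document.export_to_markdown())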