got microsoft/Phi-4-multimodal-instruct to work

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Author: Peter Staar
Date: 2025-05-12 13:37:03 +02:00
Commit: 77eb21b235
Parent: 68747e3cad


```diff
@@ -137,6 +137,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             # hi_res_image = page.get_image(scale=2.0)  # 144dpi
             hi_res_image = page.get_image(scale=1.0)  # 72dpi
+            hi_res_image.show()
+
             if hi_res_image is not None:
                 im_width, im_height = hi_res_image.size
```
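The `scale` argument multiplies the 72 dpi base render, so the commented-out `scale=2.0` corresponds to 144 dpi. The newly added `hi_res_image.show()` pops the rendered page open in an external viewer, which is useful for checking what the model actually sees but will interrupt headless or batch runs. A minimal sketch of one way to gate it (the environment variable is a hypothetical name, not part of this commit):

```python
import os

# Hypothetical debug flag (not in the commit): keep batch runs headless
# while still allowing ad-hoc visual inspection of the rendered page.
if os.getenv("DOCLING_DEBUG_SHOW_IMAGES"):
    hi_res_image.show()
```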
```diff
@@ -195,7 +197,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             # Part 1: Image Processing
             print("\n--- IMAGE PROCESSING ---")
             # image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
-            prompt = f'{user_prompt}<|image_1|>OCR this image into MarkDown?{prompt_suffix}{assistant_prompt}'
+            prompt = f'{user_prompt}<|image_1|>Convert this image into MarkDown and only return the bare MarkDown!{prompt_suffix}{assistant_prompt}'
             print(f'>>> Prompt\n{prompt}')
             inputs = self.processor(text=prompt, images=hi_res_image, return_tensors='pt').to(self.device)  # .to('cuda:0')
```
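The prompt string follows the chat format from the Phi-4-multimodal-instruct model card: a `<|user|>` turn containing the `<|image_1|>` placeholder, closed with `<|end|>` and followed by `<|assistant|>`; the processor replaces the placeholder with the image's embeddings when it builds the input tensors. A sketch of how the pieces combine, assuming the model card's token definitions for the three variables used above:

```python
# Special-token strings as given in the Phi-4-multimodal-instruct model
# card (assumed; in the class they are defined elsewhere).
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'

prompt = (
    f'{user_prompt}<|image_1|>'
    'Convert this image into MarkDown and only return the bare MarkDown!'
    f'{prompt_suffix}{assistant_prompt}'
)
# '<|user|><|image_1|>Convert this image into MarkDown and only return
#  the bare MarkDown!<|end|><|assistant|>'
```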
```diff
@@ -206,19 +208,20 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 **inputs,
                 max_new_tokens=128,
                 generation_config=self.generation_config,
+                num_logits_to_keep=1,
             )
             generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-            num_tokens = len(generated_ids[0])
+            num_tokens = len(generate_ids[0])
             response = self.processor.batch_decode(
                 generate_ids,
                 skip_special_tokens=True,
-                clean_up_tokenization_spaces=False
+                clean_up_tokenization_spaces=False,
             )[0]
             print(f'>>> Response\n{response}')
             _log.debug(
-                f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                f"Generated {num_tokens} tokens."
             )
             # inference_time = time.time() - start_time
```
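Three fixes land in this hunk: `num_logits_to_keep=1` is forwarded through `generate()` to the model, which restricts the prefill pass to the last position's logits and saves memory on long multimodal prompts; `num_tokens` now reads `generate_ids` instead of the undefined `generated_ids`, which previously raised a `NameError`; and the debug log drops `generation_time`, which is never computed in this version (the timing code is still commented out). A self-contained sketch of the decode path, with `model`, `processor`, and `generation_config` as illustrative stand-ins for the class attributes:

```python
# Sketch under the same assumptions as the diff; `model`, `processor`,
# `inputs`, and `generation_config` stand in for the class attributes.
generate_ids = model.generate(
    **inputs,
    max_new_tokens=128,
    generation_config=generation_config,
    num_logits_to_keep=1,  # only the final position's prefill logits are kept
)

# generate() returns prompt + completion; slice off the prompt tokens so
# only the newly generated tokens are counted and decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
num_tokens = len(generate_ids[0])

response = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
```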