Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-26 20:14:47 +00:00)
fixed the pipeline for Phi4
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
commit 661f7c9780
parent d41b856961
@@ -44,8 +44,11 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     response_format: ResponseFormat
 
     scale: float = 2.0
 
+    temperature: float = 0.0
+    stop_strings: list[str] = []
+
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
 
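For context, a minimal sketch of how the two new fields might be set when configuring a Phi-4 run; the import path, enum members, and repo_id/prompt values are assumptions for illustration, not taken from this commit.

# Hedged sketch only: field names follow the hunk above, everything else
# (import path, enum members, checkpoint id, prompt) is assumed.
from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
)

phi4_options = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",  # assumed checkpoint id
    prompt="Convert this page to markdown.",  # illustrative prompt
    inference_framework=InferenceFramework.TRANSFORMERS,  # assumed member
    response_format=ResponseFormat.MARKDOWN,
    temperature=0.0,  # new field: 0.0 keeps decoding greedy
    stop_strings=[],  # new field: strings that terminate generation
    use_kv_cache=True,
    max_new_tokens=4096,
)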
@@ -75,7 +75,8 @@ class HuggingFaceMlxModel(BasePageModel):
         assert page.size is not None
 
         hi_res_image = page.get_image(scale=self.vlm_options.scale)
+        hi_res_image.save("./scratch/page.png")
 
         if hi_res_image is not None:
             im_width, im_height = hi_res_image.size
 
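The added save call assumes a ./scratch directory already exists and runs before the None check below it; a guarded variant of the same debug dump, as a sketch:

from pathlib import Path

# Sketch: create the dump directory and write only when a page image exists.
debug_dir = Path("./scratch")
debug_dir.mkdir(parents=True, exist_ok=True)
if hi_res_image is not None:
    hi_res_image.save(debug_dir / "page.png")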
@@ -43,12 +43,14 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
         self.device = decide_device(accelerator_options.device)
 
-        if self.device == "mlx":
+        if self.device == "mps":
             _log.warning(
                 "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
             )
             self.device = "cpu"
 
+        print("device: ", self.device)
+
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
 
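Note the branch now matches "mps" while the warning text still says "mlx". decide_device itself is outside this diff; a sketch of a chooser consistent with the fallback, assuming a torch-based probe (not necessarily docling's implementation):

import torch

def decide_device(preference: str) -> str:
    # Sketch: honour an explicit preference, otherwise probe CUDA, then MPS,
    # then fall back to CPU. The AutoModelForCausalLM path above additionally
    # downgrades "mps" to "cpu" because it has no MPS support here.
    if preference and preference != "auto":
        return preference
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"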
@@ -120,31 +122,34 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         assert page.size is not None
 
         hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-        # hi_res_image.show()
+        print(hi_res_image)
 
         if hi_res_image is not None:
             im_width, im_height = hi_res_image.size
 
+        """
         if hi_res_image:
             if hi_res_image.mode != "RGB":
                 hi_res_image = hi_res_image.convert("RGB")
+        """
 
         # Define prompt structure
         prompt = self.formulate_prompt()
         print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
         inputs = self.processor(
             text=prompt, images=hi_res_image, return_tensors="pt"
-        ).to(self.device)
+        )  # .to(self.device)
 
         # Generate response
         start_time = time.time()
         generate_ids = self.vlm_model.generate(
             **inputs,
-            max_new_tokens=4096,  # self.max_new_tokens,
-            # use_cache=self.use_cache,  # Enables KV caching which can improve performance
+            max_new_tokens=self.max_new_tokens,
+            use_cache=self.use_cache,  # Enables KV caching which can improve performance
             generation_config=self.generation_config,
             num_logits_to_keep=1,
+            # temperature=0.0,
         )
         generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
 
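This hunk re-enables use_cache and max_new_tokens from the options but leaves temperature commented out, and it also drops the .to(self.device) move on the processor output. One way the new fields could feed transformers' generation settings, sketched under the assumption of a transformers version that supports stop_strings (>= 4.39):

from transformers import GenerationConfig

# Sketch: map the new HuggingFaceVlmOptions fields onto generation settings.
gen_kwargs = dict(
    max_new_tokens=vlm_options.max_new_tokens,
    use_cache=vlm_options.use_kv_cache,
    do_sample=vlm_options.temperature > 0.0,  # greedy decoding at 0.0
)
if vlm_options.temperature > 0.0:
    gen_kwargs["temperature"] = vlm_options.temperature
if vlm_options.stop_strings:
    # stop_strings also requires passing tokenizer= to generate()
    gen_kwargs["stop_strings"] = vlm_options.stop_strings
generation_config = GenerationConfig(**gen_kwargs)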
@@ -157,10 +162,11 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             clean_up_tokenization_spaces=False,
         )[0]
 
-        _log.debug(
+        # _log.debug(
+        print(
             f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
         )
-        page.predictions.vlm_response = VlmPrediction(text=response)
+        page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
 
         yield page
 
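For reference, the bookkeeping that yields num_tokens and generation_time, reconstructed as a sketch from the surrounding hunks as it would sit inside the model's page loop (exact placement is assumed):

import time

start_time = time.time()
generate_ids = self.vlm_model.generate(**inputs, max_new_tokens=self.max_new_tokens)
generation_time = time.time() - start_time

# Keep only the newly generated tokens, then decode them into the response.
generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
num_tokens = generate_ids.shape[1]
response = self.processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]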
@@ -187,11 +187,12 @@ if __name__ == "__main__":
     rows = []
     for vlm_options in [
         # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options, \
-        granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        qwen25_vl_3b_vlm_mlx_conversion_options, \
-        pixtral_12b_vlm_mlx_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options, \
+        # granite_vision_vlm_conversion_options, \
+        phi_vlm_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # pixtral_12b_vlm_mlx_conversion_options,
+        # pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
 
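A minimal sketch of how the selected options object drives a conversion in this test harness; the converter wiring follows docling's public API, and the input path is illustrative:

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options.vlm_options = phi_vlm_conversion_options
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("tests/data/pdf/sample.pdf")  # illustrative input
print(result.document.export_to_markdown())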