From e2c95d09bc513067ad1839666585eebda44258d1 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 15 May 2025 07:32:55 +0200
Subject: [PATCH] need to get Phi4 working again ...

Signed-off-by: Peter Staar
---
 .../hf_vlm_model_AutoModelForCausalLM.py      | 41 ++++++++++---------
 docs/examples/minimal_vlm_pipeline.py         |  8 ++--
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 8b4022d3..504b2a2e 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -73,21 +73,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 artifacts_path,
                 trust_remote_code=self.trust_remote_code,
             )
-            if not self.param_quantized:
-                self.vlm_model = AutoModelForCausalLM.from_pretrained(
-                    artifacts_path,
-                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
-                    _attn_implementation=(
-                        "flash_attention_2"
-                        if self.device.startswith("cuda")
-                        and accelerator_options.cuda_use_flash_attention2
-                        else "eager"
-                    ),
-                    trust_remote_code=self.trust_remote_code,
-                ).to(self.device)
-
-            else:
+            if self.param_quantized:
+                print("using quantized")
                 self.vlm_model = AutoModelForCausalLM.from_pretrained(
                     artifacts_path,
                     device_map=self.device,
@@ -100,7 +87,21 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         else "eager"
                     ),
                     trust_remote_code=self.trust_remote_code,
-                ).to(self.device)
+                )  # .to(self.device)
+            else:
+                print("using original")
+                self.vlm_model = AutoModelForCausalLM.from_pretrained(
+                    artifacts_path,
+                    device_map=self.device,
+                    torch_dtype="auto",  # torch.bfloat16,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                    trust_remote_code=self.trust_remote_code,
+                )  # .to(self.device)
 
         model_path = artifacts_path
 
@@ -118,7 +119,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
             with TimeRecorder(conv_res, "vlm"):
                 assert page.size is not None
 
-                hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
+                # hi_res_image.show()
 
                 if hi_res_image is not None:
                     im_width, im_height = hi_res_image.size
@@ -129,6 +131,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
                 # Define prompt structure
                 prompt = self.formulate_prompt()
+                print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
 
                 inputs = self.processor(
                     text=prompt, images=hi_res_image, return_tensors="pt"
@@ -138,8 +141,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                 start_time = time.time()
                 generate_ids = self.vlm_model.generate(
                     **inputs,
-                    max_new_tokens=self.max_new_tokens,
-                    use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                    max_new_tokens=4096,  # self.max_new_tokens,
+                    # use_cache=self.use_cache,  # Enables KV caching which can improve performance
                     generation_config=self.generation_config,
                     num_logits_to_keep=1,
                 )
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index c2112be4..9c04d561 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -6,13 +6,11 @@ from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
+from docling.datamodel.pipeline_model_specializations import (
     HuggingFaceVlmOptions,
     InferenceFramework,
     ResponseFormat,
-    VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
-    granite_vision_vlm_mlx_conversion_options,
     granite_vision_vlm_ollama_conversion_options,
     phi_vlm_conversion_options,
     pixtral_12b_vlm_conversion_options,
@@ -21,6 +19,9 @@ from docling.datamodel.pipeline_options import (
     smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
+from docling.datamodel.pipeline_options import (
+    VlmPipelineOptions,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -49,6 +50,7 @@ pipeline_options.generate_page_images = True
 
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 pipeline_options.vlm_options = phi_vlm_conversion_options
+# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
 
 """
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
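
Reviewer note: the sketch below exercises the two patched loading branches outside the
pipeline. It is a minimal sketch, not the pipeline code itself: the checkpoint id, the
8-bit BitsAndBytesConfig (the patch's actual quantization arguments fall between the
hunks and are not shown), and the chat markup in `prompt` are all assumptions. The parts
that are grounded in the patch are the branch structure, `torch_dtype="auto"`, the dropped
trailing `.to(self.device)` (with `device_map=` the weights are already placed, and a
bitsandbytes-quantized model cannot be moved with `.to()`), and the generate kwargs.

    # Standalone sketch of the patched loading branches. Assumed names are
    # marked; everything else mirrors the diff above.
    import torch
    from PIL import Image
    from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig

    model_id = "microsoft/Phi-4-multimodal-instruct"  # assumed checkpoint id
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    param_quantized = False  # mirrors self.param_quantized in the patch

    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    if param_quantized:
        # Quantized branch: device_map places the weights; a bitsandbytes model
        # must not be moved with .to() afterwards, hence the patch comments it out.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # assumed config
            trust_remote_code=True,
        )
    else:
        # Original branch: torch_dtype="auto" takes the dtype stored in the
        # checkpoint instead of forcing bfloat16, as the patch changes it.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype="auto",
            _attn_implementation="eager",  # "flash_attention_2" only on CUDA, as in the patch
            trust_remote_code=True,
        )

    # Generation mirrors the patched call: fixed 4096-token budget, use_cache
    # left at the model default, num_logits_to_keep=1 as in the diff.
    image = Image.new("RGB", (1024, 1324), "white")  # stand-in for a page image
    prompt = "<|user|>\n<|image_1|>\nConvert this page.<|end|>\n<|assistant|>\n"  # assumed markup
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        num_logits_to_keep=1,
    )
    text = processor.batch_decode(
        generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )[0]
    print(text)

If the checkpoint id or prompt markup differs, only those two strings need to change;
the branch structure and the dropped `.to()` are the parts this patch actually exercises.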