need to get Phi4 working again ...

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Peter Staar 2025-05-15 07:32:55 +02:00
parent 15a8f328c2
commit e2c95d09bc
2 changed files with 27 additions and 22 deletions
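
The second changed file is an example script that switches the VLM pipeline over to the Phi conversion options. A minimal sketch of how those pieces fit together, assuming docling's usual DocumentConverter wiring (the import locations and option names below come from the diff; the PdfFormatOption wiring and the input path are assumptions, not taken verbatim from this commit):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_model_specializations import phi_vlm_conversion_options
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Configure the VLM pipeline to run the Phi model as the page model.
pipeline_options = VlmPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.vlm_options = phi_vlm_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
# conv_res = converter.convert("report.pdf")  # hypothetical input document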


@@ -73,21 +73,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
artifacts_path,
trust_remote_code=self.trust_remote_code,
)
if not self.param_quantized:
self.vlm_model = AutoModelForCausalLM.from_pretrained(
artifacts_path,
device_map=self.device,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
trust_remote_code=self.trust_remote_code,
).to(self.device)
else:
if self.param_quantized:
print("using quantized")
self.vlm_model = AutoModelForCausalLM.from_pretrained(
artifacts_path,
device_map=self.device,
@@ -100,7 +87,21 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
else "eager"
),
trust_remote_code=self.trust_remote_code,
).to(self.device)
) # .to(self.device)
else:
print("using original")
self.vlm_model = AutoModelForCausalLM.from_pretrained(
artifacts_path,
device_map=self.device,
torch_dtype="auto", # torch.bfloat16,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
trust_remote_code=self.trust_remote_code,
) # .to(self.device)
model_path = artifacts_path
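
Read together, the two hunks above flip the branch order: the quantized path is now checked first, the non-quantized path uses torch_dtype="auto" instead of torch.bfloat16, and the trailing .to(self.device) calls are dropped in favour of device_map. Reassembled, the new loading logic reads roughly as follows (a sketch; the middle of the quantized branch falls outside the shown context, so its quantization arguments are elided):

if self.param_quantized:
    print("using quantized")
    self.vlm_model = AutoModelForCausalLM.from_pretrained(
        artifacts_path,
        device_map=self.device,
        # ... quantization-specific arguments not visible in this hunk ...
        _attn_implementation=(
            "flash_attention_2"
            if self.device.startswith("cuda")
            and accelerator_options.cuda_use_flash_attention2
            else "eager"
        ),
        trust_remote_code=self.trust_remote_code,
    )  # no explicit .to(self.device); device_map handles placement
else:
    print("using original")
    self.vlm_model = AutoModelForCausalLM.from_pretrained(
        artifacts_path,
        device_map=self.device,
        torch_dtype="auto",  # was torch.bfloat16 before this commit
        _attn_implementation=(
            "flash_attention_2"
            if self.device.startswith("cuda")
            and accelerator_options.cuda_use_flash_attention2
            else "eager"
        ),
        trust_remote_code=self.trust_remote_code,
    )  # no explicit .to(self.device)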
@@ -118,7 +119,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
hi_res_image = page.get_image(scale=2) # self.vlm_options.scale)
# hi_res_image.show()
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
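
The page image is now rendered at a hard-coded scale of 2 rather than self.vlm_options.scale, and the interactive hi_res_image.show() debug call is commented out. If the rendered page needs to be inspected without popping a viewer window, a hypothetical alternative is to dump it to disk (PIL images support save()):

hi_res_image = page.get_image(scale=2)  # fixed scale, bypassing self.vlm_options.scale
if hi_res_image is not None:
    im_width, im_height = hi_res_image.size
    hi_res_image.save("debug_page.png")  # debugging aid; not part of this commit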
@@ -129,6 +131,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
# Define prompt structure
prompt = self.formulate_prompt()
print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
inputs = self.processor(
text=prompt, images=hi_res_image, return_tensors="pt"
@@ -138,8 +141,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
start_time = time.time()
generate_ids = self.vlm_model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
use_cache=self.use_cache, # Enables KV caching which can improve performance
max_new_tokens=4096, # self.max_new_tokens,
# use_cache=self.use_cache, # Enables KV caching which can improve performance
generation_config=self.generation_config,
num_logits_to_keep=1,
)
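
max_new_tokens is pinned to 4096 and the use_cache argument is dropped while debugging. The hunk stops at the generate() call; a hedged sketch of the decode step that typically follows with a Hugging Face processor (the slicing and batch_decode below are assumptions based on standard transformers usage, not shown in this diff):

generate_ids = self.vlm_model.generate(
    **inputs,
    max_new_tokens=4096,  # hard-coded here instead of self.max_new_tokens
    generation_config=self.generation_config,
    num_logits_to_keep=1,
)
# Strip the prompt tokens so only newly generated text is decoded.
new_tokens = generate_ids[:, inputs["input_ids"].shape[1]:]
generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]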


@@ -6,13 +6,11 @@ from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
from docling.datamodel.pipeline_model_specializations import (
HuggingFaceVlmOptions,
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
granite_vision_vlm_mlx_conversion_options,
granite_vision_vlm_ollama_conversion_options,
phi_vlm_conversion_options,
pixtral_12b_vlm_conversion_options,
@@ -21,6 +19,9 @@ from docling.datamodel.pipeline_options import (
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
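
The model presets (phi_vlm_conversion_options, smoldocling_vlm_conversion_options, and friends) now live in docling.datamodel.pipeline_model_specializations, while VlmPipelineOptions stays in docling.datamodel.pipeline_options. A hedged compatibility shim for scripts written against the old layout, assuming only the location of the presets changed (as the diff suggests):

try:
    from docling.datamodel.pipeline_model_specializations import phi_vlm_conversion_options
except ImportError:
    # older layouts kept the presets in pipeline_options
    from docling.datamodel.pipeline_options import phi_vlm_conversion_options

from docling.datamodel.pipeline_options import VlmPipelineOptions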
@@ -49,6 +50,7 @@ pipeline_options.generate_page_images = True
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
pipeline_options.vlm_options = phi_vlm_conversion_options
# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
"""
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(