diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 9a80fae4..77e6c2f2 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -84,6 +84,7 @@ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
+    temperature=0.0,
 )
 
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -92,6 +93,7 @@ smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
     scale=2.0,
+    temperature=0.0,
 )
 
 # GraniteVision
@@ -101,6 +103,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
     scale=2.0,
+    temperature=0.0,
 )
 
 granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
@@ -110,6 +113,7 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
     scale=1.0,
     timeout=120,
     response_format=ResponseFormat.MARKDOWN,
+    temperature=0.0,
 )
 
 # Pixtral
@@ -119,6 +123,7 @@ pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
     scale=2.0,
+    temperature=0.0,
 )
 
 pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
@@ -127,6 +132,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
+    temperature=0.0,
 )
 
 # Phi4
@@ -135,6 +141,8 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
+    scale=2.0,
+    temperature=0.0,
 )
 
 # Qwen
@@ -143,4 +151,6 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
+    scale=2.0,
+    temperature=0.0,
 )
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index 20a1c7dd..73e6f313 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -6,6 +6,17 @@ _log = logging.getLogger(__name__)
 
 
 class HuggingFaceVlmModel:
+
+    @staticmethod
+    def map_device_to_cpu_if_mlx(device: str) -> str:
+        if device == "mps":
+            _log.warning(
+                "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
+            )
+            return "cpu"
+
+        return device
+
     @staticmethod
     def download_models(
         repo_id: str,
diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 0d7b63f9..57abaa7e 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -29,7 +29,8 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         self.max_tokens = vlm_options.max_new_tokens
-
+        self.temperature = vlm_options.temperature
+
         if self.enabled:
             try:
                 from mlx_vlm import generate, load  # type: ignore
@@ -103,8 +104,9 @@ class HuggingFaceMlxModel(BasePageModel):
                         self.processor,
                         prompt,
                         [hi_res_image],
-                        max_tokens=4096,
+                        max_tokens=self.max_tokens,
                         verbose=False,
+                        temp=self.temperature,
                     ):
                         if len(token.logprobs.shape) == 1:
                             tokens.append(
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index 449764cb..213a5a28 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -42,19 +42,13 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-
-        if self.device == "mps":
-            _log.warning(
-                "Mapping mlx to cpu for AutoModelForCausalLM, use MLX framework!"
-            )
-            self.device = "cpu"
-
-        print("device: ", self.device)
-
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
+        _log.debug(f"Available device for VLM: {self.device}")
+
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
 
-        _log.debug(f"Available device for VLM: {self.device}")
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
         if artifacts_path is None:
@@ -126,12 +120,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
-
-                    """
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-                    """
 
                     # Define prompt structure
                     prompt = self.formulate_prompt()
@@ -147,9 +135,9 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         **inputs,
                         max_new_tokens=self.max_new_tokens,
                         use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                        temperature=self.temperature,
                         generation_config=self.generation_config,
                         num_logits_to_keep=1,
-                        # temperature=0.0,
                     )
                     generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
 
@@ -162,8 +150,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                         clean_up_tokenization_spaces=False,
                     )[0]
 
-                    #_log.debug(
-                    print(
+                    _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
                     page.predictions.vlm_response = VlmPrediction(text=response, generation_time=generation_time)
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index 6633c842..b0c74aa8 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -39,8 +39,14 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
+        _log.debug(f"Available device for HuggingFace VLM: {self.device}")
 
+        self.use_cache = vlm_options.use_kv_cache
+        self.max_new_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
+
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
         # PARAMETERS:
@@ -111,10 +117,12 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                     # populate page_tags with predicted doc tags
                     page_tags = ""
 
+                    """
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-
+                    """
+
                     # Define prompt structure
                     prompt = self.formulate_prompt()
 
@@ -126,7 +134,10 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                     start_time = time.time()
                     # Call model to generate:
                     generated_ids = self.vlm_model.generate(
-                        **inputs, max_new_tokens=4096, use_cache=True
+                        **inputs,
+                        max_new_tokens=self.max_new_tokens,
+                        use_cache=self.use_cache,
+                        temperature=self.temperature,
                     )
                     generation_time = time.time() - start_time
 
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
index 304f8c0f..1c286a8b 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_LlavaForConditionalGeneration.py
@@ -39,16 +39,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
         )
 
         self.device = decide_device(accelerator_options.device)
-
-        if self.device == "mlx":
-            _log.warning(
-                "Mapping mlx to cpu for LlavaForConditionalGeneration, use MLX framework!"
-            )
-            self.device = "cpu"
+        self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
 
         self.use_cache = vlm_options.use_kv_cache
         self.max_new_tokens = vlm_options.max_new_tokens
-
+        self.temperature = vlm_options.temperature
+
         _log.debug(f"Available device for VLM: {self.device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -93,10 +89,12 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
 
+                    """
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-
+                    """
+
                     images = [hi_res_image]
 
                     # Define prompt structure
@@ -112,9 +110,10 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
                         **inputs,
                         max_new_tokens=self.max_new_tokens,
                         use_cache=self.use_cache,  # Enables KV caching which can improve performance
+                        temperature=self.temperature,
                     )
 
-                    num_tokens = len(generate_ids[0])
+                    #num_tokens = len(generate_ids[0])
                     generation_time = time.time() - start_time
 
                     response = self.processor.batch_decode(
@@ -125,7 +124,7 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
 
                     page.predictions.vlm_response = VlmPrediction(
                         text=response,
-                        generated_tokens=num_tokens,
+                        #generated_tokens=num_tokens,
                         generation_time=generation_time,
                     )
 
@@ -134,7 +133,6 @@ class HuggingFaceVlmModel_LlavaForConditionalGeneration(BasePageModel):
     def formulate_prompt(self) -> str:
         """Formulate a prompt for the VLM."""
         if self.vlm_options.repo_id == "mistral-community/pixtral-12b":
-            # prompt = f"[INST]{self.vlm_options.prompt}\n[IMG][/INST]"
             chat = [
                 {
                     "role": "user",
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index e240f6ce..be2afe06 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -187,9 +187,9 @@ if __name__ == "__main__":
     rows = []
     for vlm_options in [
         # smoldocling_vlm_conversion_options, \
-        # smoldocling_vlm_mlx_conversion_options, \
+        smoldocling_vlm_mlx_conversion_options, \
         # granite_vision_vlm_conversion_options, \
-        phi_vlm_conversion_options, \
+        # phi_vlm_conversion_options, \
         # qwen25_vl_3b_vlm_mlx_conversion_options, \
         # pixtral_12b_vlm_mlx_conversion_options,
         # pixtral_12b_vlm_conversion_options,
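
Usage sketch (not part of the patch): the snippet below illustrates how one of the presets that now pins temperature=0.0 could be run through the VLM pipeline. It is a minimal sketch under two assumptions: the module path docling.datamodel.pipeline_model_specializations exists only on this branch, and "report.pdf" is a placeholder input.

# Minimal sketch, not the canonical example: assumes this branch's module layout;
# "report.pdf" is a placeholder document path.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_model_specializations import (
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Preset now carries temperature=0.0 (greedy decoding) and scale=2.0 for page images.
pipeline_options = VlmPipelineOptions(
    vlm_options=smoldocling_vlm_conversion_options,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("report.pdf")  # placeholder input
print(result.document.export_to_markdown())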