From f159075b676a2366041ec187657a433e6d0b52ff Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Wed, 14 May 2025 07:39:20 +0200
Subject: [PATCH] pixtral 12b runs via MLX and native transformers

Signed-off-by: Peter Staar
---
 docling/models/hf_vlm_models/hf_vlm_mlx_model.py | 12 +++++++++++-
 docs/examples/minimal_vlm_pipeline.py            |  6 +++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
index 4dc90bf7..c6137b18 100644
--- a/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
+++ b/docling/models/hf_vlm_models/hf_vlm_mlx_model.py
@@ -40,22 +40,29 @@ class HuggingFaceMlxModel(BasePageModel):
                 )
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
+            print(f"model init: {repo_cache_folder}")
+
             self.apply_chat_template = apply_chat_template
             self.stream_generate = stream_generate
             # PARAMETERS:
             if artifacts_path is None:
+                print(f"before HuggingFaceVlmModel.download_models: {self.vlm_options.repo_id}")
                 # artifacts_path = self.download_models(self.vlm_options.repo_id)
                 artifacts_path = HuggingFaceVlmModel.download_models(
-                    self.vlm_options.repo_id
+                    self.vlm_options.repo_id, progress=True,
                 )
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
 
+            print(f"downloaded model: {artifacts_path}")
+
             self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
 
             ## Load the model
+            print("start loading model ...")
             self.vlm_model, self.processor = load(artifacts_path)
+            print("loaded model ...")
             self.config = load_config(artifacts_path)
 
     """
@@ -110,6 +117,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     )
 
                     start_time = time.time()
+                    print("start generating ...")
+
                     # Call model to generate:
                     output = ""
                     for token in self.stream_generate(
@@ -120,6 +129,7 @@ class HuggingFaceMlxModel(BasePageModel):
                         max_tokens=4096,
                         verbose=False,
                     ):
+                        print(token.text, end="", flush=True)
                         output += token.text
                         if "</doctag>" in token.text:
                             break
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 5ab971c3..8d81d4d6 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -49,6 +49,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
 vlm_conversion_options = pixtral_vlm_conversion_options
 """
 
+"""
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
     prompt="OCR this image and export it in MarkDown.",
@@ -56,6 +57,7 @@ pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
 vlm_conversion_options = pixtral_vlm_conversion_options
+"""
 
 """
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -68,15 +70,13 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 vlm_conversion_options = phi_vlm_conversion_options
 """
 
-"""
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
 )
 vlm_conversion_options = pixtral_vlm_conversion_options
-"""
 
 """
 qwen_vlm_conversion_options = HuggingFaceVlmOptions(
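
Note for reviewers (not part of the patch): the MLX code path that the first hunk instruments reduces to the standalone sketch below. It assumes the `mlx-vlm` package that `HuggingFaceMlxModel` wraps (`load`, `load_config`, `apply_chat_template`, `stream_generate`); the image path `page.png` is a placeholder.

```python
# Minimal sketch of the streaming generation loop, assuming mlx-vlm is installed.
from mlx_vlm import apply_chat_template, load, stream_generate
from mlx_vlm.utils import load_config

model_path = "mlx-community/pixtral-12b-bf16"
model, processor = load(model_path)  # same call the patched __init__ logs around
config = load_config(model_path)

# Build the chat prompt for a single input image.
prompt = apply_chat_template(
    processor, config, "Convert this page to markdown.", num_images=1
)

# Stream tokens and accumulate the text, as HuggingFaceMlxModel.__call__ does.
output = ""
for token in stream_generate(
    model,
    processor,
    prompt,
    ["page.png"],  # placeholder image path
    max_tokens=4096,
    verbose=False,
):
    output += token.text
    if "</doctag>" in token.text:  # early stop used for doctags-style output
        break

print(output)
```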
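Note for reviewers (not part of the patch): to exercise the MLX pixtral options that docs/examples/minimal_vlm_pipeline.py now enables, a minimal end-to-end driver could look like the sketch below. It assumes the docling VLM-pipeline API this example builds on (`VlmPipeline`, `VlmPipelineOptions`, `DocumentConverter`, `PdfFormatOption`); `input.pdf` is a placeholder.

```python
# Minimal driver sketch, assuming the docling VLM pipeline API of this branch.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Same options the example now enables (pixtral-12b via MLX).
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
)

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = pixtral_vlm_conversion_options

# Route PDF conversion through the VLM pipeline with these options.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("input.pdf")  # placeholder input document
print(result.document.export_to_markdown())
```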