use AutoModelForVision2Seq for Pixtral and review example (including rename)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-16 08:38:14 +00:00 · 2025-06-01 16:30:58 +02:00
parent 0cb7520648
commit 9dbf08a084
4 changed files with 39 additions and 110 deletions
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -119,16 +119,16 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 # Pixtral
 pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
    scale=2.0,
    temperature=0.0,
 )
 pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -138,7 +138,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Phi4
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
-    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
    scale=2.0,
@@ -148,7 +148,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 # Qwen
 qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -158,7 +158,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Gemma-3
 gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -167,7 +167,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
 gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -116,7 +116,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                    assert page.size is not None
                    hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
                    print(hi_res_image)
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size
@@ -127,7 +126,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                    inputs = self.processor(
                        text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  # .to(self.device)
+                    ).to(self.device)
                    # Generate response
                    start_time = time.time()
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -40,7 +40,6 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
            self.device = decide_device(accelerator_options.device)
            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
            _log.debug(f"Available device for HuggingFace VLM: {self.device}")
            self.use_cache = vlm_options.use_kv_cache
@@ -73,7 +72,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
+                    # torch_dtype=torch.bfloat16,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -1,3 +1,9 @@
 # Compare VLM models
 # ==================
 #
 # This example runs the VLM pipeline with different vision-language models.
 # Their runtime as well as output quality is compared.
 import json
 import time
 from pathlib import Path
@@ -8,9 +14,6 @@ from tabulate import tabulate
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    gemma_3_12b_mlx_conversion_options,
    granite_vision_vlm_conversion_options,
    granite_vision_vlm_ollama_conversion_options,
@@ -27,96 +30,24 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False
 pipeline_options.generate_page_images = True
-## On GPU systems, enable flash_attention_2 with CUDA:
+def convert(sources: list[Path], converter: DocumentConverter):
-# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
+    framework = pipeline_options.vlm_options.inference_framework
 ## Pick a VLM model. We choose SmolDocling-256M by default
 # pipeline_options.vlm_options = smoldocling_vlm_conversion_options
 ## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
 # pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 pipeline_options.vlm_options = phi_vlm_conversion_options
 # pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
 """
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mistralai/Pixtral-12B-Base-2409",
     prompt="OCR this image and export it in MarkDown.",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
 pipeline_options.vlm_options = pixtral_vlm_conversion_options
 """
 """
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="OCR this image and export it in MarkDown.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
 )
 pipeline_options.vlm_options = pixtral_vlm_conversion_options
 """
 """
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image and export it in MarkDown.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
 )
 pipeline_options.vlm_options = phi_vlm_conversion_options
 """
 """
 pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=1.0,
 )
 pipeline_options.vlm_options = pixtral_vlm_conversion_options
 """
 """
 qwen_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
 )
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """
 def convert(sources: list[Path], converter):
    for source in sources:
        # start_time = time.time()
        print("================================================")
-        print(f"Processing... {source}")
+        print("Processing...")
        print(f"Source: {source}")
        print("---")
        print(f"Model: {model_id}")
        print(f"Framework: {framework}")
        print("================================================")
        print("")
        res = converter.convert(source)
        print("")
        # print(res.document.export_to_markdown())
        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
        framework = pipeline_options.vlm_options.inference_framework
        fname = f"{res.input.file.stem}-{model_id}-{framework}"
        inference_time = 0.0
@@ -161,11 +92,10 @@ def convert(sources: list[Path], converter):
        )
        print("====================================================")
        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
        return [
            source,
            model_id,
-            framework,
+            str(framework),
            pg_num,
            inference_time,
        ]
@@ -173,7 +103,6 @@ def convert(sources: list[Path], converter):
 if __name__ == "__main__":
    sources = [
        # "tests/data/2305.03393v1-pg9-img.png",
        "tests/data/pdf/2305.03393v1-pg9.pdf",
    ]
@@ -182,9 +111,6 @@ if __name__ == "__main__":
    ## Use VlmPipeline
    pipeline_options = VlmPipelineOptions()
    # If force_backend_text = True, text from backend will be used instead of generated text
    pipeline_options.force_backend_text = False
    pipeline_options.generate_page_images = True
    ## On GPU systems, enable flash_attention_2 with CUDA:
@@ -193,14 +119,17 @@ if __name__ == "__main__":
    rows = []
    for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
+        ## DocTags / SmolDocling models
-        smoldocling_vlm_mlx_conversion_options,
+        smoldocling_vlm_conversion_options,
-        # granite_vision_vlm_conversion_options, \
+        # smoldocling_vlm_mlx_conversion_options,
-        # phi_vlm_conversion_options, \
+        ## Markdown models (using MLX framework)
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        # qwen25_vl_3b_vlm_mlx_conversion_options,
        # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
+        # gemma_3_12b_mlx_conversion_options,
-        gemma_3_12b_mlx_conversion_options,
+        ## Markdown models (using Transformers framework)
        # granite_vision_vlm_conversion_options,
        phi_vlm_conversion_options,
        pixtral_12b_vlm_conversion_options,
    ]:
        pipeline_options.vlm_options = vlm_options
@@ -219,11 +148,13 @@ if __name__ == "__main__":
        )
        row = convert(sources=sources, converter=converter)
        print("pipelines: \n", converter._get_initialized_pipelines())
        rows.append(row)
-        print(tabulate(rows))
+        print(
            tabulate(
                rows, headers=["source", "model_id", "framework", "num_pages", "time"]
            )
        )
        print("see if memory gets released ...")
        time.sleep(10)