use AutoModelForVision2Seq for Pixtral and review example (including rename)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-11 14:18:30 +00:00 · 2025-06-01 16:30:58 +02:00
parent 0cb7520648
commit 9dbf08a084
4 changed files with 39 additions and 110 deletions
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -119,16 +119,16 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 # Pixtral
 pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
    scale=2.0,
    temperature=0.0,
 )

 pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -138,7 +138,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Phi4
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
-    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
    scale=2.0,
@@ -148,7 +148,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 # Qwen
 qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -158,7 +158,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Gemma-3
 gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
@@ -167,7 +167,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(

 gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    scale=2.0,
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -116,7 +116,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                    assert page.size is not None

                    hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-                    print(hi_res_image)

                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size
@@ -127,7 +126,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):

                    inputs = self.processor(
                        text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  # .to(self.device)
+                    ).to(self.device)

                    # Generate response
                    start_time = time.time()
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -40,7 +40,6 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):

            self.device = decide_device(accelerator_options.device)
            self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-
            _log.debug(f"Available device for HuggingFace VLM: {self.device}")

            self.use_cache = vlm_options.use_kv_cache
@@ -73,7 +72,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
+                    # torch_dtype=torch.bfloat16,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -1,3 +1,9 @@
+# Compare VLM models
+# ==================
+#
+# This example runs the VLM pipeline with different vision-language models.
+# Their runtime and output quality are compared.
+
 import json
 import time
 from pathlib import Path
@@ -8,9 +14,6 @@ from tabulate import tabulate

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
-    HuggingFaceVlmOptions,
-    InferenceFramework,
-    ResponseFormat,
    gemma_3_12b_mlx_conversion_options,
    granite_vision_vlm_conversion_options,
    granite_vision_vlm_ollama_conversion_options,
@@ -27,96 +30,24 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

-## Use experimental VlmPipeline
-pipeline_options = VlmPipelineOptions()
-# If force_backend_text = True, text from backend will be used instead of generated text
-pipeline_options.force_backend_text = False
-pipeline_options.generate_page_images = True

-## On GPU systems, enable flash_attention_2 with CUDA:
-# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
-# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
-
-## Pick a VLM model. We choose SmolDocling-256M by default
-# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
-
-## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
-# pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
-
-## Alternative VLM models:
-# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
-
-pipeline_options.vlm_options = phi_vlm_conversion_options
-# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-     repo_id="mistralai/Pixtral-12B-Base-2409",
-     prompt="OCR this image and export it in MarkDown.",
-     response_format=ResponseFormat.MARKDOWN,
-     inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mistral-community/pixtral-12b",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="microsoft/Phi-4-multimodal-instruct",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
-)
-pipeline_options.vlm_options = phi_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-    scale=1.0,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-qwen_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-)
-pipeline_options.vlm_options = qwen_vlm_conversion_options
-"""
-
-
-def convert(sources: list[Path], converter):
+def convert(sources: list[Path], converter: DocumentConverter):
+    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
+    framework = pipeline_options.vlm_options.inference_framework
    for source in sources:
-        # start_time = time.time()
        print("================================================")
-        print(f"Processing... {source}")
+        print("Processing...")
+        print(f"Source: {source}")
+        print("---")
+        print(f"Model: {model_id}")
+        print(f"Framework: {framework}")
        print("================================================")
        print("")

        res = converter.convert(source)

        print("")
-        # print(res.document.export_to_markdown())

-        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-        framework = pipeline_options.vlm_options.inference_framework
        fname = f"{res.input.file.stem}-{model_id}-{framework}"

        inference_time = 0.0
@@ -161,11 +92,10 @@ def convert(sources: list[Path], converter):
        )
        print("====================================================")

-        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
        return [
            source,
            model_id,
-            framework,
+            str(framework),
            pg_num,
            inference_time,
        ]
@@ -173,7 +103,6 @@ def convert(sources: list[Path], converter):

 if __name__ == "__main__":
    sources = [
-        # "tests/data/2305.03393v1-pg9-img.png",
        "tests/data/pdf/2305.03393v1-pg9.pdf",
    ]

@@ -182,9 +111,6 @@ if __name__ == "__main__":

    ## Use VlmPipeline
    pipeline_options = VlmPipelineOptions()
-
-    # If force_backend_text = True, text from backend will be used instead of generated text
-    pipeline_options.force_backend_text = False
    pipeline_options.generate_page_images = True

    ## On GPU systems, enable flash_attention_2 with CUDA:
@@ -193,14 +119,17 @@ if __name__ == "__main__":

    rows = []
    for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options,
-        # granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        ## DocTags / SmolDocling models
+        smoldocling_vlm_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options,
+        ## Markdown models (using MLX framework)
+        # qwen25_vl_3b_vlm_mlx_conversion_options,
        # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
-        gemma_3_12b_mlx_conversion_options,
+        # gemma_3_12b_mlx_conversion_options,
+        ## Markdown models (using Transformers framework)
+        # granite_vision_vlm_conversion_options,
+        phi_vlm_conversion_options,
+        pixtral_12b_vlm_conversion_options,
    ]:
        pipeline_options.vlm_options = vlm_options

@@ -219,11 +148,13 @@ if __name__ == "__main__":
        )

        row = convert(sources=sources, converter=converter)
-        print("pipelines: \n", converter._get_initialized_pipelines())
-
        rows.append(row)

-        print(tabulate(rows))
+        print(
+            tabulate(
+                rows, headers=["source", "model_id", "framework", "num_pages", "time"]
+            )
+        )

        print("see if memory gets released ...")
        time.sleep(10)