refactor instances of VLM models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-11 14:18:30 +00:00 · 2025-06-01 16:55:56 +02:00
parent fb0d979419
commit 0b2c1d5eda
6 changed files with 128 additions and 128 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,13 +29,6 @@ from docling.datamodel.base_models import (
    OutputFormat,
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_model_specializations import (
    VlmModelType,
    granite_vision_vlm_conversion_options,
    granite_vision_vlm_ollama_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
@@ -48,6 +41,13 @@ from docling.datamodel.pipeline_options import (
    TableFormerMode,
    VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_vlm_model_spec import (
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
@@ -549,20 +549,16 @@ def convert(  # noqa: C901
            )
            if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = (
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
                    granite_vision_vlm_ollama_conversion_options
                )
            elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
-                        pipeline_options.vlm_options = (
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                            smoldocling_vlm_mlx_conversion_options
                        )
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -16,10 +16,13 @@ from pydantic import (
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated
-from docling.datamodel.pipeline_model_specializations import (
+# Import the following for backwards compatibility
-    ApiVlmOptions,
+from docling.datamodel.pipeline_vlm_model_spec import (
-    HuggingFaceVlmOptions,
+    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
+    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
 )
 _log = logging.getLogger(__name__)
@@ -302,6 +305,65 @@ class PaginatedPipelineOptions(PipelineOptions):
    generate_picture_images: bool = False
 class BaseVlmOptions(BaseModel):
    """Common base for VLM option models; declares the fields every variant carries."""
    # Tag identifying the concrete option type. HuggingFaceVlmOptions and
    # ApiVlmOptions narrow this to a single Literal value with a default.
    kind: str
    # Prompt text for the model (the presets use e.g. "Convert this page to docling.").
    prompt: str
 class ResponseFormat(str, Enum):
    """Output formats that a VLM option set declares through its ``response_format`` field."""
    # The ``str`` mix-in means each member is also a plain string equal to its value.
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
 class InferenceFramework(str, Enum):
    """String-valued identifiers used for ``HuggingFaceVlmOptions.inference_framework``."""
    MLX = "mlx"
    TRANSFORMERS = "transformers"
    # NOTE(review): the suffixed members presumably select the transformers
    # class of the same name when loading the model -- confirm in the loader.
    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
    TRANSFORMERS_LlavaForConditionalGeneration = (
        "transformers-LlavaForConditionalGeneration"
    )
 class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a VLM identified by a repository id (``repo_id``).

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied; ``prompt`` is inherited from BaseVlmOptions
    and is likewise required.
    """
    # Fixed tag for this option type; overrides the free-form ``kind`` of the base.
    kind: Literal["hf_model_options"] = "hf_model_options"
    # Repository identifier in "owner/name" form, e.g. "ds4sd/SmolDocling-256M-preview".
    repo_id: str
    # NOTE(review): the next three fields look like quantization settings
    # (8-bit loading and its threshold) -- confirm how the model loader reads them.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False
    inference_framework: InferenceFramework
    response_format: ResponseFormat
    # NOTE(review): presumably the scaling factor applied to the page image
    # before inference -- confirm against the pipeline.
    scale: float = 2.0
    temperature: float = 0.0
    # NOTE(review): presumably strings that end generation when produced -- confirm.
    stop_strings: list[str] = []
    use_kv_cache: bool = True
    max_new_tokens: int = 4096
    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--".

        The result contains no path separator from the repo id, so it can be
        used as a single directory name (the model downloader joins it onto
        its output directory).
        """
        return self.repo_id.replace("/", "--")
 class ApiVlmOptions(BaseVlmOptions):
    """Options for a VLM reached through an HTTP endpoint given by ``url``.

    Only ``response_format`` (and the inherited ``prompt``) lack a default
    and must be supplied.
    """
    # Fixed tag for this option type; overrides the free-form ``kind`` of the base.
    kind: Literal["api_model_options"] = "api_model_options"
    # Endpoint URL; the default is a chat-completions path on localhost port 11434.
    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    # NOTE(review): presumably extra HTTP headers sent with each request -- confirm.
    headers: Dict[str, str] = {}
    # Additional request parameters; the Granite Vision Ollama preset passes
    # {"model": "granite3.2-vision:2b"} here.
    params: Dict[str, Any] = {}
    # NOTE(review): presumably the page-image scaling factor -- confirm against the pipeline.
    scale: float = 2.0
    # NOTE(review): unit not shown here; looks like seconds -- confirm in the API client.
    timeout: float = 60
    # NOTE(review): presumably the number of parallel requests allowed -- confirm.
    concurrency: int = 1
    response_format: ResponseFormat
 class VlmPipelineOptions(PaginatedPipelineOptions):
    generate_page_images: bool = True
    force_backend_text: bool = (
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -1,83 +1,22 @@
 import logging
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
 from pydantic import (
    AnyUrl,
-    BaseModel,
+)
 from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
 )
 _log = logging.getLogger(__name__)
 class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
 class ResponseFormat(str, Enum):
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"
 class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
    TRANSFORMERS_LlavaForConditionalGeneration = (
        "transformers-LlavaForConditionalGeneration"
    )
 class HuggingFaceVlmOptions(BaseVlmOptions):
    kind: Literal["hf_model_options"] = "hf_model_options"
    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False
    inference_framework: InferenceFramework
    response_format: ResponseFormat
    scale: float = 2.0
    temperature: float = 0.0
    stop_strings: list[str] = []
    use_kv_cache: bool = True
    max_new_tokens: int = 4096
    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")
 class ApiVlmOptions(BaseVlmOptions):
    kind: Literal["api_model_options"] = "api_model_options"
    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
 class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
 # SmolDocling
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_MLX = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
@@ -86,7 +25,7 @@ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )
-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
@@ -96,7 +35,7 @@ smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 # GraniteVision
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -105,7 +44,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
@@ -116,7 +55,7 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 )
 # Pixtral
-pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -125,7 +64,7 @@ pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )
-pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -135,7 +74,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )
 # Phi4
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
+PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    response_format=ResponseFormat.MARKDOWN,
@@ -145,7 +84,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 # Qwen
-qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+QWEN25_VL_3B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -155,7 +94,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )
 # Gemma-3
-gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_12B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -164,7 +103,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )
-gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_27B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -172,3 +111,9 @@ gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
    scale=2.0,
    temperature=0.0,
 )
 class VlmModelType(str, Enum):
    """Named model choices; the CLI compares its ``vlm_model`` value against these members."""
    # The ``str`` mix-in means each member is also a plain string equal to its value.
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -27,13 +27,11 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_model_specializations import (
+from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
 )
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -2,14 +2,14 @@ import logging
 from pathlib import Path
 from typing import Optional
 from docling.datamodel.pipeline_model_specializations import (
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.pipeline_options import (
    granite_picture_description,
    smolvlm_picture_description,
 )
 from docling.datamodel.pipeline_vlm_model_spec import (
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
 )
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -87,8 +87,8 @@ def download_models(
    if with_smoldocling:
        _log.info("Downloading SmolDocling model...")
        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_conversion_options.repo_id,
+            repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
-            local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
+            local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
            force=force,
            progress=progress,
        )
@@ -96,9 +96,8 @@ def download_models(
    if with_smoldocling_mlx:
        _log.info("Downloading SmolDocling MLX model...")
        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
+            repo_id=SMOLDOCLING_MLX.repo_id,
-            local_dir=output_dir
+            local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
            / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
            force=force,
            progress=progress,
        )
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -13,20 +13,20 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
    gemma_3_12b_mlx_conversion_options,
    granite_vision_vlm_conversion_options,
    granite_vision_vlm_ollama_conversion_options,
    phi_vlm_conversion_options,
    pixtral_12b_vlm_conversion_options,
    pixtral_12b_vlm_mlx_conversion_options,
    qwen25_vl_3b_vlm_mlx_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_vlm_model_spec import (
    GEMMA3_12B_MLX,
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    PHI4_TRANSFORMERS,
    PIXTRAL_12B_MLX,
    PIXTRAL_12B_TRANSFORMERS,
    QWEN25_VL_3B_MLX,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -120,16 +120,16 @@ if __name__ == "__main__":
    rows = []
    for vlm_options in [
        ## DocTags / SmolDocling models
-        smoldocling_vlm_conversion_options,
+        SMOLDOCLING_TRANSFORMERS,
-        # smoldocling_vlm_mlx_conversion_options,
+        SMOLDOCLING_MLX,
        ## Markdown models (using MLX framework)
-        # qwen25_vl_3b_vlm_mlx_conversion_options,
+        QWEN25_VL_3B_MLX,
-        # pixtral_12b_vlm_mlx_conversion_options,
+        PIXTRAL_12B_MLX,
-        # gemma_3_12b_mlx_conversion_options,
+        GEMMA3_12B_MLX,
        ## Markdown models (using Transformers framework)
-        # granite_vision_vlm_conversion_options,
+        GRANITE_VISION_TRANSFORMERS,
-        phi_vlm_conversion_options,
+        PHI4_TRANSFORMERS,
-        pixtral_12b_vlm_conversion_options,
+        PIXTRAL_12B_TRANSFORMERS,
    ]:
        pipeline_options.vlm_options = vlm_options