Refactor VLM model presets into UPPER_CASE constants

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-11 14:18:30 +00:00 · 2025-06-01 16:55:56 +02:00
parent fb0d979419
commit 0b2c1d5eda
6 changed files with 128 additions and 128 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,13 +29,6 @@ from docling.datamodel.base_models import (
    OutputFormat,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_model_specializations import (
-    VlmModelType,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
@@ -48,6 +41,13 @@ from docling.datamodel.pipeline_options import (
    TableFormerMode,
    VlmPipelineOptions,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+    VlmModelType,
+)
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
@@ -549,20 +549,16 @@ def convert(  # noqa: C901
            )

            if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = (
-                    granite_vision_vlm_ollama_conversion_options
-                )
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
            elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm

-                        pipeline_options.vlm_options = (
-                            smoldocling_vlm_mlx_conversion_options
-                        )
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -16,10 +16,13 @@ from pydantic import (
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated

-from docling.datamodel.pipeline_model_specializations import (
-    ApiVlmOptions,
-    HuggingFaceVlmOptions,
-    smoldocling_vlm_conversion_options,
+# Legacy lowercase aliases for backwards compatibility. NOTE(review): the spec module imports this module back; verify no import cycle.
+from docling.datamodel.pipeline_vlm_model_spec import (
+    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
+    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
+    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
+    VlmModelType,
 )

 _log = logging.getLogger(__name__)
@@ -302,6 +305,65 @@ class PaginatedPipelineOptions(PipelineOptions):
    generate_picture_images: bool = False


+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+    HTML = "html"
+
+
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
+    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
+    TRANSFORMERS_LlavaForConditionalGeneration = (
+        "transformers-LlavaForConditionalGeneration"
+    )
+
+
+class HuggingFaceVlmOptions(BaseVlmOptions):
+    kind: Literal["hf_model_options"] = "hf_model_options"
+
+    repo_id: str
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    response_format: ResponseFormat
+
+    scale: float = 2.0
+
+    temperature: float = 0.0
+    stop_strings: list[str] = []
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    concurrency: int = 1
+    response_format: ResponseFormat
+
+
 class VlmPipelineOptions(PaginatedPipelineOptions):
    generate_page_images: bool = True
    force_backend_text: bool = (
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -1,83 +1,22 @@
 import logging
 from enum import Enum
-from pathlib import Path
-from typing import Any, ClassVar, Dict, List, Literal, Optional, Union

 from pydantic import (
    AnyUrl,
-    BaseModel,
+)
+
+from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    HuggingFaceVlmOptions,
+    InferenceFramework,
+    ResponseFormat,
 )

 _log = logging.getLogger(__name__)


-class BaseVlmOptions(BaseModel):
-    kind: str
-    prompt: str
-
-
-class ResponseFormat(str, Enum):
-    DOCTAGS = "doctags"
-    MARKDOWN = "markdown"
-    HTML = "html"
-
-
-class InferenceFramework(str, Enum):
-    MLX = "mlx"
-    TRANSFORMERS = "transformers"
-    TRANSFORMERS_AutoModelForVision2Seq = "transformers-AutoModelForVision2Seq"
-    TRANSFORMERS_AutoModelForCausalLM = "transformers-AutoModelForCausalLM"
-    TRANSFORMERS_LlavaForConditionalGeneration = (
-        "transformers-LlavaForConditionalGeneration"
-    )
-
-
-class HuggingFaceVlmOptions(BaseVlmOptions):
-    kind: Literal["hf_model_options"] = "hf_model_options"
-
-    repo_id: str
-    load_in_8bit: bool = True
-    llm_int8_threshold: float = 6.0
-    quantized: bool = False
-
-    inference_framework: InferenceFramework
-    response_format: ResponseFormat
-
-    scale: float = 2.0
-
-    temperature: float = 0.0
-    stop_strings: list[str] = []
-
-    use_kv_cache: bool = True
-    max_new_tokens: int = 4096
-
-    @property
-    def repo_cache_folder(self) -> str:
-        return self.repo_id.replace("/", "--")
-
-
-class ApiVlmOptions(BaseVlmOptions):
-    kind: Literal["api_model_options"] = "api_model_options"
-
-    url: AnyUrl = AnyUrl(
-        "http://localhost:11434/v1/chat/completions"
-    )  # Default to ollama
-    headers: Dict[str, str] = {}
-    params: Dict[str, Any] = {}
-    scale: float = 2.0
-    timeout: float = 60
-    concurrency: int = 1
-    response_format: ResponseFormat
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # SmolDocling
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_MLX = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
@@ -86,7 +25,7 @@ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )

-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+SMOLDOCLING_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
@@ -96,7 +35,7 @@ smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
 )

 # GraniteVision
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+GRANITE_VISION_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -105,7 +44,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )

-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
@@ -116,7 +55,7 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 )

 # Pixtral
-pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -125,7 +64,7 @@ pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )

-pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+PIXTRAL_12B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -135,7 +74,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )

 # Phi4
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
+PHI4_TRANSFORMERS = HuggingFaceVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    response_format=ResponseFormat.MARKDOWN,
@@ -145,7 +84,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 )

 # Qwen
-qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+QWEN25_VL_3B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -155,7 +94,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 )

 # Gemma-3
-gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_12B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -164,7 +103,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
    temperature=0.0,
 )

-gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
+GEMMA3_27B_MLX = HuggingFaceVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
@@ -172,3 +111,9 @@ gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
    scale=2.0,
    temperature=0.0,
 )
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -27,13 +27,11 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_model_specializations import (
+from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
-)
-from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -2,14 +2,14 @@ import logging
 from pathlib import Path
 from typing import Optional

-from docling.datamodel.pipeline_model_specializations import (
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
    granite_picture_description,
    smolvlm_picture_description,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+)
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -87,8 +87,8 @@ def download_models(
    if with_smoldocling:
        _log.info("Downloading SmolDocling model...")
        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_conversion_options.repo_id,
-            local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
+            repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
+            local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
            force=force,
            progress=progress,
        )
@@ -96,9 +96,8 @@ def download_models(
    if with_smoldocling_mlx:
        _log.info("Downloading SmolDocling MLX model...")
        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
-            local_dir=output_dir
-            / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
+            repo_id=SMOLDOCLING_MLX.repo_id,
+            local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
            force=force,
            progress=progress,
        )
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -13,20 +13,20 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate

 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_model_specializations import (
-    gemma_3_12b_mlx_conversion_options,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    phi_vlm_conversion_options,
-    pixtral_12b_vlm_conversion_options,
-    pixtral_12b_vlm_mlx_conversion_options,
-    qwen25_vl_3b_vlm_mlx_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
+from docling.datamodel.pipeline_vlm_model_spec import (
+    GEMMA3_12B_MLX,
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    PHI4_TRANSFORMERS,
+    PIXTRAL_12B_MLX,
+    PIXTRAL_12B_TRANSFORMERS,
+    QWEN25_VL_3B_MLX,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -120,16 +120,16 @@ if __name__ == "__main__":
    rows = []
    for vlm_options in [
        ## DocTags / SmolDocling models
-        smoldocling_vlm_conversion_options,
-        # smoldocling_vlm_mlx_conversion_options,
+        SMOLDOCLING_TRANSFORMERS,
+        SMOLDOCLING_MLX,
        ## Markdown models (using MLX framework)
-        # qwen25_vl_3b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_mlx_conversion_options,
-        # gemma_3_12b_mlx_conversion_options,
+        QWEN25_VL_3B_MLX,
+        PIXTRAL_12B_MLX,
+        GEMMA3_12B_MLX,
        ## Markdown models (using Transformers framework)
-        # granite_vision_vlm_conversion_options,
-        phi_vlm_conversion_options,
-        pixtral_12b_vlm_conversion_options,
+        GRANITE_VISION_TRANSFORMERS,
+        PHI4_TRANSFORMERS,
+        PIXTRAL_12B_TRANSFORMERS,
    ]:
        pipeline_options.vlm_options = vlm_options