Initial implementation to support MLX for VLM pipeline and SmolDocling

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-03-19 10:51:20 +01:00
parent f5adfb9724
commit e7c29a89d0
3 changed files with 48 additions and 10 deletions

View File

@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
MARKDOWN = "markdown" MARKDOWN = "markdown"
class InferenceFramework(str, Enum):
    """Enumerates the runtimes that can execute a VLM checkpoint.

    Mixing in ``str`` keeps members JSON/pydantic friendly: each member
    compares equal to, and serializes as, its plain string value.
    """

    # Apple-Silicon-optimized MLX runtime.
    MLX = "mlx"
    # Default Hugging Face transformers runtime.
    TRANSFORMERS = "transformers"
class HuggingFaceVlmOptions(BaseVlmOptions): class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options" kind: Literal["hf_model_options"] = "hf_model_options"
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
llm_int8_threshold: float = 6.0 llm_int8_threshold: float = 6.0
quantized: bool = False quantized: bool = False
inference_framework: InferenceFramework
response_format: ResponseFormat response_format: ResponseFormat
@property @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
return self.repo_id.replace("/", "--") return self.repo_id.replace("/", "--")
# SmolDocling served through MLX: bf16 weights converted for Apple Silicon,
# selected via InferenceFramework.MLX instead of the transformers runtime.
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
)
# Default SmolDocling configuration: the 256M preview checkpoint run via
# Hugging Face transformers, emitting DocTags for docling to parse.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
)
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,6 +304,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
# prompt="OCR the full page to markdown.", # prompt="OCR the full page to markdown.",
prompt="OCR this image.", prompt="OCR this image.",
response_format=ResponseFormat.MARKDOWN, response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
) )

View File

@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions from docling.datamodel.pipeline_options import (
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -58,14 +63,27 @@ class VlmPipeline(PaginatedPipeline):
self.keep_images = self.pipeline_options.generate_page_images self.keep_images = self.pipeline_options.generate_page_images
self.build_pipe = [ if (
HuggingFaceVlmModel( self.pipeline_options.vlm_options.inference_framework
enabled=True, # must be always enabled for this pipeline to make sense. == InferenceFramework.MLX
artifacts_path=artifacts_path, ):
accelerator_options=pipeline_options.accelerator_options, self.build_pipe = [
vlm_options=self.pipeline_options.vlm_options, HuggingFaceMlxModel(
), enabled=True, # must be always enabled for this pipeline to make sense.
] artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
else:
self.build_pipe = [
HuggingFaceVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
self.enrichment_pipe = [ self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument # Other models working on `NodeItem` elements in the DoclingDocument

View File

@@ -10,6 +10,7 @@ from docling.datamodel.pipeline_options import (
VlmPipelineOptions, VlmPipelineOptions,
granite_vision_vlm_conversion_options, granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options, smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
) )
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -29,7 +30,10 @@ pipeline_options.force_backend_text = False
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
## Pick a VLM model. We choose SmolDocling-256M by default ## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options # pipeline_options.vlm_options = smoldocling_vlm_conversion_options
## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models: ## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options # pipeline_options.vlm_options = granite_vision_vlm_conversion_options