From e7c29a89d098449762698d903f81860e30787d13 Mon Sep 17 00:00:00 2001
From: Maksym Lysak
Date: Wed, 19 Mar 2025 10:51:20 +0100
Subject: [PATCH] Initial implementation to support MLX for VLM pipeline and SmolDocling

Signed-off-by: Maksym Lysak
---
Reviewer notes (this section is not part of the commit message):

- Line structure restored. Blank context lines were inferred from the hunk
  headers' line counts; verify against the target tree before applying.
- pipeline_options.py, hunk at old line 278: old-side count corrected from
  6 to 10 so it matches the ten pre-image lines the hunk carries.
- HuggingFaceVlmOptions.inference_framework now defaults to
  InferenceFramework.TRANSFORMERS, so callers that construct the options
  without the new field are not forced to pass it. The post-image blob id
  on that file's `index` line predates this change.
- NOTE(review): vlm_pipeline.py imports docling.models.hf_mlx_model, which
  is not among the files touched here -- confirm it exists in the target
  tree.

 docling/datamodel/pipeline_options.py | 16 ++++++++++++
 docling/pipeline/vlm_pipeline.py      | 36 ++++++++++++++++++++-------
 docs/examples/minimal_vlm_pipeline.py |  6 ++++-
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d28b5826..c96603c4 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
     MARKDOWN = "markdown"
 
 
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
 class HuggingFaceVlmOptions(BaseVlmOptions):
     kind: Literal["hf_model_options"] = "hf_model_options"
 
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     llm_int8_threshold: float = 6.0
     quantized: bool = False
 
+    inference_framework: InferenceFramework = InferenceFramework.TRANSFORMERS
     response_format: ResponseFormat
 
     @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+)
+
+
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,6 +304,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image.",
     response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 4afb918d..d01ee7f1 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
+from docling.datamodel.pipeline_options import (
+    InferenceFramework,
+    ResponseFormat,
+    VlmPipelineOptions,
+)
 from docling.datamodel.settings import settings
+from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -58,14 +63,27 @@ class VlmPipeline(PaginatedPipeline):
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        self.build_pipe = [
-            HuggingFaceVlmModel(
-                enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                vlm_options=self.pipeline_options.vlm_options,
-            ),
-        ]
+        if (
+            self.pipeline_options.vlm_options.inference_framework
+            == InferenceFramework.MLX
+        ):
+            self.build_pipe = [
+                HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
+        else:
+            self.build_pipe = [
+                HuggingFaceVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 948ecc64..c10b000f 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -10,6 +10,7 @@ from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -29,7 +30,10 @@ pipeline_options.force_backend_text = False
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
 
 ## Pick a VLM model. We choose SmolDocling-256M by default
-pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+
+## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
+pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options