Initial implementation to support MLX for VLM pipeline and SmolDocling

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-03-19 10:51:20 +01:00
parent f5adfb9724
commit e7c29a89d0
3 changed files with 48 additions and 10 deletions

View File

@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
MARKDOWN = "markdown" MARKDOWN = "markdown"
class InferenceFramework(str, Enum):
    """Enumerates the runtimes that can execute a VLM checkpoint.

    Mixing in ``str`` keeps members JSON/pydantic friendly: each member
    compares equal to, and serializes as, its plain string value.
    """

    # Apple-Silicon-optimized MLX runtime.
    MLX = "mlx"
    # Default Hugging Face transformers runtime.
    TRANSFORMERS = "transformers"
class HuggingFaceVlmOptions(BaseVlmOptions): class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options" kind: Literal["hf_model_options"] = "hf_model_options"
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
llm_int8_threshold: float = 6.0 llm_int8_threshold: float = 6.0
quantized: bool = False quantized: bool = False
inference_framework: InferenceFramework
response_format: ResponseFormat response_format: ResponseFormat
@property @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
return self.repo_id.replace("/", "--") return self.repo_id.replace("/", "--")
# SmolDocling served through MLX: bf16 weights converted for Apple Silicon,
# selected via InferenceFramework.MLX instead of the transformers runtime.
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
)
# Default SmolDocling configuration: the 256M preview checkpoint run via
# Hugging Face transformers, emitting DocTags for docling to parse.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
)
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,6 +304,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
# prompt="OCR the full page to markdown.", # prompt="OCR the full page to markdown.",
prompt="OCR this image.", prompt="OCR this image.",
response_format=ResponseFormat.MARKDOWN, response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
) )

View File

@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions from docling.datamodel.pipeline_options import (
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -58,14 +63,27 @@ class VlmPipeline(PaginatedPipeline):
self.keep_images = self.pipeline_options.generate_page_images self.keep_images = self.pipeline_options.generate_page_images
self.build_pipe = [ if (
HuggingFaceVlmModel( self.pipeline_options.vlm_options.inference_framework
enabled=True, # must be always enabled for this pipeline to make sense. == InferenceFramework.MLX
artifacts_path=artifacts_path, ):
accelerator_options=pipeline_options.accelerator_options, self.build_pipe = [
vlm_options=self.pipeline_options.vlm_options, HuggingFaceMlxModel(
), enabled=True, # must be always enabled for this pipeline to make sense.
] artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
else:
self.build_pipe = [
HuggingFaceVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
self.enrichment_pipe = [ self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument # Other models working on `NodeItem` elements in the DoclingDocument

View File

@@ -10,6 +10,7 @@ from docling.datamodel.pipeline_options import (
VlmPipelineOptions, VlmPipelineOptions,
granite_vision_vlm_conversion_options, granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options, smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
) )
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -29,7 +30,10 @@ pipeline_options.force_backend_text = False
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
## Pick a VLM model. We choose SmolDocling-256M by default ## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options # pipeline_options.vlm_options = smoldocling_vlm_conversion_options
## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models: ## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options # pipeline_options.vlm_options = granite_vision_vlm_conversion_options