mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Initial implementation to support MLX for VLM pipeline and SmolDocling
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
f5adfb9724
commit
e7c29a89d0
@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
|
|||||||
MARKDOWN = "markdown"
|
MARKDOWN = "markdown"
|
||||||
|
|
||||||
|
|
||||||
|
class InferenceFramework(str, Enum):
|
||||||
|
MLX = "mlx"
|
||||||
|
TRANSFORMERS = "transformers"
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceVlmOptions(BaseVlmOptions):
|
class HuggingFaceVlmOptions(BaseVlmOptions):
|
||||||
kind: Literal["hf_model_options"] = "hf_model_options"
|
kind: Literal["hf_model_options"] = "hf_model_options"
|
||||||
|
|
||||||
@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|||||||
llm_int8_threshold: float = 6.0
|
llm_int8_threshold: float = 6.0
|
||||||
quantized: bool = False
|
quantized: bool = False
|
||||||
|
|
||||||
|
inference_framework: InferenceFramework
|
||||||
response_format: ResponseFormat
|
response_format: ResponseFormat
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|||||||
return self.repo_id.replace("/", "--")
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
||||||
|
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
||||||
|
prompt="Convert this page to docling.",
|
||||||
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
||||||
repo_id="ds4sd/SmolDocling-256M-preview",
|
repo_id="ds4sd/SmolDocling-256M-preview",
|
||||||
prompt="Convert this page to docling.",
|
prompt="Convert this page to docling.",
|
||||||
response_format=ResponseFormat.DOCTAGS,
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
)
|
)
|
||||||
|
|
||||||
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
||||||
@ -289,6 +304,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
# prompt="OCR the full page to markdown.",
|
# prompt="OCR the full page to markdown.",
|
||||||
prompt="OCR this image.",
|
prompt="OCR this image.",
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
InferenceFramework,
|
||||||
|
ResponseFormat,
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
@ -58,14 +63,27 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
self.keep_images = self.pipeline_options.generate_page_images
|
self.keep_images = self.pipeline_options.generate_page_images
|
||||||
|
|
||||||
self.build_pipe = [
|
if (
|
||||||
HuggingFaceVlmModel(
|
self.pipeline_options.vlm_options.inference_framework
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
== InferenceFramework.MLX
|
||||||
artifacts_path=artifacts_path,
|
):
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
self.build_pipe = [
|
||||||
vlm_options=self.pipeline_options.vlm_options,
|
HuggingFaceMlxModel(
|
||||||
),
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
]
|
artifacts_path=artifacts_path,
|
||||||
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
|
vlm_options=self.pipeline_options.vlm_options,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
self.build_pipe = [
|
||||||
|
HuggingFaceVlmModel(
|
||||||
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
|
vlm_options=self.pipeline_options.vlm_options,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||||
|
@ -10,6 +10,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
granite_vision_vlm_conversion_options,
|
granite_vision_vlm_conversion_options,
|
||||||
smoldocling_vlm_conversion_options,
|
smoldocling_vlm_conversion_options,
|
||||||
|
smoldocling_vlm_mlx_conversion_options,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
@ -29,7 +30,10 @@ pipeline_options.force_backend_text = False
|
|||||||
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
||||||
|
|
||||||
## Pick a VLM model. We choose SmolDocling-256M by default
|
## Pick a VLM model. We choose SmolDocling-256M by default
|
||||||
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
||||||
|
|
||||||
|
## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
|
||||||
|
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
|
||||||
|
|
||||||
## Alternative VLM models:
|
## Alternative VLM models:
|
||||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||||
|
Loading…
Reference in New Issue
Block a user