From 8e2b0b39c132d0323be5091923b9cfe9130200a3 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 19 Mar 2025 13:18:04 +0100
Subject: [PATCH] Add CLI choices for VLM pipeline and model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/cli/main.py                   | 119 ++++++++++++++++++--------
 docling/datamodel/pipeline_options.py |  14 ++-
 docling/models/hf_mlx_model.py        |  13 ++-
 docling/pipeline/vlm_pipeline.py      |   6 --
 pyproject.toml                        |   1 +
 5 files changed, 103 insertions(+), 50 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 7f0f20bf..c85a04f3 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -420,50 +436,77 @@ def convert(
             ocr_options.lang = ocr_lang_list
 
         accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-        pipeline_options = PdfPipelineOptions(
-            allow_external_plugins=allow_external_plugins,
-            enable_remote_services=enable_remote_services,
-            accelerator_options=accelerator_options,
-            do_ocr=ocr,
-            ocr_options=ocr_options,
-            do_table_structure=True,
-            do_code_enrichment=enrich_code,
-            do_formula_enrichment=enrich_formula,
-            do_picture_description=enrich_picture_description,
-            do_picture_classification=enrich_picture_classes,
-            document_timeout=document_timeout,
-        )
-        pipeline_options.table_structure_options.do_cell_matching = (
-            True  # do_cell_matching
-        )
-        pipeline_options.table_structure_options.mode = table_mode
+        pipeline_options: PaginatedPipelineOptions
 
-        if image_export_mode != ImageRefMode.PLACEHOLDER:
-            pipeline_options.generate_page_images = True
-            pipeline_options.generate_picture_images = (
-                True  # FIXME: to be deprecated in verson 3
+        if pipeline == PdfPipeline.STANDARD:
+            pipeline_options = PdfPipelineOptions(
+                allow_external_plugins=allow_external_plugins,
+                enable_remote_services=enable_remote_services,
+                accelerator_options=accelerator_options,
+                do_ocr=ocr,
+                ocr_options=ocr_options,
+                do_table_structure=True,
+                do_code_enrichment=enrich_code,
+                do_formula_enrichment=enrich_formula,
+                do_picture_description=enrich_picture_description,
+                do_picture_classification=enrich_picture_classes,
+                document_timeout=document_timeout,
+            )
+            pipeline_options.table_structure_options.do_cell_matching = (
+                True  # do_cell_matching
+            )
+            pipeline_options.table_structure_options.mode = table_mode
+
+            if image_export_mode != ImageRefMode.PLACEHOLDER:
+                pipeline_options.generate_page_images = True
+                pipeline_options.generate_picture_images = (
+                    True  # FIXME: to be deprecated in verson 3
+                )
+                pipeline_options.images_scale = 2
+
+            backend: Type[PdfDocumentBackend]
+            if pdf_backend == PdfBackend.DLPARSE_V1:
+                backend = DoclingParseDocumentBackend
+            elif pdf_backend == PdfBackend.DLPARSE_V2:
+                backend = DoclingParseV2DocumentBackend
+            elif pdf_backend == PdfBackend.DLPARSE_V4:
+                backend = DoclingParseV4DocumentBackend  # type: ignore
+            elif pdf_backend == PdfBackend.PYPDFIUM2:
+                backend = PyPdfiumDocumentBackend  # type: ignore
+            else:
+                raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+            pdf_format_option = PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        elif pipeline == PdfPipeline.VLM:
+            pipeline_options = VlmPipelineOptions()
+
+            if vlm_model == VlmModelType.GRANITE_VISION:
+                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+            elif vlm_model == VlmModelType.SMOLDOCLING:
+                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                if sys.platform == "darwin":
+                    try:
+                        import mlx_vlm
+
+                        pipeline_options.vlm_options = (
+                            smoldocling_vlm_mlx_conversion_options
+                        )
+                    except ImportError:
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+
+            pdf_format_option = PdfFormatOption(
+                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
             )
-            pipeline_options.images_scale = 2
 
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
 
-        backend: Type[PdfDocumentBackend]
-        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend = DoclingParseDocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V4:
-            backend = DoclingParseV4DocumentBackend  # type: ignore
-        elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend  # type: ignore
-        else:
-            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-        pdf_format_option = PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
-        )
         format_options: Dict[InputFormat, FormatOption] = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index c96603c4..654e04df 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -308,6 +308,11 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 
 
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -343,13 +348,14 @@ class PipelineOptions(BaseModel):
 
 
 class PaginatedPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
 
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None
 
     generate_page_images: bool = True
     force_backend_text: bool = (
@@ -362,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
-    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
@@ -393,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )
 
     generate_parsed_pages: bool = False
+
+
+class PdfPipeline(str, Enum):
+    STANDARD = "standard"
+    VLM = "vlm"
diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py
index 1c4577ac..762a6557 100644
--- a/docling/models/hf_mlx_model.py
+++ b/docling/models/hf_mlx_model.py
@@ -32,9 +32,15 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         if self.enabled:
-            from mlx_vlm import generate, load  # type: ignore
-            from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
-            from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+
+            try:
+                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
             self.apply_chat_template = apply_chat_template
@@ -113,7 +119,6 @@ class HuggingFaceMlxModel(BasePageModel):
                         verbose=False,
                     ):
                         output += token.text
-                        print(token.text, end="")
                         if "</doctag>" in token.text:
                             break
 
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index d01ee7f1..5237f2ae 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -34,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
-        warnings.warn(
-            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
-            category=UserWarning,
-            stacklevel=2,
-        )
-
         self.pipeline_options: VlmPipelineOptions
 
         artifacts_path: Optional[Path] = None
diff --git a/pyproject.toml b/pyproject.toml
index 0f85915f..8d121d36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -192,6 +192,7 @@ module = [
   "docling_ibm_models.*",
   "easyocr.*",
   "ocrmac.*",
+  "mlx_vlm.*",
   "lxml.*",
   "huggingface_hub.*",
   "transformers.*",