Add CLI choices for VLM pipeline and model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-13 07:08:19 +00:00 · 2025-03-19 13:18:04 +01:00
parent 0cd9b48372
commit 8e2b0b39c1
5 changed files with 103 additions and 50 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipeline,
    PdfPipelineOptions,
    TableFormerMode,
    VlmModelType,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
            help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
        ),
    ] = ImageRefMode.EMBEDDED,
    pipeline: Annotated[
        PdfPipeline,
        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
    ] = PdfPipeline.STANDARD,
    vlm_model: Annotated[
        VlmModelType,
        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
    ] = VlmModelType.SMOLDOCLING,
    ocr: Annotated[
        bool,
        typer.Option(
@@ -420,50 +436,77 @@ def convert(
            ocr_options.lang = ocr_lang_list
        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-        pipeline_options = PdfPipelineOptions(
+        pipeline_options: PaginatedPipelineOptions
            allow_external_plugins=allow_external_plugins,
            enable_remote_services=enable_remote_services,
            accelerator_options=accelerator_options,
            do_ocr=ocr,
            ocr_options=ocr_options,
            do_table_structure=True,
            do_code_enrichment=enrich_code,
            do_formula_enrichment=enrich_formula,
            do_picture_description=enrich_picture_description,
            do_picture_classification=enrich_picture_classes,
            document_timeout=document_timeout,
        )
        pipeline_options.table_structure_options.do_cell_matching = (
            True  # do_cell_matching
        )
        pipeline_options.table_structure_options.mode = table_mode
-        if image_export_mode != ImageRefMode.PLACEHOLDER:
+        if pipeline == PdfPipeline.STANDARD:
-            pipeline_options.generate_page_images = True
+            pipeline_options = PdfPipelineOptions(
-            pipeline_options.generate_picture_images = (
+                allow_external_plugins=allow_external_plugins,
-                True  # FIXME: to be deprecated in verson 3
+                enable_remote_services=enable_remote_services,
                accelerator_options=accelerator_options,
                do_ocr=ocr,
                ocr_options=ocr_options,
                do_table_structure=True,
                do_code_enrichment=enrich_code,
                do_formula_enrichment=enrich_formula,
                do_picture_description=enrich_picture_description,
                do_picture_classification=enrich_picture_classes,
                document_timeout=document_timeout,
            )
            pipeline_options.table_structure_options.do_cell_matching = (
                True  # do_cell_matching
            )
            pipeline_options.table_structure_options.mode = table_mode
            if image_export_mode != ImageRefMode.PLACEHOLDER:
                pipeline_options.generate_page_images = True
                pipeline_options.generate_picture_images = (
                    True  # FIXME: to be deprecated in version 3
                )
                pipeline_options.images_scale = 2
            backend: Type[PdfDocumentBackend]
            if pdf_backend == PdfBackend.DLPARSE_V1:
                backend = DoclingParseDocumentBackend
            elif pdf_backend == PdfBackend.DLPARSE_V2:
                backend = DoclingParseV2DocumentBackend
            elif pdf_backend == PdfBackend.DLPARSE_V4:
                backend = DoclingParseV4DocumentBackend  # type: ignore
            elif pdf_backend == PdfBackend.PYPDFIUM2:
                backend = PyPdfiumDocumentBackend  # type: ignore
            else:
                raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
            pdf_format_option = PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=backend,  # pdf_backend
            )
        elif pipeline == PdfPipeline.VLM:
            pipeline_options = VlmPipelineOptions()
            if vlm_model == VlmModelType.GRANITE_VISION:
                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
            elif vlm_model == VlmModelType.SMOLDOCLING:
                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
                        pipeline_options.vlm_options = (
                            smoldocling_vlm_mlx_conversion_options
                        )
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
                            "pip install mlx-vlm"
                        )
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
            pipeline_options.images_scale = 2
        if artifacts_path is not None:
            pipeline_options.artifacts_path = artifacts_path
        backend: Type[PdfDocumentBackend]
        if pdf_backend == PdfBackend.DLPARSE_V1:
            backend = DoclingParseDocumentBackend
        elif pdf_backend == PdfBackend.DLPARSE_V2:
            backend = DoclingParseV2DocumentBackend
        elif pdf_backend == PdfBackend.DLPARSE_V4:
            backend = DoclingParseV4DocumentBackend  # type: ignore
        elif pdf_backend == PdfBackend.PYPDFIUM2:
            backend = PyPdfiumDocumentBackend  # type: ignore
        else:
            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
        pdf_format_option = PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=backend,  # pdf_backend
        )
        format_options: Dict[InputFormat, FormatOption] = {
            InputFormat.PDF: pdf_format_option,
            InputFormat.IMAGE: pdf_format_option,
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -308,6 +308,11 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
 )
 class VlmModelType(str, Enum):
    """Enum of the VLM models that can be selected for the VLM pipeline."""
    # Members are plain strings (str mixin); each value is the lower-case member name.
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""
@@ -343,13 +348,14 @@ class PipelineOptions(BaseModel):
 class PaginatedPipelineOptions(PipelineOptions):
    """Options base class extended by VlmPipelineOptions and PdfPipelineOptions."""
    # Optional path given as Path or str; None when unset.
    # NOTE(review): presumably the location model artifacts are loaded from - confirm.
    artifacts_path: Optional[Union[Path, str]] = None
    # Defaults to 1.0. NOTE(review): presumably the scale factor applied to
    # generated images - confirm against the pipeline implementation.
    images_scale: float = 1.0
    # Off by default; VlmPipelineOptions overrides this default to True.
    generate_page_images: bool = False
    # Off by default.
    generate_picture_images: bool = False
 class VlmPipelineOptions(PaginatedPipelineOptions):
    artifacts_path: Optional[Union[Path, str]] = None
    generate_page_images: bool = True
    force_backend_text: bool = (
@@ -362,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    do_code_enrichment: bool = False  # True: perform code OCR
@@ -393,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    )
    generate_parsed_pages: bool = False
 class PdfPipeline(str, Enum):
    """Enum of the pipelines that can be chosen to process PDF or image files."""
    # Members are plain strings (str mixin); each value is the lower-case member name.
    STANDARD = "standard"
    VLM = "vlm"
--- a/docling/models/hf_mlx_model.py
+++ b/docling/models/hf_mlx_model.py
@@ -32,9 +32,15 @@ class HuggingFaceMlxModel(BasePageModel):
        self.vlm_options = vlm_options
        if self.enabled:
-            from mlx_vlm import generate, load  # type: ignore
+
-            from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+            try:
-            from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+                from mlx_vlm import generate, load  # type: ignore
                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
            except ImportError:
                raise ImportError(
                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
                )
            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            self.apply_chat_template = apply_chat_template
@@ -113,7 +119,6 @@ class HuggingFaceMlxModel(BasePageModel):
                        verbose=False,
                    ):
                        output += token.text
                        print(token.text, end="")
                        if "</doctag>" in token.text:
                            break
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -34,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
        super().__init__(pipeline_options)
        self.keep_backend = True
        warnings.warn(
            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
            category=UserWarning,
            stacklevel=2,
        )
        self.pipeline_options: VlmPipelineOptions
        artifacts_path: Optional[Path] = None
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -192,6 +192,7 @@ module = [
  "docling_ibm_models.*",
  "easyocr.*",
  "ocrmac.*",
  "mlx_vlm.*",
  "lxml.*",
  "huggingface_hub.*",
  "transformers.*",