#!/usr/bin/env python3
"""
Performance comparison between CPU and MLX Whisper on Apple Silicon.

This script compares the performance of:
1. Native Whisper (forced to CPU)
2. MLX Whisper (Apple Silicon optimized)

Both use the same model size for fair comparison.
"""
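
# MLX Whisper is an optional extra; when it (or Apple Silicon / MPS) is
# missing, the script falls back to a CPU-only comparison instead of failing.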
import argparse
import sys
import time
from pathlib import Path

# Add the repository root to the path so we can import docling
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.datamodel.pipeline_options_asr_model import (
    InferenceAsrFramework,
    InlineAsrMlxWhisperOptions,
    InlineAsrNativeWhisperOptions,
)
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

def create_cpu_whisper_options(model_size: str = "turbo"):
    """Create native Whisper options forced to CPU."""
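    # Note: for the native backend, repo_id is the plain Whisper model name
    # ("tiny", "base", "turbo", ...) rather than a Hugging Face repo id.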
    return InlineAsrNativeWhisperOptions(
        repo_id=model_size,
        inference_framework=InferenceAsrFramework.WHISPER,
        verbose=True,
        timestamps=True,
        word_timestamps=True,
        temperature=0.0,
        max_new_tokens=256,
        max_time_chunk=30.0,
    )

def create_mlx_whisper_options(model_size: str = "turbo"):
    """Create MLX Whisper options for Apple Silicon."""
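    # Hugging Face community conversions of Whisper to MLX. The medium/large
    # entries are 8-bit quantized variants, trading a bit of accuracy for a
    # smaller memory footprint.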
    model_map = {
        "tiny": "mlx-community/whisper-tiny-mlx",
        "small": "mlx-community/whisper-small-mlx",
        "base": "mlx-community/whisper-base-mlx",
        "medium": "mlx-community/whisper-medium-mlx-8bit",
        "large": "mlx-community/whisper-large-mlx-8bit",
        "turbo": "mlx-community/whisper-turbo",
    }

    return InlineAsrMlxWhisperOptions(
        repo_id=model_map[model_size],
        inference_framework=InferenceAsrFramework.MLX,
        language="en",
        task="transcribe",
        word_timestamps=True,
        no_speech_threshold=0.6,
        logprob_threshold=-1.0,
        compression_ratio_threshold=2.4,
    )

def run_transcription_test(
    audio_file: Path, asr_options, device: AcceleratorDevice, test_name: str
):
    """Run a single transcription test and return timing results."""
    print(f"\n{'=' * 60}")
    print(f"Running {test_name}")
    print(f"Device: {device}")
    print(f"Model: {asr_options.repo_id}")
    print(f"Framework: {asr_options.inference_framework}")
    print(f"{'=' * 60}")
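
    # Binding AudioFormatOption to InputFormat.AUDIO below is what routes the
    # input file through AsrPipeline with the chosen device and ASR options.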
    # Create pipeline options
    pipeline_options = AsrPipelineOptions(
        accelerator_options=AcceleratorOptions(device=device),
        asr_options=asr_options,
    )

    # Create document converter
    converter = DocumentConverter(
        format_options={
            InputFormat.AUDIO: AudioFormatOption(
                pipeline_cls=AsrPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    # Run transcription with timing
    start_time = time.time()
    try:
        result = converter.convert(audio_file)
        end_time = time.time()

        duration = end_time - start_time

        if result.status.value == "success":
            # Extract text for verification
            text_content = []
            for item in result.document.texts:
                text_content.append(item.text)

            print(f"✅ Success! Duration: {duration:.2f} seconds")
            print(f"Transcribed text: {''.join(text_content)[:100]}...")
            return duration, True
        else:
            print(f"❌ Failed! Status: {result.status}")
            return duration, False

    except Exception as e:
        end_time = time.time()
        duration = end_time - start_time
        print(f"❌ Error: {e}")
        return duration, False

def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Performance comparison between CPU and MLX Whisper on Apple Silicon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:

  # Use default test audio file
  python asr_pipeline_performance_comparison.py

  # Use your own audio file
  python asr_pipeline_performance_comparison.py --audio /path/to/your/audio.mp3

  # Use a different audio file from the tests directory
  python asr_pipeline_performance_comparison.py --audio tests/data/audio/another_sample.wav
""",
    )

    parser.add_argument(
        "--audio",
        type=str,
        help="Path to audio file for testing (default: tests/data/audio/sample_10s.mp3)",
    )

    return parser.parse_args()

def main():
    """Run performance comparison between CPU and MLX Whisper."""
    args = parse_args()
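
    # MPS (Metal Performance Shaders) support doubles as the Apple Silicon
    # check; torch is imported inside a try block so the script still runs
    # without it.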
    # Check if we're on Apple Silicon
    try:
        import torch

        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
    except ImportError:
        has_mps = False

    try:
        import mlx_whisper

        has_mlx_whisper = True
    except ImportError:
        has_mlx_whisper = False

    print("ASR Pipeline Performance Comparison")
    print("=" * 50)
    print(f"Apple Silicon (MPS) available: {has_mps}")
    print(f"MLX Whisper available: {has_mlx_whisper}")
    if not has_mps:
        print("⚠️ Apple Silicon (MPS) not available - running CPU-only comparison")
        print("   MLX Whisper is optimized for Apple Silicon; run there to see its performance benefits.")

    if not has_mlx_whisper:
        print("⚠️ MLX Whisper not installed - running CPU-only comparison")
        print("   Install with: pip install mlx-whisper")
        print("   Or: uv sync --extra asr")
    # Determine audio file path
    if args.audio:
        audio_file = Path(args.audio)
        if not audio_file.is_absolute():
            # If relative, resolve against the repository root
            audio_file = Path(__file__).parent.parent.parent / audio_file
    else:
        # Use default test audio file
        audio_file = (
            Path(__file__).parent.parent.parent
            / "tests"
            / "data"
            / "audio"
            / "sample_10s.mp3"
        )

    if not audio_file.exists():
        print(f"❌ Audio file not found: {audio_file}")
        print("   Please check the path and try again.")
        sys.exit(1)

    print(f"Using test audio: {audio_file}")
    print(f"File size: {audio_file.stat().st_size / 1024:.1f} KB")
    # Test different model sizes
    model_sizes = ["tiny", "base", "turbo"]
    results = {}
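    # Three sizes keep the runtime short; create_mlx_whisper_options also maps
    # "small", "medium", and "large" if you want a fuller sweep.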
    for model_size in model_sizes:
        print(f"\n{'#' * 80}")
        print(f"Testing model size: {model_size}")
        print(f"{'#' * 80}")

        model_results = {}

        # Test 1: Native Whisper (forced to CPU)
        cpu_options = create_cpu_whisper_options(model_size)
        cpu_duration, cpu_success = run_transcription_test(
            audio_file,
            cpu_options,
            AcceleratorDevice.CPU,
            f"Native Whisper {model_size} (CPU)",
        )
        model_results["cpu"] = {"duration": cpu_duration, "success": cpu_success}

        # Test 2: MLX Whisper (Apple Silicon optimized) - only if available
        if has_mps and has_mlx_whisper:
            mlx_options = create_mlx_whisper_options(model_size)
            mlx_duration, mlx_success = run_transcription_test(
                audio_file,
                mlx_options,
                AcceleratorDevice.MPS,
                f"MLX Whisper {model_size} (MPS)",
            )
            model_results["mlx"] = {"duration": mlx_duration, "success": mlx_success}
        else:
            print(f"\n{'=' * 60}")
            print(f"Skipping MLX Whisper {model_size} (MPS) - not available")
            print(f"{'=' * 60}")
            model_results["mlx"] = {"duration": 0.0, "success": False}

        results[model_size] = model_results
    # Print summary
    print(f"\n{'#' * 80}")
    print("PERFORMANCE COMPARISON SUMMARY")
    print(f"{'#' * 80}")
    print(
        f"{'Model':<10} {'CPU (sec)':<12} {'MLX (sec)':<12} {'Speedup':<12} {'Status':<10}"
    )
    print("-" * 80)
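    # Speedup is CPU wall time over MLX wall time; inf flags an MLX failure
    # (or skip) and 0 flags a CPU failure, so both stand out in the table.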
    for model_size, model_results in results.items():
        cpu_duration = model_results["cpu"]["duration"]
        mlx_duration = model_results["mlx"]["duration"]
        cpu_success = model_results["cpu"]["success"]
        mlx_success = model_results["mlx"]["success"]

        if cpu_success and mlx_success:
            speedup = cpu_duration / mlx_duration
            status = "✅ Both OK"
        elif cpu_success:
            speedup = float("inf")
            status = "❌ MLX Failed"
        elif mlx_success:
            speedup = 0
            status = "❌ CPU Failed"
        else:
            speedup = 0
            status = "❌ Both Failed"

        print(
            f"{model_size:<10} {cpu_duration:<12.2f} {mlx_duration:<12.2f} {speedup:<12.2f}x {status:<10}"
        )

    # Calculate overall improvement
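    # The average below is a ratio of mean durations (equivalently, total CPU
    # time over total MLX time), not a mean of per-model speedups, so longer
    # runs weigh more heavily.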
    successful_tests = [
        (r["cpu"]["duration"], r["mlx"]["duration"])
        for r in results.values()
        if r["cpu"]["success"] and r["mlx"]["success"]
    ]

    if successful_tests:
        avg_cpu = sum(cpu for cpu, mlx in successful_tests) / len(successful_tests)
        avg_mlx = sum(mlx for cpu, mlx in successful_tests) / len(successful_tests)
        avg_speedup = avg_cpu / avg_mlx

        print("-" * 80)
        print(
            f"{'AVERAGE':<10} {avg_cpu:<12.2f} {avg_mlx:<12.2f} {avg_speedup:<12.2f}x {'Overall':<10}"
        )

        print(f"\n🎯 MLX Whisper provides {avg_speedup:.1f}x average speedup over CPU!")
    else:
        if has_mps and has_mlx_whisper:
            print("\n❌ No successful comparisons available.")
        else:
            print("\n⚠️ MLX Whisper not available - only CPU results shown.")
            print(
                "   Install MLX Whisper and run on Apple Silicon for performance comparison."
            )

if __name__ == "__main__":
    main()