Files
docling/docs/examples/minimal_asr_pipeline.py
2025-10-18 11:00:28 -07:00

92 lines
3.3 KiB
Python
Vendored

# %% [markdown]
# Minimal ASR pipeline example: transcribe an audio file to Markdown text.
#
# What this example does
# - Configures the ASR pipeline with a default model spec and converts one audio file.
# - Prints the recognized speech segments in Markdown with timestamps.
#
# Prerequisites
# - Install Docling with ASR extras and any audio dependencies (ffmpeg, etc.).
# - Ensure your environment can download or access the configured ASR model.
# - Some formats require ffmpeg codecs; install ffmpeg and ensure it's on PATH.
#
# How to run
# - From the repository root, run: `python docs/examples/minimal_asr_pipeline.py`.
# - The script prints the transcription to stdout.
#
# Customizing the model
# - By default the script uses `asr_model_specs.WHISPER_TURBO`, which automatically selects the best implementation for your hardware (MLX Whisper on Apple Silicon, native Whisper otherwise).
# - Edit `get_asr_converter()` to manually override `pipeline_options.asr_options` with any model from `asr_model_specs`.
# - Keep `InputFormat.AUDIO` and `AsrPipeline` unchanged for a minimal setup.
#
# Input audio
# - Defaults to `tests/data/audio/sample_10s.mp3`. Update `audio_path` to your own file if needed.
# %%
from pathlib import Path
from docling_core.types.doc import DoclingDocument
from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline
def get_asr_converter():
    """Build a `DocumentConverter` whose audio inputs go through the ASR pipeline.

    The pipeline options are set to `asr_model_specs.WHISPER_TURBO`, which
    automatically picks the best implementation for your hardware:
    - MLX Whisper Turbo for Apple Silicon (M1/M2/M3) with mlx-whisper installed
    - Native Whisper Turbo as fallback

    To experiment with different model sizes, assign another model spec from
    `docling.datamodel.asr_model_specs` to `pipeline_options.asr_options`.

    Returns:
        A `DocumentConverter` with `InputFormat.AUDIO` mapped to `AsrPipeline`.
    """
    pipeline_options = AsrPipelineOptions()
    pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO

    # Bind the ASR pipeline class and its options to the audio input format.
    audio_format_option = AudioFormatOption(
        pipeline_cls=AsrPipeline,
        pipeline_options=pipeline_options,
    )
    return DocumentConverter(
        format_options={InputFormat.AUDIO: audio_format_option},
    )
def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
    """Run the ASR pipeline on one audio file and return its transcript.

    Args:
        audio_path: Location of the audio file to transcribe.

    Returns:
        The `document` attribute of the successful conversion result.

    Raises:
        AssertionError: If `audio_path` does not exist, or if the conversion
            finishes with a status other than `ConversionStatus.SUCCESS`.
    """
    # Raise explicitly rather than using `assert`: assert statements are
    # stripped under `python -O`, which would silently disable both checks.
    # AssertionError is kept so callers see the same failure type as before.
    if not audio_path.exists():
        raise AssertionError(f"Test audio file not found: {audio_path}")

    converter = get_asr_converter()

    # Convert the audio file.
    result: ConversionResult = converter.convert(audio_path)

    # Refuse to hand back a document from an unsuccessful conversion.
    if result.status != ConversionStatus.SUCCESS:
        raise AssertionError(f"Conversion failed with status: {result.status}")

    return result.document
if __name__ == "__main__":
    # Sample clip, given relative to the repository root; point this at your
    # own file to transcribe something else.
    audio_path = Path("tests/data/audio/sample_10s.mp3")

    # Transcribe the clip, then write the Markdown rendering to stdout.
    transcript_markdown = asr_pipeline_conversion(
        audio_path=audio_path
    ).export_to_markdown()
    print(transcript_markdown)

    # Expected output:
    #
    # [time: 0.0-4.0] Shakespeare on Scenery by Oscar Wilde
    #
    # [time: 5.28-9.96] This is a LibriVox recording. All LibriVox recordings are in the public domain.