Audio Processing
Audio is a critical modality for AI applications — from meeting transcription to voice assistants. This page covers speech-to-text with OpenAI's Whisper and speaker diarization (identifying who said what).
Whisper — Speech-to-Text
Whisper is OpenAI's general-purpose speech recognition model. It supports 99 languages and handles various audio qualities, accents, and background noise.
Installation
bash
# Install Whisper
pip install openai-whisper
# Install ffmpeg (required for audio processing)
# Ubuntu/Debian:
sudo apt install ffmpeg
# macOS:
brew install ffmpeg
Basic Transcription
python
import whisper

# Pick a checkpoint: tiny, base, small, medium, large, turbo
model = whisper.load_model("base")

# Run transcription on a local audio file
result = model.transcribe("meeting.wav")
print(result["text"])

# Skip language auto-detection by passing an explicit language code
medium_model = whisper.load_model("medium")
result = medium_model.transcribe("interview.mp3", language="en")
print(result["text"])
Model Size Comparison
| Model | Parameters | English VRAM | Multilingual VRAM | Relative Speed |
|---|---|---|---|---|
| tiny | 39M | ~1 GB | ~1 GB | ~32x |
| base | 74M | ~1 GB | ~1 GB | ~16x |
| small | 244M | ~2 GB | ~2 GB | ~6x |
| medium | 769M | ~5 GB | ~5 GB | ~2x |
| large | 1550M | ~10 GB | ~10 GB | ~1x |
| turbo | 809M | ~6 GB | ~6 GB | ~8x |
Choosing a Model
- Use tiny/base for quick prototyping and real-time applications
- Use small/medium for production English transcription
- Use large/turbo for challenging audio (noise, accents, multiple languages)
- The turbo model offers near-large quality at roughly 8x relative speed (vs. ~2x for medium) — often the best tradeoff
Advanced Transcription Options
python
def transcribe_with_options(
    audio_path: str,
    model_size: str = "medium",
    language: str | None = None,
    word_timestamps: bool = True,
    vad_filter: bool = True,
) -> dict:
    """Transcribe audio with detailed options.

    Args:
        audio_path: Path to audio file
        model_size: Whisper model size (tiny/base/small/medium/large/turbo)
        language: Language code (None for auto-detect)
        word_timestamps: Include word-level timestamps in each segment
        vad_filter: Kept for interface compatibility only. openai-whisper
            has no built-in Voice Activity Detection, so this flag is a
            no-op; use faster-whisper if VAD-based silence skipping is
            required.

    Returns:
        Dict with keys "text", "language", and "segments". Each segment
        contains "start", "end", "text", and — when word_timestamps is
        True — a "words" list of {"word", "start", "end"} dicts.
    """
    model = whisper.load_model(model_size)

    options = {
        "word_timestamps": word_timestamps,
        "fp16": False,  # fp32 decoding: slower but avoids fp16-on-CPU issues
    }
    if language:
        options["language"] = language

    result = model.transcribe(audio_path, **options)

    segments = []
    for seg in result["segments"]:
        entry = {
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"].strip(),
        }
        # Fix: word timestamps were requested but previously discarded
        # from the returned dict; surface them when available.
        if word_timestamps and "words" in seg:
            entry["words"] = [
                {"word": w["word"].strip(), "start": w["start"], "end": w["end"]}
                for w in seg["words"]
            ]
        segments.append(entry)

    return {
        "text": result["text"],
        "language": result["language"],
        "segments": segments,
    }

# Usage
result = transcribe_with_options("podcast.mp3", model_size="turbo")
for seg in result["segments"]:
    print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['text']}")
Using the OpenAI API
For production use, the Whisper API is faster and doesn't require local GPU:
python
from openai import OpenAI

client = OpenAI()

def transcribe_api(audio_path: str, language: str = "en") -> str:
    """Transcribe audio using the OpenAI Whisper API.

    Args:
        audio_path: Path to an audio file (API limit is 25 MB per file)
        language: ISO-639-1 code of the spoken language

    Returns:
        The transcribed text.
    """
    # Fix: the previous version requested verbose_json with segment-level
    # timestamp_granularities, then discarded everything except .text.
    # Request the default response since only the text is used.
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=language,
        )
    return transcript.text

# Usage
text = transcribe_api("meeting.mp3")
print(text)
Speaker Diarization
Speaker diarization answers the question: "Who spoke when?" It segments audio by speaker identity.
Using pyannote.audio
bash
pip install pyannote.audio
python
import os

from pyannote.audio import Pipeline
import torch

# Load the pipeline. Requires a HuggingFace token whose account has
# accepted the pyannote model user agreements. Read the token from the
# environment instead of hard-coding a secret in source.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ["HF_TOKEN"],
)

# Move to GPU if available
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))
def diarize_audio(audio_path: str, num_speakers: int | None = None) -> list[dict]:
    """Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to audio file
        num_speakers: Expected number of speakers (None for auto-detect)

    Returns:
        List of segments with speaker labels and timestamps
    """
    annotation = pipeline(audio_path, num_speakers=num_speakers)
    # itertracks yields (segment, track_id, speaker_label); track_id unused.
    return [
        {
            "speaker": label,
            "start": round(turn.start, 2),
            "end": round(turn.end, 2),
            "duration": round(turn.end - turn.start, 2),
        }
        for turn, _, label in annotation.itertracks(yield_label=True)
    ]

# Usage
segments = diarize_audio("meeting.wav", num_speakers=2)
for seg in segments:
    print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['speaker']}")
Combining Whisper + Diarization
python
def _extract_words(result: dict) -> list[dict]:
    """Flatten a Whisper result into a word list with timestamps."""
    return [
        {"word": w["word"].strip(), "start": w["start"], "end": w["end"]}
        for segment in result["segments"]
        for w in segment.get("words", [])
    ]

def _best_segment_index(word: dict, segments: list[dict]) -> int | None:
    """Index of the speaker segment the word overlaps most, or None."""
    best_idx, best_overlap = None, 0.0
    for i, seg in enumerate(segments):
        overlap = min(word["end"], seg["end"]) - max(word["start"], seg["start"])
        if overlap > best_overlap:
            best_idx, best_overlap = i, overlap
    return best_idx

def transcribe_with_speakers(audio_path: str, num_speakers: int = 2) -> list[dict]:
    """Get transcription with speaker labels.

    Combines Whisper (what was said) with pyannote (who said it).

    Args:
        audio_path: Path to audio file
        num_speakers: Expected number of speakers

    Returns:
        List of {"speaker", "start", "end", "text"} entries, one per
        speaker turn that contains at least one transcribed word.
    """
    speaker_segments = diarize_audio(audio_path, num_speakers=num_speakers)

    # Get word-level transcription
    model = whisper.load_model("medium")
    result = model.transcribe(audio_path, word_timestamps=True)
    words = _extract_words(result)

    # Fix: assign each word to exactly one speaker segment — the one it
    # overlaps most in time. The previous ±0.5 s slack window could copy a
    # word into two adjacent segments (duplicated text) and could drop
    # words that straddled a segment boundary.
    buckets: list[list[str]] = [[] for _ in speaker_segments]
    for word in words:
        idx = _best_segment_index(word, speaker_segments)
        if idx is not None:
            buckets[idx].append(word["word"])

    return [
        {
            "speaker": seg["speaker"],
            "start": seg["start"],
            "end": seg["end"],
            "text": " ".join(bucket),
        }
        for seg, bucket in zip(speaker_segments, buckets)
        if bucket
    ]

# Usage
result = transcribe_with_speakers("interview.wav", num_speakers=2)
for entry in result:
    print(f"{entry['speaker']}: {entry['text']}")
Audio Preprocessing
python
import librosa
import soundfile as sf
def preprocess_audio(
    input_path: str,
    output_path: str,
    target_sr: int = 16000,
    normalize: bool = True,
    trim_silence: bool = True,
):
    """Preprocess audio for speech recognition.

    Loads the input as mono at the target sample rate, optionally
    peak-normalizes it and trims edge silence, then writes the result.

    Args:
        input_path: Input audio file path
        output_path: Output audio file path
        target_sr: Target sample rate (16000 Hz for Whisper)
        normalize: Normalize audio volume
        trim_silence: Remove leading/trailing silence
    """
    waveform, _ = librosa.load(input_path, sr=target_sr, mono=True)

    if normalize:
        waveform = librosa.util.normalize(waveform)

    if trim_silence:
        # trim() returns (trimmed_audio, interval); keep only the audio
        waveform, _ = librosa.effects.trim(waveform, top_db=25)

    sf.write(output_path, waveform, target_sr)
    print(f"Preprocessed audio saved to {output_path}")
    print(f"Duration: {len(waveform)/target_sr:.1f}s, Sample rate: {target_sr}Hz")
Audio Format Requirements
Whisper works best with:
- Sample rate: 16,000 Hz (auto-resampled)
- Format: WAV, MP3, FLAC, M4A
- Channels: Mono (stereo is auto-downmixed)
- Max file size: 25 MB (API); local runs have no fixed limit beyond your machine's memory and time budget
- For best results, preprocess to 16kHz mono WAV
Speaker Diarization Access
pyannote.audio models require you to:
- Accept the user agreement on HuggingFace
- Accept the segmentation model agreement too
- Use a HuggingFace access token with read permissions