Whisper Speech Recognition — Production Deployment Guide
Whisper is an automatic speech recognition (ASR) model from OpenAI that supports multilingual transcription and translation. It was trained on 680,000 hours of audio and covers 99 languages, including Thai, which is best served by the large and large-v3 models.
Whisper comes in several sizes: tiny (39M params), base (74M), small (244M), medium (769M), large-v3 (1550M). Larger models are more accurate but need more resources; for Thai, medium or larger is recommended for acceptable accuracy.
Capacity planning for Whisper means estimating the resources your expected workload requires: concurrent users, audio duration, latency requirements, throughput targets and budget all drive the choice of hardware, model size and architecture.
Deploying Whisper in Production
Setup Whisper inference server
# === Whisper Production Setup ===
# 1. Install faster-whisper (CTranslate2 backend — about 4x faster)
pip install faster-whisper
# 2. Basic Usage
cat > transcribe.py << 'PYEOF'
#!/usr/bin/env python3
"""Transcribe a single audio file with faster-whisper and report timing."""
from faster_whisper import WhisperModel
import time

# Model sizes: tiny, base, small, medium, large-v3.
# Weights are downloaded automatically on first use.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

t0 = time.time()
segments, info = model.transcribe(
    "audio.wav",
    language="th",
    beam_size=5,
    vad_filter=True,  # skip silence via Voice Activity Detection
    vad_parameters={
        "min_silence_duration_ms": 500,
        "speech_pad_ms": 400,
    },
)

print(f"Language: {info.language} (prob: {info.language_probability:.2f})")
print(f"Duration: {info.duration:.1f}s")

# Decoding is lazy: iterating the generator performs the transcription.
for segment in segments:
    print(f"[{segment.start:.1f}s -> {segment.end:.1f}s] {segment.text}")

elapsed = time.time() - t0
print(f"\nProcessing time: {elapsed:.1f}s")
print(f"Real-time factor: {elapsed/info.duration:.2f}x")
PYEOF
# 3. FastAPI Server
cat > whisper_server.py << 'PYEOF'
#!/usr/bin/env python3
"""FastAPI wrapper around faster-whisper exposing /transcribe and /health."""
from fastapi import FastAPI, UploadFile, File, HTTPException
from faster_whisper import WhisperModel
import tempfile
import time
import os

app = FastAPI(title="Whisper ASR API")

# Load model once at startup: model load is expensive, so a single shared
# instance serves all requests (one request decoded at a time per GPU).
model = WhisperModel("large-v3", device="cuda", compute_type="float16")


@app.post("/transcribe")
async def transcribe(
    file: UploadFile = File(...),
    language: str = "th",
    beam_size: int = 5,
):
    """Transcribe an uploaded audio file.

    Returns the full text, per-segment timings, detected language,
    audio duration, processing time and real-time factor (RTF).
    """
    # Save uploaded file to disk so faster-whisper/ffmpeg can read it.
    # NOTE(review): suffix is hard-coded to .wav regardless of upload type;
    # decoding sniffs content, so this is cosmetic — confirm acceptable.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    try:
        start = time.time()
        segments, info = model.transcribe(
            tmp_path,
            language=language,
            beam_size=beam_size,
            vad_filter=True,
        )
        results = []
        full_text = []
        # Iterating the segment generator performs the actual decoding.
        for seg in segments:
            results.append({
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "text": seg.text.strip(),
            })
            full_text.append(seg.text.strip())
        elapsed = time.time() - start
        return {
            "text": " ".join(full_text),
            "segments": results,
            "language": info.language,
            "duration": round(info.duration, 2),
            "processing_time": round(elapsed, 2),
            # Duration floor avoids division by zero on empty audio.
            "rtf": round(elapsed / max(info.duration, 0.1), 3),
        }
    finally:
        # Always remove the temp file, even when transcription fails.
        os.unlink(tmp_path)


@app.get("/health")
def health():
    """Liveness probe; reports the configured model size."""
    return {"status": "ok", "model": "large-v3"}
PYEOF
# Run server
uvicorn whisper_server:app --host 0.0.0.0 --port 8000 --workers 1
echo "Whisper server installed"
Capacity Planning for Speech-to-Text
Estimate the resources required for the workload.
#!/usr/bin/env python3
# capacity_planning.py — Whisper Capacity Planning
import json
import logging
import math
from typing import Dict
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
class WhisperCapacityPlanner:
    """Plan GPU capacity and cost for a Whisper transcription service.

    All estimates are driven by the real-time factor (RTF), defined as
    processing time / audio duration; RTF < 1.0 means faster than real time.
    """

    # Measured RTF per model size and device, plus VRAM needed to host it.
    # Lower RTF = faster.
    RTF_BY_MODEL = {
        "tiny": {"cpu": 0.8, "t4": 0.05, "a10g": 0.03, "a100": 0.02, "vram_gb": 1},
        "base": {"cpu": 1.5, "t4": 0.08, "a10g": 0.05, "a100": 0.03, "vram_gb": 1},
        "small": {"cpu": 4.0, "t4": 0.15, "a10g": 0.08, "a100": 0.05, "vram_gb": 2},
        "medium": {"cpu": 10.0, "t4": 0.30, "a10g": 0.15, "a100": 0.08, "vram_gb": 5},
        "large-v3": {"cpu": 25.0, "t4": 0.50, "a10g": 0.25, "a100": 0.12, "vram_gb": 10},
    }

    # On-demand price per instance-hour (USD).
    GPU_COST_PER_HOUR = {
        "t4": 0.526,
        "a10g": 1.006,
        "a100": 3.673,
        "cpu_8core": 0.34,
    }

    def __init__(self, model_size: str = "large-v3", gpu_type: str = "a10g"):
        """Look up the RTF for the chosen model/device pair.

        Raises:
            KeyError: if model_size or gpu_type is unknown.
        """
        self.model_size = model_size
        self.gpu_type = gpu_type
        self.rtf = self.RTF_BY_MODEL[model_size][gpu_type]

    def calculate_throughput(self, num_gpus: int = 1) -> Dict:
        """Return the maximum audio-hours processable per wall-clock hour."""
        # Each GPU can process 1/RTF hours of audio per hour.
        throughput_per_gpu = 1.0 / self.rtf
        total_throughput = throughput_per_gpu * num_gpus
        return {
            "model": self.model_size,
            "gpu": self.gpu_type,
            "num_gpus": num_gpus,
            "rtf": self.rtf,
            "audio_hours_per_hour_per_gpu": round(throughput_per_gpu, 1),
            "total_audio_hours_per_hour": round(total_throughput, 1),
            # Audio-minutes per minute equals audio-hours per hour
            # (the original `* 60 / 60` was a no-op).
            "audio_minutes_per_minute": round(total_throughput, 1),
        }

    def plan_for_workload(self, daily_audio_hours, peak_concurrent, max_latency_sec, avg_audio_sec=60) -> Dict:
        """Size the GPU fleet for a workload and estimate its cost.

        Args:
            daily_audio_hours: total hours of audio to process per day.
            peak_concurrent: peak number of simultaneous requests.
            max_latency_sec: per-request latency target in seconds.
            avg_audio_sec: average clip length in seconds.

        Returns:
            Dict with "workload", "recommendation" and "cost" sections.
        """
        # Throughput constraint: fit the daily volume into 20 processing
        # hours, leaving 4 hours/day of headroom.
        processing_hours_available = 20
        min_gpus_throughput = math.ceil(daily_audio_hours * self.rtf / processing_hours_available)

        # Concurrency constraint: each GPU serves one request at a time,
        # and one request takes avg_audio_sec * RTF seconds.
        latency_per_request = avg_audio_sec * self.rtf
        if latency_per_request > max_latency_sec:
            suggestion = f"Use smaller model or faster GPU. Current latency: {latency_per_request:.1f}s"
        else:
            suggestion = f"Latency OK: {latency_per_request:.1f}s < {max_latency_sec}s target"
        min_gpus_concurrency = math.ceil(peak_concurrent * latency_per_request / max_latency_sec)

        # Take the stricter constraint, then add 30% headroom.
        recommended_gpus = max(min_gpus_throughput, min_gpus_concurrency)
        recommended_gpus = math.ceil(recommended_gpus * 1.3)

        cost_per_hour = recommended_gpus * self.GPU_COST_PER_HOUR.get(self.gpu_type, 1.0)
        cost_per_month = cost_per_hour * 730  # average hours in a month

        return {
            "workload": {
                "daily_audio_hours": daily_audio_hours,
                "peak_concurrent_requests": peak_concurrent,
                "max_latency_sec": max_latency_sec,
                "avg_audio_duration_sec": avg_audio_sec,
            },
            "recommendation": {
                "model": self.model_size,
                "gpu_type": self.gpu_type,
                "gpus_for_throughput": min_gpus_throughput,
                "gpus_for_concurrency": min_gpus_concurrency,
                "recommended_gpus": recommended_gpus,
                "latency_per_request_sec": round(latency_per_request, 1),
                "latency_suggestion": suggestion,
            },
            "cost": {
                "cost_per_hour": round(cost_per_hour, 2),
                "cost_per_month": round(cost_per_month, 2),
                # Guard zero volume (previously raised ZeroDivisionError).
                "cost_per_audio_hour": (
                    round(cost_per_month / (daily_audio_hours * 30), 2)
                    if daily_audio_hours else None
                ),
            },
        }
# Demo: size a large-v3 deployment running on A10G GPUs.
planner = WhisperCapacityPlanner("large-v3", "a10g")

throughput = planner.calculate_throughput(num_gpus=2)
print("Throughput:", json.dumps(throughput, indent=2))

workload_plan = planner.plan_for_workload(
    daily_audio_hours=50,
    peak_concurrent=10,
    max_latency_sec=30,
    avg_audio_sec=60,
)
print("\nCapacity Plan:", json.dumps(workload_plan, indent=2))
Scaling Architecture
Designing an architecture that scales.
# === Whisper Scaling Architecture ===
# Architecture:
# Client -> API Gateway -> Queue (Redis) -> GPU Workers -> Storage (S3)
#                                               |
#                                               +-> Result Store (PostgreSQL)
# 1. Kubernetes Deployment
cat > k8s/whisper-worker.yaml << 'EOF'
# GPU worker Deployment: one Whisper model instance per pod, one GPU each.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: whisper-worker
  namespace: speech
spec:
  replicas: 4  # NOTE(review): HPA below also manages replicas; confirm this baseline is intended
  selector:
    matchLabels:
      app: whisper-worker
  template:
    metadata:
      labels:
        app: whisper-worker
    spec:
      containers:
        - name: worker
          image: ghcr.io/myorg/whisper-worker:latest
          env:
            - name: MODEL_SIZE
              value: "large-v3"
            - name: COMPUTE_TYPE
              value: "float16"
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: whisper-secrets
                  key: redis-url
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
            requests:
              nvidia.com/gpu: 1
              memory: "12Gi"
          volumeMounts:
            # Persist downloaded model weights across pod restarts.
            - name: model-cache
              mountPath: /root/.cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: whisper-model-cache
        # In-memory /dev/shm for shared-memory buffers.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: "4Gi"
      tolerations:
        # Allow scheduling onto tainted GPU nodes.
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
---
# Autoscaler: scales workers on the external redis_queue_length metric,
# targeting ~3 queued jobs per worker.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: whisper-worker-hpa
  namespace: speech
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: whisper-worker
  minReplicas: 2
  maxReplicas: 20
  metrics:
    - type: External
      external:
        metric:
          name: redis_queue_length
          selector:
            matchLabels:
              queue: whisper-transcribe
        target:
          type: AverageValue
          averageValue: "3"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60  # react quickly to queue growth
      policies:
        - type: Pods
          value: 4
          periodSeconds: 60
    scaleDown:
      stabilizationWindowSeconds: 300  # scale down slowly to avoid thrash
      policies:
        - type: Pods
          value: 2
          periodSeconds: 120
EOF
# 2. Queue-based Worker
cat > worker.py << 'PYEOF'
#!/usr/bin/env python3
"""Queue worker: pops transcription jobs from Redis and stores results.

Job payload (JSON): {"job_id": ..., "audio_path": ..., "language": optional}.
The result is written to whisper:result:<job_id> with a 1-hour TTL, and
completion (success or failure) is announced on whisper:done:<job_id>.
"""
import redis
import json
import time
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")
r = redis.Redis.from_url("redis://redis:6379/0")

print("Worker started, waiting for jobs...")
while True:
    # Blocking pop with a timeout so the loop wakes up periodically.
    job = r.brpop("whisper:queue", timeout=30)
    if job is None:
        continue
    data = json.loads(job[1])
    job_id = data["job_id"]
    audio_path = data["audio_path"]
    try:
        start = time.time()
        segments, info = model.transcribe(audio_path, language=data.get("language", "th"), vad_filter=True)
        result = {
            "text": " ".join(s.text.strip() for s in segments),
            "duration": info.duration,
            "processing_time": time.time() - start,
        }
        r.set(f"whisper:result:{job_id}", json.dumps(result), ex=3600)
        r.publish(f"whisper:done:{job_id}", "ok")
    except Exception as e:
        r.set(f"whisper:result:{job_id}", json.dumps({"error": str(e)}), ex=3600)
        # Bug fix: notify waiters on failure too; previously only the
        # success path published, so subscribers hung forever on errors.
        r.publish(f"whisper:done:{job_id}", "error")
PYEOF
kubectl apply -f k8s/
echo "Scaling architecture deployed"
Performance Optimization
Optimizing Whisper for maximum throughput.
# === Whisper Performance Optimization ===
# 1. Use faster-whisper with CTranslate2 (4x faster than OpenAI whisper)
pip install faster-whisper
# 2. Use INT8 quantization (2x faster, slight quality loss)
python3 -c "
from faster_whisper import WhisperModel
# INT8 quantization - fastest compute type
model = WhisperModel('large-v3', device='cuda', compute_type='int8_float16')
# FP16 - good balance of speed and accuracy
# model = WhisperModel('large-v3', device='cuda', compute_type='float16')
# Benchmark: report real-time factor per test file.
# NOTE: only single quotes here - this script runs inside a
# double-quoted shell python3 -c invocation.
import time
test_files = ['test1.wav', 'test2.wav', 'test3.wav']
for f in test_files:
    try:
        start = time.time()
        segments, info = model.transcribe(f, language='th', beam_size=5, vad_filter=True)
        text = ' '.join(s.text for s in segments)
        elapsed = time.time() - start
        print(f'{f}: {info.duration:.1f}s audio, {elapsed:.1f}s process, RTF={elapsed/info.duration:.3f}')
    except Exception as e:
        # Bug fix: the bare `except: pass` swallowed every error silently;
        # report the skipped file instead.
        print(f'{f}: skipped ({e})')
"
# 3. Batched Transcription
cat > batch_transcribe.py << 'PYEOF'
#!/usr/bin/env python3
"""Batch-transcribe a directory of WAV files and report aggregate RTF."""
from faster_whisper import WhisperModel, BatchedInferencePipeline
import time
import glob

model = WhisperModel("large-v3", device="cuda", compute_type="int8_float16")
pipeline = BatchedInferencePipeline(model=model)

audio_files = glob.glob("audio/*.wav")
print(f"Processing {len(audio_files)} files...")

total_audio = 0
total_time = 0
for f in audio_files:
    t0 = time.time()
    segments, info = pipeline.transcribe(f, language="th", batch_size=16)
    transcript = " ".join(s.text.strip() for s in segments)
    took = time.time() - t0
    total_audio += info.duration
    total_time += took
    print(f"{f}: {info.duration:.0f}s -> {took:.1f}s (RTF: {took/info.duration:.3f})")

print(f"\nTotal: {total_audio/3600:.1f}h audio in {total_time/3600:.2f}h")
print(f"Overall RTF: {total_time/total_audio:.3f}")
PYEOF
# 4. VAD Pre-processing (skip silence)
# faster-whisper has built-in VAD (Silero VAD)
# This skips silent parts, reducing processing time 20-40%
# 5. Model Selection Guide
# | Model | VRAM | RTF (A10G) | WER (Thai) | Use Case |
# |----------|-------|------------|------------|--------------------|
# | tiny | 1 GB | 0.03 | ~35% | Real-time, low acc |
# | base | 1 GB | 0.05 | ~25% | Fast, moderate acc |
# | small | 2 GB | 0.08 | ~18% | Good balance |
# | medium | 5 GB | 0.15 | ~12% | High accuracy |
# | large-v3 | 10 GB | 0.25 | ~8% | Best accuracy |
echo "Performance optimization complete"
Monitoring ????????? Cost Analysis
Monitor the Whisper service and analyze its operating costs.
#!/usr/bin/env python3
# whisper_monitoring.py — Whisper Service Monitoring
import json
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
class WhisperMonitoring:
    """Dashboard metrics plus rule-based cost/performance recommendations."""

    def __init__(self):
        # Reserved for collected metrics; dashboard_metrics() currently
        # returns representative sample data.
        self.metrics = {}

    def dashboard_metrics(self):
        """Return a snapshot of real-time, daily-summary and cost metrics.

        NOTE(review): values are hard-coded sample data; wire this up to
        the real metrics store before relying on it.
        """
        return {
            "real_time": {
                "active_workers": 4,
                "queue_depth": 12,
                "avg_latency_sec": 15.3,
                "p95_latency_sec": 28.7,
                "gpu_utilization_pct": 78,
                "requests_per_minute": 8.5,
            },
            "daily_summary": {
                "total_requests": 4250,
                "total_audio_hours": 52.3,
                "avg_audio_duration_sec": 44.3,
                "success_rate_pct": 99.2,
                "avg_rtf": 0.28,
                "languages": {"th": 3800, "en": 350, "ja": 100},
            },
            "cost_analysis": {
                "gpu_instances": 4,
                "gpu_type": "A10G",
                "cost_per_hour": 4.024,
                "daily_cost": 96.58,
                "monthly_cost": 2937.52,
                "cost_per_audio_hour": 1.87,
                "cost_per_request": 0.023,
            },
        }

    def optimization_recommendations(self, metrics):
        """Derive actionable scale/cost recommendations from a snapshot.

        Rules (checked in order): GPU under/over-utilization, queue
        backlog, slow RTF, and high per-audio-hour cost.
        """
        recs = []
        gpu_util = metrics["real_time"]["gpu_utilization_pct"]
        if gpu_util < 50:
            recs.append({
                "action": "Scale down GPU workers",
                "reason": f"GPU utilization only {gpu_util}%",
                "savings": "25-50% cost reduction",
            })
        elif gpu_util > 90:
            recs.append({
                "action": "Scale up GPU workers",
                "reason": f"GPU utilization {gpu_util}% - risk of queue buildup",
                "impact": "Reduce latency, prevent timeouts",
            })
        queue = metrics["real_time"]["queue_depth"]
        if queue > 20:
            recs.append({
                "action": "Add more workers or use faster model",
                "reason": f"Queue depth {queue} - users waiting too long",
            })
        rtf = metrics["daily_summary"]["avg_rtf"]
        if rtf > 0.3:
            recs.append({
                "action": "Consider INT8 quantization",
                "reason": f"RTF {rtf} - can improve to ~{rtf*0.6:.2f} with INT8",
                "tradeoff": "Slight accuracy reduction (~1-2% WER increase)",
            })
        cost = metrics["cost_analysis"]["cost_per_audio_hour"]
        if cost > 2.0:
            recs.append({
                "action": "Use spot instances for batch workloads",
                # Bug fix: the f-string had lost its {cost} placeholder
                # (encoding damage); restore the actual value.
                "reason": f"Cost ${cost}/audio-hour - spot can reduce 60-70%",
            })
        return recs
# Demo: print the dashboard snapshot and derived recommendations.
monitor = WhisperMonitoring()
metrics = monitor.dashboard_metrics()
print(f"Daily: {metrics['daily_summary']['total_requests']} requests, {metrics['daily_summary']['total_audio_hours']}h audio")
# Bug fix: this f-string had lost all its placeholders ("Cost: /month
# (/audio-hour)" printed no numbers); restore values from the snapshot.
cost = metrics["cost_analysis"]
print(f"Cost: ${cost['monthly_cost']}/month (${cost['cost_per_audio_hour']}/audio-hour)")
recs = monitor.optimization_recommendations(metrics)
print(f"\nRecommendations: {len(recs)}")
for r in recs:
    print(f" - {r['action']}: {r['reason']}")
FAQ ??????????????????????????????????????????
Q: How accurate is Whisper for Thai speech?
A: Whisper large-v3 achieves roughly 8-12% WER (Word Error Rate) on clear Thai speech. Accuracy degrades with background noise (higher WER), strong dialects, mid-utterance code-switching, and domain-specific vocabulary, and very short utterances can be missed entirely. For production use large-v3 with the VAD filter; for real-time workloads where latency matters, medium is a reasonable trade-off (roughly 2x faster).
Q: How do faster-whisper and openai-whisper differ?
A: faster-whisper uses the CTranslate2 backend, which is about 4x faster than the PyTorch implementation and uses about 50% less memory, and it supports INT8 quantization. Transcription output is the same (same model weights). openai-whisper is the official PyTorch implementation — slower and more memory-hungry, but with extra features such as word-level timestamps. For production use faster-whisper; for development/testing openai-whisper is fine, or use WhisperX if you need word-level alignment and speaker diarization.
Q: CPU ????????? GPU ???????????????????????????????????????????????????????????? Whisper?
A: ?????????????????????????????? ???????????????????????? large-v3 model transcribe 1 ???????????? audio CPU (8-core): ?????????????????? 25 ???????????? (RTF 25x) GPU T4: ?????????????????? 30 ?????????????????? (RTF 0.5x) GPU A10G: ?????????????????? 15 ?????????????????? (RTF 0.25x) GPU A100: ?????????????????? 7 ?????????????????? (RTF 0.12x) GPU ???????????????????????? CPU 50-200 ???????????? ?????????????????? batch processing ?????????????????????????????? real-time ????????? CPU ????????? (?????????????????????) ???????????????????????????????????? ?????????????????? real-time ???????????? near real-time ????????????????????? GPU ???????????????????????? INT8 quantization ?????? RTF ????????? 40% ?????? GPU
Q: Is self-hosting Whisper worth it compared to cloud ASR services?
A: Self-hosted Whisper wins at volume. Cloud pricing: Google Speech-to-Text $0.006/15 sec = $1.44/audio-hour; AWS Transcribe $0.024/min = $1.44/audio-hour; Azure Speech $0.01-0.016/min = $0.60-0.96/audio-hour. Self-hosted Whisper on an A10G costs $1.006/hr and processes about 4 hours of audio per hour = ~$0.25/audio-hour, about 4-6x cheaper than cloud. Rule of thumb: above ~10 audio-hours/day self-hosting pays off; below ~5 hours/day a cloud API is simpler. Also favor self-hosting for data privacy (audio never leaves your network), model customization, and freedom from rate limits.
