Coqui TTS คืออะไร
Coqui TTS เป็น open source Text-to-Speech library ที่ให้สร้างเสียงพูดจากข้อความ รองรับหลายภาษาและหลาย voices สร้างโดย Coqui AI ใช้ deep learning models เช่น Tacotron2, VITS, GlowTTS สำหรับ speech synthesis คุณภาพสูง
จุดเด่นของ Coqui TTS ได้แก่ Open source ใช้ฟรี มี pre-trained models หลายภาษา, Multi-speaker support สร้างเสียงหลาย speakers ใน model เดียว, Voice cloning สร้าง voice จาก audio sample สั้นๆ, Customizable train model ด้วย data ของตัวเอง, Multiple architectures VITS, Tacotron2, GlowTTS, YourTTS
Capacity Planning สำหรับ TTS สำคัญมากเพราะ TTS inference ใช้ GPU resources สูง ต้องวางแผนว่าจะ serve ได้กี่ requests per second, latency ที่ยอมรับได้, จำนวน concurrent users, storage สำหรับ models และ audio output, cost optimization ระหว่าง quality กับ performance
ติดตั้งและเริ่มใช้งาน Coqui TTS
Setup Coqui TTS
# === Coqui TTS Installation ===
# NOTE(review): the TTS wheel targets specific Python versions and some models
# also need the espeak-ng system package — confirm the environment first.
# 1. Install via pip
pip install TTS
# 2. List Available Models
tts --list_models
# Output includes:
# tts_models/en/ljspeech/tacotron2-DDC
# tts_models/en/ljspeech/vits
# tts_models/multilingual/multi-dataset/your_tts
# tts_models/en/vctk/vits
# tts_models/multilingual/multi-dataset/xtts_v2
# 3. Generate Speech (CLI)
tts --text "Hello, this is a test of Coqui TTS" \
--model_name "tts_models/en/ljspeech/vits" \
--out_path output.wav
# 4. Generate Speech (Python)
# The heredoc below is written verbatim to tts_demo.py and then executed.
cat > tts_demo.py << 'PYEOF'
from TTS.api import TTS
# Initialize TTS
tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
# Generate speech
tts.tts_to_file(
text="Welcome to the text to speech demo",
file_path="output.wav"
)
# Multi-speaker model
tts_multi = TTS(model_name="tts_models/en/vctk/vits", gpu=True)
tts_multi.tts_to_file(
text="Hello from speaker p225",
speaker="p225",
file_path="output_p225.wav"
)
# Voice cloning with XTTS
tts_clone = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
tts_clone.tts_to_file(
text="This is voice cloning in action",
speaker_wav="reference_audio.wav",
language="en",
file_path="cloned_output.wav"
)
print("TTS generation complete")
PYEOF
python3 tts_demo.py
# 5. TTS Server
# This blocks the shell; run in the background or a second terminal before
# issuing the API call below.
tts-server --model_name "tts_models/en/ljspeech/vits" \
--port 5002 \
--use_cuda true
# API: POST http://localhost:5002/api/tts?text=Hello+World
# 6. Docker Deployment
cat > Dockerfile << 'EOF'
FROM python:3.10-slim
RUN pip install TTS
EXPOSE 5002
CMD ["tts-server", "--model_name", "tts_models/en/ljspeech/vits", "--port", "5002"]
EOF
docker build -t coqui-tts .
# NOTE(review): python:3.10-slim ships no CUDA runtime; `--gpus all` presumably
# requires an nvidia/cuda-based image (and --use_cuda in CMD) to actually use
# the GPU — verify.
docker run -p 5002:5002 --gpus all coqui-tts
echo "Coqui TTS installed"
Training Custom Voice Models
Train custom TTS model
#!/usr/bin/env python3
# train_tts.py — Custom TTS Model Training
import json
import logging
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("train")
class TTSTrainingPipeline:
    """Plan a custom Coqui TTS training run.

    Pure planning helper: every method returns a plain dict describing
    dataset requirements, per-architecture training configuration, or
    rough cloud GPU cost estimates. Nothing here touches the filesystem.
    """

    def __init__(self):
        # Reserved for future per-run configuration overrides; currently unused.
        self.config = {}

    def prepare_dataset(self, audio_dir, metadata_file):
        """Prepare dataset for TTS training.

        Args:
            audio_dir: Directory expected to contain the training WAV files.
            metadata_file: Path to the LJSpeech-format metadata CSV.

        Returns:
            Dict with dataset ``requirements``, ordered ``preprocessing``
            steps, the expected ``dataset_structure``, and the echoed
            input paths.
        """
        requirements = {
            "audio_format": "WAV, 22050 Hz, mono, 16-bit",
            "min_hours": 5,
            "recommended_hours": "10-20 hours for good quality",
            "metadata_format": "LJSpeech format: filename|text|normalized_text",
            "audio_quality": "Clean, no background noise, consistent volume",
            "text_quality": "Accurate transcriptions, proper punctuation",
        }
        preprocessing_steps = [
            "1. Normalize audio (volume, sample rate)",
            "2. Remove silence at start/end",
            "3. Split long recordings into 5-15 second segments",
            "4. Verify text-audio alignment",
            "5. Remove bad quality samples",
            "6. Create train/eval splits (95/5)",
        ]
        return {
            "requirements": requirements,
            "preprocessing": preprocessing_steps,
            "dataset_structure": {
                "wavs/": "Directory containing audio files",
                "metadata.csv": "filename|text|normalized_text",
            },
            # Fix: these two arguments were previously accepted but never
            # used; echo them back so callers can log what was planned.
            "audio_dir": audio_dir,
            "metadata_file": metadata_file,
        }

    def training_config(self, model_type="vits"):
        """Generate training configuration.

        Args:
            model_type: "vits" or "tacotron2"; any unrecognized value
                falls back to the VITS configuration.

        Returns:
            Dict with model info, ``training_params`` and
            ``gpu_requirements`` for the chosen architecture.
        """
        configs = {
            "vits": {
                "model": "VITS",
                "description": "End-to-end, high quality, fast inference",
                "training_params": {
                    "batch_size": 32,
                    "learning_rate": 0.0002,
                    "epochs": 1000,
                    "grad_clip": 1.0,
                    "scheduler": "ExponentialLR",
                },
                "gpu_requirements": {
                    "min_vram_gb": 8,
                    "recommended_vram_gb": 16,
                    "training_time_10h_data": "24-48 hours on A100",
                },
            },
            "tacotron2": {
                "model": "Tacotron2 + HiFi-GAN",
                "description": "Two-stage, proven quality",
                "training_params": {
                    "batch_size": 64,
                    "learning_rate": 0.001,
                    "epochs": 500,
                    "grad_clip": 5.0,
                },
                "gpu_requirements": {
                    "min_vram_gb": 6,
                    "recommended_vram_gb": 12,
                    "training_time_10h_data": "36-72 hours on A100",
                },
            },
        }
        return configs.get(model_type, configs["vits"])

    def estimate_training_cost(self, hours_data, model_type="vits"):
        """Estimate cloud training cost.

        Args:
            hours_data: Hours of speech data in the training set.
            model_type: Reserved for per-architecture estimates; the
                current heuristic ignores it.

        Returns:
            Mapping of provider name to GPU type, estimated GPU-hours,
            and estimated cost (USD).
        """
        # Heuristic: ~5 GPU-hours of training per hour of speech data.
        gpu_hours = hours_data * 5  # rough estimate
        costs = {
            "aws_p3_xlarge": {"hourly": 3.06, "gpu": "V100 16GB"},
            "aws_p4d_xlarge": {"hourly": 32.77, "gpu": "A100 40GB"},
            "gcp_a100": {"hourly": 3.67, "gpu": "A100 40GB"},
            "lambda_a100": {"hourly": 1.10, "gpu": "A100 40GB"},
        }
        estimates = {}
        for provider, info in costs.items():
            estimates[provider] = {
                "gpu": info["gpu"],
                "estimated_hours": gpu_hours,
                "estimated_cost": round(gpu_hours * info["hourly"], 2),
            }
        return estimates
# Demo: walk through dataset prep, training config, and cost estimation.
trainer = TTSTrainingPipeline()
plan = trainer.prepare_dataset("/data/audio", "metadata.csv")
print("Requirements:", json.dumps(plan["requirements"], indent=2))
vits_cfg = trainer.training_config("vits")
print("\nVITS Config:", json.dumps(vits_cfg["gpu_requirements"], indent=2))
cloud_estimates = trainer.estimate_training_cost(hours_data=10)
print("\nCost Estimates:", json.dumps(cloud_estimates, indent=2))
Capacity Planning สำหรับ TTS
วางแผน capacity สำหรับ TTS service
#!/usr/bin/env python3
# capacity_planning.py — TTS Capacity Planning
import json
import math
import logging
from typing import Dict
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
class TTSCapacityPlanner:
    """Estimate throughput, fleet size, and storage for a TTS service."""

    def __init__(self):
        # Measured real-time factor (RTF), latency to synthesize 1s of audio,
        # and VRAM footprint for each model/hardware combination.
        self.benchmarks = {
            "vits_gpu_a100": {"rtf": 0.02, "latency_1s_ms": 50, "vram_gb": 2},
            "vits_gpu_t4": {"rtf": 0.08, "latency_1s_ms": 180, "vram_gb": 2},
            "vits_cpu": {"rtf": 0.5, "latency_1s_ms": 1200, "vram_gb": 0},
            "xtts_gpu_a100": {"rtf": 0.1, "latency_1s_ms": 250, "vram_gb": 4},
            "xtts_gpu_t4": {"rtf": 0.4, "latency_1s_ms": 900, "vram_gb": 4},
            "tacotron2_gpu_a100": {"rtf": 0.05, "latency_1s_ms": 120, "vram_gb": 3},
        }

    def estimate_throughput(self, model, avg_text_length_chars=200):
        """Estimate single-instance requests per second for *model*."""
        bench = self.benchmarks.get(model)
        if bench is None:
            return {"error": f"Unknown model: {model}"}
        # Rough speech rate: ~15 characters of text per second of audio.
        audio_seconds = avg_text_length_chars / 15
        synth_seconds = audio_seconds * bench["rtf"]
        rps = 1 / synth_seconds if synth_seconds > 0 else 0
        return {
            "model": model,
            "avg_text_chars": avg_text_length_chars,
            "avg_audio_duration_s": round(audio_seconds, 1),
            "generation_time_s": round(synth_seconds, 3),
            "requests_per_second": round(rps, 2),
            "requests_per_minute": round(rps * 60, 0),
            "latency_ms": round(synth_seconds * 1000, 0),
            "vram_gb": bench["vram_gb"],
        }

    def plan_infrastructure(self, peak_rps, model, redundancy=1.5):
        """Size a fleet (instances, VRAM, monthly cost) for a target peak RPS."""
        per_instance = self.estimate_throughput(model)
        if "error" in per_instance:
            return per_instance
        instance_rps = per_instance["requests_per_second"]
        fleet_size = math.ceil(peak_rps / instance_rps * redundancy)
        # Cost estimation (monthly)
        gpu_costs = {
            "a100": {"hourly": 3.67, "monthly": 2642},
            "t4": {"hourly": 0.35, "monthly": 252},
            "cpu": {"hourly": 0.10, "monthly": 72},
        }
        # Infer hardware tier from the benchmark key naming convention.
        if "a100" in model:
            gpu_type = "a100"
        elif "t4" in model:
            gpu_type = "t4"
        else:
            gpu_type = "cpu"
        tier = gpu_costs[gpu_type]
        return {
            "target_rps": peak_rps,
            "model": model,
            "single_instance_rps": instance_rps,
            "instances_needed": fleet_size,
            "redundancy_factor": redundancy,
            "total_vram_gb": per_instance["vram_gb"] * fleet_size,
            "monthly_cost": round(tier["monthly"] * fleet_size, 2),
            "gpu_type": gpu_type,
            "latency_p50_ms": per_instance["latency_ms"],
        }

    def storage_estimation(self, daily_requests, avg_audio_duration_s=10, retention_days=30):
        """Estimate audio storage volume and S3 cost for the retention window."""
        # Uncompressed WAV at 22 kHz mono is roughly 170 KB per second.
        file_mb = avg_audio_duration_s * 0.17
        per_day_gb = daily_requests * file_mb / 1024
        retained_gb = per_day_gb * retention_days
        return {
            "daily_requests": daily_requests,
            "avg_audio_duration_s": avg_audio_duration_s,
            "daily_storage_gb": round(per_day_gb, 2),
            "monthly_storage_gb": round(retained_gb, 2),
            "storage_cost_s3_monthly": round(retained_gb * 0.023, 2),
        }
# Demo: throughput for one instance, a 10 RPS fleet plan, and storage needs.
capacity = TTSCapacityPlanner()
per_instance = capacity.estimate_throughput("vits_gpu_a100", avg_text_length_chars=200)
print("Throughput:", json.dumps(per_instance, indent=2))
fleet_plan = capacity.plan_infrastructure(peak_rps=10, model="vits_gpu_t4")
print("\nInfra Plan:", json.dumps(fleet_plan, indent=2))
storage_plan = capacity.storage_estimation(daily_requests=50000)
print("\nStorage:", json.dumps(storage_plan, indent=2))
Scaling และ Optimization
Scale TTS service
# === TTS Scaling and Optimization ===
# 1. Kubernetes Deployment
# NOTE(review): assumes the k8s/ directory already exists and the cluster runs
# the NVIDIA device plugin (for nvidia.com/gpu resources) — verify before applying.
cat > k8s/tts-deployment.yaml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tts-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: tts
  template:
    metadata:
      labels:
        app: tts
    spec:
      containers:
      - name: tts
        image: coqui-tts:latest
        ports:
        - containerPort: 5002
        resources:
          limits:
            nvidia.com/gpu: 1
            memory: "8Gi"
            cpu: "4"
          requests:
            nvidia.com/gpu: 1
            memory: "4Gi"
            cpu: "2"
        readinessProbe:
          httpGet:
            path: /health
            port: 5002
          initialDelaySeconds: 30
          periodSeconds: 10
        env:
        - name: MODEL_NAME
          value: "tts_models/en/ljspeech/vits"
---
apiVersion: v1
kind: Service
metadata:
  name: tts-service
spec:
  selector:
    app: tts
  ports:
  - port: 80
    targetPort: 5002
  type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tts-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tts-service
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Pods
    pods:
      metric:
        name: tts_queue_length
      target:
        type: AverageValue
        averageValue: "5"
EOF
# NOTE(review): the tts_queue_length Pods metric requires a custom metrics
# adapter (e.g. prometheus-adapter), and the readinessProbe path /health must
# actually be served by the container image — confirm both before rollout.
# 2. Optimization Techniques
# ===================================
# Model Optimization:
# - Use ONNX Runtime for faster inference
# - Quantize model (INT8) for 2-3x speedup
# - Use TensorRT for NVIDIA GPUs
# - Batch requests for better GPU utilization
# Caching:
# - Cache generated audio for repeated texts
# - Use Redis for cache with TTL
# - Hash text + voice params as cache key
# - Cache hit rate typically 10-30%
# Audio Optimization:
# - Stream audio (chunked response)
# - Use MP3/OGG instead of WAV (10x smaller)
# - Adjust sample rate (22050 vs 44100)
# 3. Request Queue with Redis
# ===================================
# Use async processing for long texts:
# 1. Client sends text -> gets job_id
# 2. Worker processes text -> stores audio
# 3. Client polls for result or gets webhook
# 4. Load Balancing
# ===================================
# - Round-robin for uniform requests
# - Least-connections for variable text lengths
# - Sticky sessions if using stateful models
# - Health checks to remove unhealthy instances
echo "Scaling configured"
Monitoring และ Cost Management
Monitor TTS service
#!/usr/bin/env python3
# tts_monitor.py — TTS Service Monitoring
import json
import logging
from datetime import datetime
from typing import Dict
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
class TTSMonitor:
    """Collect TTS service metrics and derive cost and alert reports.

    ``collect_metrics`` returns a static illustrative snapshot (the values
    are hard-coded sample data, not live measurements); the other methods
    compute reports from whatever metrics dict they are given.
    """

    def __init__(self):
        # History buffer; not yet populated by any method.
        self.metrics = []

    def collect_metrics(self):
        """Return a sample metrics snapshot (hard-coded demo values)."""
        return {
            # NOTE: datetime.utcnow() is deprecated since Python 3.12;
            # migrate to datetime.now(timezone.utc) when convenient.
            "timestamp": datetime.utcnow().isoformat(),
            "requests": {
                "total_today": 45000,
                "rps_current": 8.5,
                "rps_peak": 15.2,
                "queue_length": 3,
            },
            "latency": {
                "p50_ms": 180,
                "p95_ms": 450,
                "p99_ms": 1200,
                "avg_ms": 220,
            },
            "gpu": {
                "utilization_pct": 72,
                "memory_used_gb": 5.2,
                "memory_total_gb": 16,
                "temperature_c": 68,
            },
            "instances": {
                "running": 3,
                "healthy": 3,
                "unhealthy": 0,
            },
            "cache": {
                "hit_rate_pct": 22,
                "size_mb": 450,
                "entries": 12000,
            },
            "errors": {
                "total_today": 45,
                "rate_pct": 0.1,
                "types": {
                    "timeout": 20,
                    "oom": 5,
                    "invalid_input": 15,
                    "model_error": 5,
                },
            },
        }

    def cost_analysis(self, metrics):
        """Compute daily/monthly GPU cost and cost per request.

        Args:
            metrics: Snapshot as produced by ``collect_metrics``; reads
                instances.running and requests.total_today.

        Returns:
            Dict of cost figures plus a comparison against managed
            cloud TTS pricing.
        """
        instances = metrics["instances"]["running"]
        gpu_cost_hourly = 0.35  # T4 on-demand hourly rate
        daily_cost = instances * gpu_cost_hourly * 24
        monthly_cost = daily_cost * 30
        # max(..., 1) guards against division by zero on an idle day.
        cost_per_request = monthly_cost / max(metrics["requests"]["total_today"] * 30, 1)
        per_1k = round(cost_per_request * 1000, 4)
        return {
            "instances": instances,
            "daily_cost": round(daily_cost, 2),
            "monthly_cost": round(monthly_cost, 2),
            "cost_per_1000_requests": per_1k,
            "vs_cloud_tts": {
                "google_cloud_tts": "$4.00 per 1M chars",
                "aws_polly": "$4.00 per 1M chars",
                # Bug fix: the original f-string had lost its placeholder and
                # rendered as " per 1K requests" with no dollar amount.
                "coqui_self_hosted": f"${per_1k} per 1K requests",
                "savings_pct": "60-80% at scale",
            },
        }

    def capacity_alerts(self, metrics):
        """Evaluate threshold-based alerts on a metrics snapshot.

        Thresholds: GPU util > 85%, p99 > 2000 ms, queue > 10,
        error rate > 1.0%.
        """
        alerts = []
        if metrics["gpu"]["utilization_pct"] > 85:
            alerts.append({"severity": "high", "msg": f"GPU utilization {metrics['gpu']['utilization_pct']}%"})
        if metrics["latency"]["p99_ms"] > 2000:
            alerts.append({"severity": "high", "msg": f"P99 latency {metrics['latency']['p99_ms']}ms"})
        if metrics["requests"]["queue_length"] > 10:
            alerts.append({"severity": "medium", "msg": f"Queue length {metrics['requests']['queue_length']}"})
        if metrics["errors"]["rate_pct"] > 1.0:
            alerts.append({"severity": "high", "msg": f"Error rate {metrics['errors']['rate_pct']}%"})
        return {"alerts": alerts, "total": len(alerts)}
# Demo: pull a metrics snapshot, then derive cost and alert reports from it.
svc_monitor = TTSMonitor()
metrics = svc_monitor.collect_metrics()
print("Metrics:", json.dumps(metrics["latency"], indent=2))
cost_report = svc_monitor.cost_analysis(metrics)
print("\nCosts:", json.dumps(cost_report, indent=2))
alert_report = svc_monitor.capacity_alerts(metrics)
print("\nAlerts:", json.dumps(alert_report, indent=2))
FAQ คำถามที่พบบ่อย
Q: Coqui TTS กับ Google Cloud TTS เลือกใช้อย่างไร?
A: Coqui TTS (self-hosted) ข้อดี ฟรี ไม่มีค่า API, data privacy (ข้อมูลไม่ออกไปข้างนอก), customizable train voice models ได้, ไม่มี rate limits ข้อเสีย ต้อง manage infrastructure เอง, ต้องมี GPU, คุณภาพเสียงอาจด้อยกว่าบาง cloud services Google Cloud TTS ข้อดี คุณภาพเสียงดีมาก, ไม่ต้อง manage infra, หลายภาษา/voices ข้อเสีย $4-16/1M chars, data ส่งไป Google, limited customization สำหรับ high volume (100K+ requests/day) self-hosted ประหยัดกว่า 60-80%
Q: GPU ไหนเหมาะกับ TTS inference?
A: NVIDIA T4 เหมาะสุดสำหรับ inference ราคาถูก ($0.35/hr cloud) VRAM 16GB เพียงพอ performance ดีสำหรับ real-time A100 สำหรับ high-throughput ต้องการหลาย RPS หรือ batch processing training ด้วย RTX 3090/4090 สำหรับ on-premise ราคาดี VRAM 24GB เพียงพอสำหรับทั้ง training และ inference CPU inference ทำได้แต่ช้ากว่า GPU 5-10 เท่า เหมาะสำหรับ low-traffic use cases เท่านั้น
Q: VITS กับ XTTS v2 เลือกรุ่นไหน?
A: VITS เร็วกว่ามาก (RTF 0.02 บน A100) เหมาะสำหรับ real-time, latency-sensitive applications คุณภาพเสียงดี แต่ต้อง train สำหรับแต่ละ speaker XTTS v2 รองรับ voice cloning จาก audio sample สั้นๆ (6 วินาที) หลายภาษา zero-shot ไม่ต้อง train ใหม่ แต่ช้ากว่า (RTF 0.1-0.4) ใช้ VRAM มากกว่า เลือก VITS สำหรับ fixed voices ที่ต้องการ speed เลือก XTTS สำหรับ custom voices และ multilingual
Q: Capacity planning ควรคิดอะไรบ้าง?
A: คำนวณ peak RPS จาก traffic patterns (ปกติ peak สูงกว่า average 3-5 เท่า), เลือก model ที่เหมาะกับ latency requirement (real-time ต้อง RTF ต่ำกว่า 0.5), คำนวณ instances จาก peak RPS / single instance RPS x redundancy factor (1.5-2.0), เผื่อ GPU memory สำหรับ model loading, plan storage สำหรับ cached audio (ถ้ามี), monitor และ autoscale ตาม GPU utilization และ queue length, ทำ load test ก่อน production เพื่อ validate estimates
