Coqui TTS Metrics
A practical guide to Coqui text-to-speech in production: model selection (VITS, XTTS v2, Tacotron2), voice cloning, metric collection (inference latency, RTF, MOS) with Prometheus and Grafana, and GPU deployment.
| Model | Quality | Speed | Multi-lingual | Voice Clone | Best For |
|---|---|---|---|---|---|
| VITS | Very good | Fast | No | No | Single language |
| XTTS v2 | Excellent | Medium | Yes (17 languages) | Yes | Multi-lingual + cloning |
| Tacotron2 | Good | Slow | No | No | Research |
| Bark | Good | Slow | Yes | Limited | Creative audio |
| Piper | Good | Very fast | Yes (30+) | No | Edge/offline |
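Before committing to a model, the full catalog of pre-trained models can be listed straight from the Python API (the CLI equivalent is `tts --list_models`):

from TTS.api import TTS

# Print every pre-trained model name in the catalog,
# e.g. tts_models/en/ljspeech/vits
print(TTS().list_models())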
TTS Setup
# === Coqui TTS Installation & Usage ===
# pip install TTS
# CLI Usage
# tts --text "Hello World" --model_name "tts_models/en/ljspeech/vits" --out_path output.wav
# tts --text "สวัสดีครับ" --model_name "tts_models/multilingual/multi-dataset/xtts_v2" \
# --speaker_wav sample.wav --language_idx th --out_path output_th.wav
# Python API
# from TTS.api import TTS
#
# # Single Speaker TTS
# tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
# tts.tts_to_file(text="Hello, this is a test.", file_path="output.wav")
#
# # Multi-lingual + Voice Cloning (XTTS v2)
# tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
# tts.tts_to_file(
# text="สวัสดีครับ นี่คือการทดสอบระบบสังเคราะห์เสียง",
# file_path="output_th.wav",
# speaker_wav="reference_voice.wav",
# language="th",
# )
# TTS Server
# tts-server --model_name tts_models/en/ljspeech/vits --port 5002
# curl "http://localhost:5002/api/tts?text=Hello+World" -o output.wav
# FastAPI Custom Server
# from fastapi import FastAPI
# from fastapi.responses import StreamingResponse
# from TTS.api import TTS
# import io, soundfile as sf
#
# app = FastAPI()
# tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
#
# @app.post("/synthesize")
# async def synthesize(text: str, language: str = "en", speaker_wav: str | None = None):
# wav = tts.tts(text=text, language=language, speaker_wav=speaker_wav)
# buffer = io.BytesIO()
# sf.write(buffer, wav, tts.synthesizer.output_sample_rate, format='WAV')  # XTTS v2 outputs 24 kHz, not 22.05
# buffer.seek(0)
# return StreamingResponse(buffer, media_type="audio/wav")
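A quick smoke test of the custom endpoint above, assuming the server runs locally on port 8080 and the requests library is installed:

# import requests
#
# # FastAPI reads plain function arguments from the query string, even on POST
# resp = requests.post(
#     "http://localhost:8080/synthesize",
#     params={"text": "Hello from the API", "language": "en"},
#     timeout=120,  # XTTS inference can take several seconds
# )
# resp.raise_for_status()
# with open("api_output.wav", "wb") as f:
#     f.write(resp.content)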
from dataclasses import dataclass
import time
@dataclass
class TTSBenchmark:
model: str
text_length: int
audio_duration: float
inference_time: float
rtf: float
gpu_mem_mb: int
benchmarks = [
TTSBenchmark("VITS", 50, 3.2, 0.15, 0.047, 512),
TTSBenchmark("VITS", 200, 12.5, 0.52, 0.042, 512),
TTSBenchmark("XTTS v2", 50, 3.5, 1.20, 0.343, 2048),
TTSBenchmark("XTTS v2", 200, 14.0, 4.80, 0.343, 2048),
TTSBenchmark("Tacotron2", 50, 3.2, 2.10, 0.656, 1024),
]
print("=== TTS Benchmarks (GPU) ===")
for b in benchmarks:
print(f" [{b.model}] Text: {b.text_length} chars | Audio: {b.audio_duration}s")
print(f" Inference: {b.inference_time}s | RTF: {b.rtf:.3f} | GPU: {b.gpu_mem_mb}MB")
Metric Collection
# === TTS Metrics with Prometheus ===
# pip install prometheus_client
# from prometheus_client import Counter, Histogram, Gauge, start_http_server
#
# # Metrics
# TTS_REQUESTS = Counter('tts_requests_total', 'Total TTS requests', ['model', 'language', 'status'])
# TTS_LATENCY = Histogram('tts_inference_seconds', 'TTS inference latency',
# ['model'], buckets=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0])
# TTS_AUDIO_DURATION = Histogram('tts_audio_duration_seconds', 'Generated audio duration',
# buckets=[1, 2, 5, 10, 30, 60])
# TTS_RTF = Histogram('tts_real_time_factor', 'Real-time factor', buckets=[0.01, 0.05, 0.1, 0.5, 1.0])
# TTS_GPU_UTILIZATION = Gauge('tts_gpu_utilization_percent', 'GPU utilization')
# TTS_GPU_MEMORY = Gauge('tts_gpu_memory_used_mb', 'GPU memory used')
# TTS_QUEUE_SIZE = Gauge('tts_request_queue_size', 'Pending requests in queue')
#
# # Instrumented TTS function
# def synthesize_with_metrics(text, model, language):
# start = time.time()
# try:
# wav = tts.tts(text=text, language=language)
# duration = len(wav) / 22050 # samples / sample rate (22.05 kHz here; use 24000 for XTTS v2)
# latency = time.time() - start
# rtf = latency / duration
#
# TTS_REQUESTS.labels(model=model, language=language, status="success").inc()
# TTS_LATENCY.labels(model=model).observe(latency)
# TTS_AUDIO_DURATION.observe(duration)
# TTS_RTF.observe(rtf)
# return wav
# except Exception as e:
# TTS_REQUESTS.labels(model=model, language=language, status="error").inc()
# raise
#
# start_http_server(9090) # Prometheus metrics endpoint
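The GPU gauges above are declared but never updated; one way to feed them is a small NVML polling thread (a sketch assuming pynvml is installed and a single GPU at index 0):

# import threading, pynvml
#
# def poll_gpu(interval_s=5.0):
#     pynvml.nvmlInit()
#     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
#     while True:
#         util = pynvml.nvmlDeviceGetUtilizationRates(handle)
#         mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
#         TTS_GPU_UTILIZATION.set(util.gpu)             # percent
#         TTS_GPU_MEMORY.set(mem.used / (1024 * 1024))  # bytes -> MB
#         time.sleep(interval_s)
#
# threading.Thread(target=poll_gpu, daemon=True).start()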
@dataclass
class MetricDef:
name: str
metric_type: str
unit: str
alert_threshold: str
dashboard: str
metrics = [
MetricDef("tts_inference_seconds", "Histogram", "seconds", "p99 > 5s", "Latency Distribution"),
MetricDef("tts_real_time_factor", "Histogram", "ratio", "RTF > 1.0", "Real-time Performance"),
MetricDef("tts_requests_total", "Counter", "count", "error_rate > 5%", "Request Rate"),
MetricDef("tts_audio_duration_seconds", "Histogram", "seconds", "avg > 30s", "Audio Length"),
MetricDef("tts_gpu_utilization_percent", "Gauge", "%", "> 90%", "GPU Usage"),
MetricDef("tts_gpu_memory_used_mb", "Gauge", "MB", "> 80% capacity", "GPU Memory"),
MetricDef("tts_request_queue_size", "Gauge", "count", "> 50", "Queue Depth"),
]
print("\n=== TTS Metrics ===")
for m in metrics:
print(f" [{m.metric_type}] {m.name} ({m.unit})")
print(f" Alert: {m.alert_threshold} | Dashboard: {m.dashboard}")
Production Deployment
# === Production TTS Deployment ===
# Dockerfile
# FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
# RUN apt-get update && apt-get install -y python3 python3-pip
# RUN pip3 install TTS fastapi uvicorn prometheus_client
# COPY app/ /app/
# WORKDIR /app
# CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]
# Kubernetes — GPU Deployment
# apiVersion: apps/v1
# kind: Deployment
# spec:
# replicas: 2
# template:
# spec:
# containers:
# - name: tts-server
# image: tts-server:v1.0
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: 8Gi
# ports:
# - containerPort: 8080
# - containerPort: 9090 # metrics
@dataclass
class DeployConfig:
component: str
config: str
purpose: str
scaling: str
configs = [
DeployConfig("TTS Server", "2 replicas GPU T4", "Inference", "HPA on queue_size"),
DeployConfig("Audio Cache", "Redis 4GB", "Cache generated audio", "Cluster mode"),
DeployConfig("Load Balancer", "Nginx round-robin", "Distribute requests", "Auto"),
DeployConfig("Prometheus", "15s scrape interval", "Collect metrics", "Single instance"),
DeployConfig("Grafana", "TTS Dashboard", "Visualize metrics", "Single instance"),
DeployConfig("Alert Manager", "Slack notification", "Alert on issues", "Single instance"),
]
print("Production Architecture:")
for c in configs:
print(f" [{c.component}] {c.config}")
print(f" Purpose: {c.purpose} | Scaling: {c.scaling}")
perf = {
"Avg Latency (VITS)": "0.15s per request",
"Avg Latency (XTTS)": "1.2s per request",
"Throughput (VITS)": "50 req/s per GPU",
"Throughput (XTTS)": "5 req/s per GPU",
"Cache Hit Rate": "35% (repeated texts)",
"Error Rate": "0.1%",
"GPU Utilization": "72%",
"Daily Requests": "250K",
}
print(f"\n\nProduction Performance:")
for k, v in perf.items():
print(f" {k}: {v}")
Tips
- VITS: use VITS when speed matters; use XTTS when you need quality plus voice cloning
- Cache: cache generated audio to cut compute by 30%+ (see the Redis sketch above)
- RTF: watch the real-time factor; it must stay below 1.0 for real-time use
- GPU: run production TTS on a GPU; CPU inference is far too slow
- Monitor: review latency, queue size, and GPU memory daily
What is Coqui TTS?
An open-source text-to-speech toolkit with pre-trained models such as VITS, XTTS, and Tacotron2, supporting voice cloning, multi-lingual synthesis, and fine-tuning, usable from the Python API, the CLI, or the built-in server, with GPU acceleration.
How do you collect TTS metrics?
Track inference latency, real-time factor (RTF), quality scores such as MOS and PESQ, throughput, GPU utilization and memory, and error rate; export everything to Prometheus and visualize it with Grafana dashboards and alerts.
How does voice cloning work?
With an XTTS model, pass a clean, noise-free reference clip of 6+ seconds as speaker_wav; cloning is zero-shot, so no fine-tuning is needed, and XTTS v2 handles multi-lingual output including Thai.
How do you deploy a TTS server?
Use the built-in tts-server or a custom FastAPI app, containerized with Docker and scheduled on Kubernetes with GPUs, behind a load balancer with Redis caching, while monitoring latency, throughput, and errors to drive scaling.
Summary
Coqui TTS pairs strong open-source models (VITS for speed, XTTS for quality and voice cloning) with a production stack: Prometheus and Grafana for metric collection (inference latency, RTF, GPU usage), plus Docker/Kubernetes deployment with caching and monitoring.
