Coqui TTS Observability
Coqui TTS Observability Stack Prometheus Grafana Loki Jaeger Metrics Logs Traces Alerting GPU Production Monitoring
| Pillar | Tool | TTS Metrics | Purpose |
|---|---|---|---|
| Metrics | Prometheus + Grafana | Latency RTF Throughput Error GPU | Dashboard Alert Trend |
| Logs | Loki + Promtail | Request Log Error Log Debug | Troubleshoot Search Query |
| Traces | Jaeger / Tempo | Request Flow Preprocessing Inference Postprocessing | Bottleneck Analysis |
| Alerting | Alertmanager | P1 P2 P3 Thresholds | Slack PagerDuty Email |
TTS Server with Metrics
# === Coqui TTS Server with Prometheus Metrics ===
# from TTS.api import TTS
# from prometheus_client import Counter, Histogram, Gauge, start_http_server
# from fastapi import FastAPI
# import time
#
# app = FastAPI()
# tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
#
# # Prometheus Metrics
# REQUEST_COUNT = Counter('tts_requests_total', 'Total TTS requests', ['status'])
# SYNTHESIS_DURATION = Histogram('tts_synthesis_duration_seconds', 'Synthesis duration',
# buckets=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0])
# AUDIO_DURATION = Histogram('tts_audio_duration_seconds', 'Generated audio duration',
# buckets=[1, 2, 5, 10, 30, 60])
# QUEUE_LENGTH = Gauge('tts_queue_length', 'Current queue length')
# GPU_MEMORY = Gauge('tts_gpu_memory_bytes', 'GPU memory usage')
#
# @app.post("/synthesize")
# async def synthesize(text: str):
# QUEUE_LENGTH.inc()
# start = time.time()
# try:
# wav = tts.tts(text=text)
# duration = time.time() - start
# audio_len = len(wav) / 22050 # sample rate (hardcoded; prefer the model's output sample rate, e.g. tts.synthesizer.output_sample_rate — confirm attribute for your TTS version)
# SYNTHESIS_DURATION.observe(duration)
# AUDIO_DURATION.observe(audio_len)
# REQUEST_COUNT.labels(status="success").inc()
# return {"audio_duration": audio_len, "rtf": duration / audio_len}
# except Exception as e:
# REQUEST_COUNT.labels(status="error").inc()
# raise
# finally:
# QUEUE_LENGTH.dec()
from dataclasses import dataclass


@dataclass
class TTSMetric:
    """Definition of one Prometheus metric exported by the TTS server."""

    metric: str      # Prometheus metric name
    type_prom: str   # Prometheus metric type: Counter / Histogram / Gauge
    labels: str      # human-readable description of the label set
    purpose: str     # what the metric is monitored for


# Metric catalogue as (name, type, labels, purpose) rows, expanded into
# TTSMetric records below.
_METRIC_ROWS = [
    ("tts_requests_total",
     "Counter",
     "status=[success|error], model, language",
     "นับจำนวน Request ทั้งหมด แยกตาม Status"),
    ("tts_synthesis_duration_seconds",
     "Histogram",
     "model, language",
     "เวลาสร้างเสียง P50 P95 P99"),
    ("tts_audio_duration_seconds",
     "Histogram",
     "model, language",
     "ความยาว Audio ที่สร้าง ใช้คำนวณ RTF"),
    ("tts_queue_length",
     "Gauge",
     "instance",
     "จำนวน Request ที่รออยู่ใน Queue"),
    ("tts_gpu_memory_bytes",
     "Gauge",
     "instance, gpu_id",
     "GPU Memory ที่ใช้ ตรวจ OOM"),
    ("tts_errors_total",
     "Counter",
     "error_type=[oom|timeout|invalid_input]",
     "นับ Error แยกประเภท หา Pattern"),
]
metrics = [TTSMetric(*row) for row in _METRIC_ROWS]

print("=== TTS Prometheus Metrics ===")
for entry in metrics:
    print(f" [{entry.metric}] Type: {entry.type_prom}")
    print(f" Labels: {entry.labels}")
    print(f" Purpose: {entry.purpose}")
Grafana Dashboard
# === Grafana Dashboard Panels ===
# Prometheus Queries (PromQL)
# RTF (Real-time Factor):
# rate(tts_synthesis_duration_seconds_sum[5m]) /
# rate(tts_audio_duration_seconds_sum[5m])
#
# Throughput (req/min):
# rate(tts_requests_total[5m]) * 60
#
# Error Rate (%):
# sum(rate(tts_requests_total{status="error"}[5m])) /
# sum(rate(tts_requests_total[5m])) * 100
# (sum() aggregates across label values so the ratio is fleet-wide, not per-series)
#
# P99 Latency:
# histogram_quantile(0.99, rate(tts_synthesis_duration_seconds_bucket[5m]))
@dataclass
class DashPanel:
    """One Grafana dashboard panel: query, visualization and alert rule."""

    panel: str            # panel title
    promql: str           # PromQL query (may be shorthand/pseudo-query)
    viz_type: str         # visualization type and styling notes
    alert_threshold: str  # alert thresholds attached to the panel


# Panel catalogue as (title, promql, visualization, threshold) rows.
_PANEL_ROWS = [
    ("Synthesis Latency P50/P95/P99",
     "histogram_quantile(0.99, rate(tts_synthesis_duration_seconds_bucket[5m]))",
     "Time Series (3 lines: P50 P95 P99)",
     "P99 > 5s → Warning | P99 > 10s → Critical"),
    ("Real-time Factor (RTF)",
     "synthesis_sum / audio_sum",
     "Gauge (green < 0.5, yellow < 1.0, red > 1.0)",
     "RTF > 1.0 → Warning (ช้ากว่า Realtime)"),
    ("Throughput (req/min)",
     "rate(tts_requests_total[5m]) * 60",
     "Time Series + Stat (current)",
     "< 1 req/min → Check Server Health"),
    ("Error Rate (%)",
     "error_count / total_count * 100",
     "Gauge (green < 1%, yellow < 5%, red > 10%)",
     "> 5% → Warning | > 10% → Critical"),
    ("GPU Memory Usage",
     "tts_gpu_memory_bytes / gpu_total_bytes * 100",
     "Gauge + Time Series",
     "> 90% → Warning | > 95% → Critical OOM Risk"),
    ("Queue Length",
     "tts_queue_length",
     "Time Series + Stat",
     "> 20 → Warning | > 50 → Scale Instance"),
]
panels = [DashPanel(*row) for row in _PANEL_ROWS]

print("=== Grafana Dashboard ===")
for dashboard_panel in panels:
    print(f" [{dashboard_panel.panel}]")
    print(f" PromQL: {dashboard_panel.promql}")
    print(f" Viz: {dashboard_panel.viz_type}")
    print(f" Alert: {dashboard_panel.alert_threshold}")
Alerting & Runbook
# === Alert Rules & Runbook ===
@dataclass
class AlertRule:
    """One alerting rule with severity, firing condition and runbook steps."""

    alert: str     # alert name
    severity: str  # priority level (P1 Critical / P2 Warning / ...)
    condition: str # firing condition, in pseudo-PromQL
    runbook: str   # numbered remediation steps for the on-call engineer


# Alert catalogue as (name, severity, condition, runbook) rows.
_ALERT_ROWS = [
    ("TTS Server Down",
     "P1 Critical",
     "up{job='tts'} == 0 for 1m",
     "1. Check Container: docker ps 2. Check GPU: nvidia-smi 3. Restart: docker restart tts 4. Check Logs: docker logs tts"),
    ("High Error Rate",
     "P1 Critical",
     "error_rate > 10% for 5m",
     "1. Check Logs: grep ERROR 2. Check GPU OOM 3. Check Input Validation 4. Restart if OOM"),
    ("High Latency",
     "P2 Warning",
     "P99 > 5s for 10m",
     "1. Check Queue Length 2. Check GPU Util 3. Scale Instance 4. Reduce Batch Size"),
    ("RTF > 1.0",
     "P2 Warning",
     "rtf > 1.0 for 10m",
     "1. Check GPU Usage 2. Use Faster Model 3. Reduce Max Text Length 4. Scale GPU"),
    ("GPU Memory High",
     "P2 Warning",
     "gpu_memory > 90% for 5m",
     "1. Check Concurrent Requests 2. Reduce Batch 3. Use Smaller Model 4. Add GPU"),
    ("Queue Backlog",
     "P2 Warning",
     "queue_length > 50 for 5m",
     "1. Scale Instance 2. Add Load Balancer 3. Rate Limit Clients 4. Increase Timeout"),
]
alerts = [AlertRule(*row) for row in _ALERT_ROWS]

print("=== Alert Rules ===")
for rule in alerts:
    print(f" [{rule.alert}] Severity: {rule.severity}")
    print(f" Condition: {rule.condition}")
    print(f" Runbook: {rule.runbook}")
เคล็ดลับ
- RTF: เฝ้าดู RTF ต้อง < 1.0 เสมอสำหรับ Real-time Use
- GPU: ใช้ GPU เสมอสำหรับ Production CUDA เร็วกว่า CPU 10-50x
- Queue: ตั้ง Queue Limit ป้องกัน Memory Overflow
- Cache: Cache Audio ที่สร้างแล้ว ลด Synthesis ซ้ำ
- Model: เลือก VITS สำหรับ คุณภาพดีที่สุด Tacotron2 สำหรับ เร็ว
Coqui TTS คืออะไร
Open Source Text-to-Speech Python VITS Tacotron2 YourTTS Voice Cloning Multi-speaker Multi-lingual GPU CUDA Streaming API
Observability Stack คืออะไร
Metrics Prometheus Grafana Logs Loki Traces Jaeger Alerting Alertmanager 3 Pillars Dashboard Latency RTF Throughput Error GPU
Metrics ตั้งอย่างไร
prometheus_client Counter Histogram Gauge requests synthesis_duration audio_duration queue gpu_memory RTF PromQL Grafana Panel
Alerting ตั้งอย่างไร
P1 Server Down Error 10% P2 Latency RTF GPU Queue P3 Error 1% Disk Slack PagerDuty Runbook Restart Scale Model Check
สรุป
Coqui TTS Observability Prometheus Grafana Loki Jaeger Metrics RTF Latency Throughput GPU Alerting Runbook Production Monitoring
