TTS Coqui Observability Stack — Monitor ระบบ

Coqui TTS Observability

Coqui TTS Observability Stack Prometheus Grafana Loki Jaeger Metrics Logs Traces Alerting GPU Production Monitoring

เนื้อหาเกี่ยวข้อง — อ่านต่อ: Kubernetes Operator DNS Management

Pillar	Tool	TTS Metrics	Purpose
Metrics	Prometheus + Grafana	Latency RTF Throughput Error GPU	Dashboard Alert Trend
Logs	Loki + Promtail	Request Log Error Log Debug	Troubleshoot Search Query
Traces	Jaeger / Tempo	Request Flow Preprocessing Inference Postprocessing	Bottleneck Analysis
Alerting	Alertmanager	P1 P2 P3 Thresholds	Slack PagerDuty Email

TTS Server with Metrics

# === Coqui TTS Server with Prometheus Metrics ===

# from TTS.api import TTS
# from prometheus_client import Counter, Histogram, Gauge, start_http_server
# from fastapi import FastAPI
# import time
#
# app = FastAPI()
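# tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
# start_http_server(8001)  # expose the metrics below on :8001/metrics so Prometheus can scrape them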
#
# # Prometheus Metrics
# REQUEST_COUNT = Counter('tts_requests_total', 'Total TTS requests', ['status'])
# SYNTHESIS_DURATION = Histogram('tts_synthesis_duration_seconds', 'Synthesis duration',
#     buckets=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0])
# AUDIO_DURATION = Histogram('tts_audio_duration_seconds', 'Generated audio duration',
#     buckets=[1, 2, 5, 10, 30, 60])
# QUEUE_LENGTH = Gauge('tts_queue_length', 'Current queue length')
# GPU_MEMORY = Gauge('tts_gpu_memory_bytes', 'GPU memory usage')
#
# @app.post("/synthesize")
# async def synthesize(text: str):
#     QUEUE_LENGTH.inc()
#     start = time.time()
#     try:
#         wav = tts.tts(text=text)
#         duration = time.time() - start
#         audio_len = len(wav) / 22050  # sample rate
#         SYNTHESIS_DURATION.observe(duration)
#         AUDIO_DURATION.observe(audio_len)
#         REQUEST_COUNT.labels(status="success").inc()
#         return {"audio_duration": audio_len, "rtf": duration / audio_len if audio_len else None}
#     except Exception:
#         REQUEST_COUNT.labels(status="error").inc()
#         raise
#     finally:
#         QUEUE_LENGTH.dec()

from dataclasses import dataclass

@dataclass
class TTSMetric:
    """One row of the Prometheus metric catalogue printed by this script."""

    metric: str  # metric name, printed inside the square brackets
    type_prom: str  # Prometheus metric type, printed after "Type:"
    labels: str  # free-text description of the metric's labels
    purpose: str  # free-text description of what the metric is for


# Catalogue rows; keyword arguments keep each column labelled at the call site.
metrics = [
    TTSMetric(
        metric="tts_requests_total",
        type_prom="Counter",
        labels="status=[success|error], model, language",
        purpose="นับจำนวน Request ทั้งหมด แยกตาม Status",
    ),
    TTSMetric(
        metric="tts_synthesis_duration_seconds",
        type_prom="Histogram",
        labels="model, language",
        purpose="เวลาสร้างเสียง P50 P95 P99",
    ),
    TTSMetric(
        metric="tts_audio_duration_seconds",
        type_prom="Histogram",
        labels="model, language",
        purpose="ความยาว Audio ที่สร้าง ใช้คำนวณ RTF",
    ),
    TTSMetric(
        metric="tts_queue_length",
        type_prom="Gauge",
        labels="instance",
        purpose="จำนวน Request ที่รออยู่ใน Queue",
    ),
    TTSMetric(
        metric="tts_gpu_memory_bytes",
        type_prom="Gauge",
        labels="instance, gpu_id",
        purpose="GPU Memory ที่ใช้ ตรวจ OOM",
    ),
    TTSMetric(
        metric="tts_errors_total",
        type_prom="Counter",
        labels="error_type=[oom|timeout|invalid_input]",
        purpose="นับ Error แยกประเภท หา Pattern",
    ),
]

# Report: a banner, then three lines per metric emitted by a single print call.
print("=== TTS Prometheus Metrics ===")
for m in metrics:
    print(
        f"  [{m.metric}] Type: {m.type_prom}",
        f"    Labels: {m.labels}",
        f"    Purpose: {m.purpose}",
        sep="\n",
    )

Grafana Dashboard

# === Grafana Dashboard Panels ===

# Prometheus Queries (PromQL)
# RTF (Real-time Factor):
#   rate(tts_synthesis_duration_seconds_sum[5m]) /
#   rate(tts_audio_duration_seconds_sum[5m])
#
# Throughput (req/min), summed across the per-status series:
#   sum(rate(tts_requests_total[5m])) * 60
#
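# Error Rate (%) — aggregate both sides first; without sum() the division only
# pairs series with identical labels (error / error) and always yields 100:
#   sum(rate(tts_requests_total{status="error"}[5m])) /
#   sum(rate(tts_requests_total[5m])) * 100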
#
# P99 Latency:
#   histogram_quantile(0.99, rate(tts_synthesis_duration_seconds_bucket[5m]))

@dataclass
class DashPanel:
    """One Grafana dashboard panel as printed by this script."""

    panel: str  # panel title, printed inside the square brackets
    promql: str  # PromQL expression for the panel, printed after "PromQL:"
    viz_type: str  # description of the Grafana visualisation
    alert_threshold: str  # human-readable alert thresholds


# Ratio and throughput queries aggregate with sum(...) first. tts_requests_total
# carries a `status` label, and PromQL binary operators only pair series whose
# label sets are identical, so an un-aggregated error/total division would match
# the error series against itself and always report 100%.
panels = [
    DashPanel(
        panel="Synthesis Latency P50/P95/P99",
        promql="histogram_quantile(0.99, rate(tts_synthesis_duration_seconds_bucket[5m]))",
        viz_type="Time Series (3 lines: P50 P95 P99)",
        alert_threshold="P99 > 5s → Warning | P99 > 10s → Critical",
    ),
    DashPanel(
        panel="Real-time Factor (RTF)",
        # Seconds spent synthesising per second of audio produced.
        promql=(
            "sum(rate(tts_synthesis_duration_seconds_sum[5m])) / "
            "sum(rate(tts_audio_duration_seconds_sum[5m]))"
        ),
        viz_type="Gauge (green < 0.5, yellow < 1.0, red > 1.0)",
        alert_threshold="RTF > 1.0 → Warning (ช้ากว่า Realtime)",
    ),
    DashPanel(
        panel="Throughput (req/min)",
        # Summed so the panel shows one total instead of one series per status.
        promql="sum(rate(tts_requests_total[5m])) * 60",
        viz_type="Time Series + Stat (current)",
        alert_threshold="< 1 req/min → Check Server Health",
    ),
    DashPanel(
        panel="Error Rate (%)",
        promql=(
            'sum(rate(tts_requests_total{status="error"}[5m])) / '
            "sum(rate(tts_requests_total[5m])) * 100"
        ),
        viz_type="Gauge (green < 1%, yellow < 5%, red > 10%)",
        alert_threshold="> 5% → Warning | > 10% → Critical",
    ),
    DashPanel(
        panel="GPU Memory Usage",
        # NOTE(review): gpu_total_bytes is not among the metrics this file
        # defines — confirm which exporter provides total GPU memory.
        promql="tts_gpu_memory_bytes / gpu_total_bytes * 100",
        viz_type="Gauge + Time Series",
        alert_threshold="> 90% → Warning | > 95% → Critical OOM Risk",
    ),
    DashPanel(
        panel="Queue Length",
        promql="tts_queue_length",
        viz_type="Time Series + Stat",
        alert_threshold="> 20 → Warning | > 50 → Scale Instance",
    ),
]

# Report: a banner, then four lines per panel.
print("=== Grafana Dashboard ===")
for p in panels:
    print(f"  [{p.panel}]")
    print(f"    PromQL: {p.promql}")
    print(f"    Viz: {p.viz_type}")
    print(f"    Alert: {p.alert_threshold}")

Alerting & Runbook

# === Alert Rules & Runbook ===

@dataclass
class AlertRule:
    """One alert definition together with its on-call runbook text."""

    alert: str  # alert name, printed inside the square brackets
    severity: str  # severity text, printed after "Severity:"
    condition: str  # human-readable firing condition
    runbook: str  # numbered response steps as a single line of text


# Alert table; keyword arguments keep each column labelled at the call site.
alerts = [
    AlertRule(
        alert="TTS Server Down",
        severity="P1 Critical",
        condition="up{job='tts'} == 0 for 1m",
        runbook="1. Check Container: docker ps 2. Check GPU: nvidia-smi 3. Restart: docker restart tts 4. Check Logs: docker logs tts",
    ),
    AlertRule(
        alert="High Error Rate",
        severity="P1 Critical",
        condition="error_rate > 10% for 5m",
        runbook="1. Check Logs: grep ERROR 2. Check GPU OOM 3. Check Input Validation 4. Restart if OOM",
    ),
    AlertRule(
        alert="High Latency",
        severity="P2 Warning",
        condition="P99 > 5s for 10m",
        runbook="1. Check Queue Length 2. Check GPU Util 3. Scale Instance 4. Reduce Batch Size",
    ),
    AlertRule(
        alert="RTF > 1.0",
        severity="P2 Warning",
        condition="rtf > 1.0 for 10m",
        runbook="1. Check GPU Usage 2. Use Faster Model 3. Reduce Max Text Length 4. Scale GPU",
    ),
    AlertRule(
        alert="GPU Memory High",
        severity="P2 Warning",
        condition="gpu_memory > 90% for 5m",
        runbook="1. Check Concurrent Requests 2. Reduce Batch 3. Use Smaller Model 4. Add GPU",
    ),
    AlertRule(
        alert="Queue Backlog",
        severity="P2 Warning",
        condition="queue_length > 50 for 5m",
        runbook="1. Scale Instance 2. Add Load Balancer 3. Rate Limit Clients 4. Increase Timeout",
    ),
]

# Report: a banner, then three lines per alert emitted by a single print call.
print("=== Alert Rules ===")
for a in alerts:
    print(
        f"  [{a.alert}] Severity: {a.severity}",
        f"    Condition: {a.condition}",
        f"    Runbook: {a.runbook}",
        sep="\n",
    )

เคล็ดลับ

RTF: เฝ้าดู RTF ต้อง < 1.0 เสมอสำหรับ Real-time Use
GPU: ใช้ GPU เสมอสำหรับ Production เพราะ CUDA เร็วกว่า CPU ประมาณ 10-50 เท่า
Queue: ตั้ง Queue Limit ป้องกัน Memory Overflow
Cache: Cache Audio ที่สร้างแล้ว ลด Synthesis ซ้ำ
Model: เลือก VITS สำหรับ คุณภาพดีที่สุด Tacotron2 สำหรับ เร็ว

Coqui TTS คืออะไร

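Coqui TTS เป็นไลบรารี Text-to-Speech แบบ Open Source ที่เขียนด้วย Python รองรับโมเดล VITS, Tacotron2 และ YourTTS พร้อมความสามารถ Voice Cloning, Multi-speaker, Multi-lingual, การเร่งความเร็วด้วย GPU (CUDA) และ Streaming API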

แนะนำเพิ่มเติม — บทวิเคราะห์จาก XM Signal

เนื้อหาเกี่ยวข้อง — อ่านต่อ: Redis Pub Sub Docker Container Deploy

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: Kubernetes CRD Scaling Strategy วิธี Scale