SiamCafe.net Blog
Technology

TTS Coqui Metric Collection

tts coqui metric collection
TTS Coqui Metric Collection | SiamCafe Blog
2025-09-26· อ. บอม — SiamCafe.net· 8,748 คำ

Coqui TTS Metrics

TTS Coqui Text-to-Speech Metric Collection Voice Cloning VITS XTTS Tacotron2 Inference Latency RTF MOS Prometheus Grafana GPU Production

ModelQualitySpeedMulti-lingualVoice Cloneเหมาะกับ
VITSดีมากเร็วไม่ไม่Single Language
XTTS v2ดีเยี่ยมปานกลางใช่ 17 ภาษาใช่Multi-lingual Clone
Tacotron2ดีช้าไม่ไม่Research
Barkดีช้าใช่LimitedCreative Audio
Piperดีเร็วมากใช่ 30+ไม่Edge/Offline

TTS Setup

# === Coqui TTS Installation & Usage ===

# pip install TTS

# CLI Usage
# tts --text "Hello World" --model_name "tts_models/en/ljspeech/vits" --out_path output.wav
# tts --text "สวัสดีครับ" --model_name "tts_models/multilingual/multi-dataset/xtts_v2" \
#   --speaker_wav sample.wav --language_idx th --out_path output_th.wav

# Python API
# from TTS.api import TTS
#
# # Single Speaker TTS
# tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
# tts.tts_to_file(text="Hello, this is a test.", file_path="output.wav")
#
# # Multi-lingual + Voice Cloning (XTTS v2)
# tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
# tts.tts_to_file(
#     text="สวัสดีครับ นี่คือการทดสอบระบบสังเคราะห์เสียง",
#     file_path="output_th.wav",
#     speaker_wav="reference_voice.wav",
#     language="th",
# )

# TTS Server
# tts-server --model_name tts_models/en/ljspeech/vits --port 5002
# curl "http://localhost:5002/api/tts?text=Hello+World" -o output.wav

# FastAPI Custom Server
# from fastapi import FastAPI
# from fastapi.responses import StreamingResponse
# from TTS.api import TTS
# import io, soundfile as sf
#
# app = FastAPI()
# tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
#
# @app.post("/synthesize")
# async def synthesize(text: str, language: str = "en", speaker_wav: str = None):
#     wav = tts.tts(text=text, language=language, speaker_wav=speaker_wav)
#     buffer = io.BytesIO()
#     sf.write(buffer, wav, 22050, format='WAV')
#     buffer.seek(0)
#     return StreamingResponse(buffer, media_type="audio/wav")

from dataclasses import dataclass
import time

@dataclass
class TTSBenchmark:
    model: str
    text_length: int
    audio_duration: float
    inference_time: float
    rtf: float
    gpu_mem_mb: int

benchmarks = [
    TTSBenchmark("VITS", 50, 3.2, 0.15, 0.047, 512),
    TTSBenchmark("VITS", 200, 12.5, 0.52, 0.042, 512),
    TTSBenchmark("XTTS v2", 50, 3.5, 1.20, 0.343, 2048),
    TTSBenchmark("XTTS v2", 200, 14.0, 4.80, 0.343, 2048),
    TTSBenchmark("Tacotron2", 50, 3.2, 2.10, 0.656, 1024),
]

print("=== TTS Benchmarks (GPU) ===")
for b in benchmarks:
    print(f"  [{b.model}] Text: {b.text_length} chars | Audio: {b.audio_duration}s")
    print(f"    Inference: {b.inference_time}s | RTF: {b.rtf:.3f} | GPU: {b.gpu_mem_mb}MB")

Metric Collection

# === TTS Metrics with Prometheus ===

# pip install prometheus_client

# from prometheus_client import Counter, Histogram, Gauge, start_http_server
#
# # Metrics
# TTS_REQUESTS = Counter('tts_requests_total', 'Total TTS requests', ['model', 'language', 'status'])
# TTS_LATENCY = Histogram('tts_inference_seconds', 'TTS inference latency',
#     ['model'], buckets=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0])
# TTS_AUDIO_DURATION = Histogram('tts_audio_duration_seconds', 'Generated audio duration',
#     buckets=[1, 2, 5, 10, 30, 60])
# TTS_RTF = Histogram('tts_real_time_factor', 'Real-time factor', buckets=[0.01, 0.05, 0.1, 0.5, 1.0])
# TTS_GPU_UTILIZATION = Gauge('tts_gpu_utilization_percent', 'GPU utilization')
# TTS_GPU_MEMORY = Gauge('tts_gpu_memory_used_mb', 'GPU memory used')
# TTS_QUEUE_SIZE = Gauge('tts_request_queue_size', 'Pending requests in queue')
#
# # Instrumented TTS function
# def synthesize_with_metrics(text, model, language):
#     start = time.time()
#     try:
#         wav = tts.tts(text=text, language=language)
#         duration = len(wav) / 22050  # sample rate
#         latency = time.time() - start
#         rtf = latency / duration
#
#         TTS_REQUESTS.labels(model=model, language=language, status="success").inc()
#         TTS_LATENCY.labels(model=model).observe(latency)
#         TTS_AUDIO_DURATION.observe(duration)
#         TTS_RTF.observe(rtf)
#         return wav
#     except Exception as e:
#         TTS_REQUESTS.labels(model=model, language=language, status="error").inc()
#         raise
#
# start_http_server(9090)  # Prometheus metrics endpoint

@dataclass
class MetricDef:
    name: str
    metric_type: str
    unit: str
    alert_threshold: str
    dashboard: str

metrics = [
    MetricDef("tts_inference_seconds", "Histogram", "seconds", "p99 > 5s", "Latency Distribution"),
    MetricDef("tts_real_time_factor", "Histogram", "ratio", "RTF > 1.0", "Real-time Performance"),
    MetricDef("tts_requests_total", "Counter", "count", "error_rate > 5%", "Request Rate"),
    MetricDef("tts_audio_duration_seconds", "Histogram", "seconds", "avg > 30s", "Audio Length"),
    MetricDef("tts_gpu_utilization_percent", "Gauge", "%", "> 90%", "GPU Usage"),
    MetricDef("tts_gpu_memory_used_mb", "Gauge", "MB", "> 80% capacity", "GPU Memory"),
    MetricDef("tts_request_queue_size", "Gauge", "count", "> 50", "Queue Depth"),
]

print("\n=== TTS Metrics ===")
for m in metrics:
    print(f"  [{m.metric_type}] {m.name} ({m.unit})")
    print(f"    Alert: {m.alert_threshold} | Dashboard: {m.dashboard}")

Production Deployment

# === Production TTS Deployment ===

# Dockerfile
# FROM nvidia/cuda:12.1-runtime-ubuntu22.04
# RUN pip install TTS fastapi uvicorn prometheus_client
# COPY app/ /app/
# WORKDIR /app
# CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]

# Kubernetes — GPU Deployment
# apiVersion: apps/v1
# kind: Deployment
# spec:
#   replicas: 2
#   template:
#     spec:
#       containers:
#         - name: tts-server
#           image: tts-server:v1.0
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#               memory: 8Gi
#           ports:
#             - containerPort: 8080
#             - containerPort: 9090  # metrics

@dataclass
class DeployConfig:
    component: str
    config: str
    purpose: str
    scaling: str

configs = [
    DeployConfig("TTS Server", "2 replicas GPU T4", "Inference", "HPA on queue_size"),
    DeployConfig("Audio Cache", "Redis 4GB", "Cache generated audio", "Cluster mode"),
    DeployConfig("Load Balancer", "Nginx round-robin", "Distribute requests", "Auto"),
    DeployConfig("Prometheus", "15s scrape interval", "Collect metrics", "Single instance"),
    DeployConfig("Grafana", "TTS Dashboard", "Visualize metrics", "Single instance"),
    DeployConfig("Alert Manager", "Slack notification", "Alert on issues", "Single instance"),
]

print("Production Architecture:")
for c in configs:
    print(f"  [{c.component}] {c.config}")
    print(f"    Purpose: {c.purpose} | Scaling: {c.scaling}")

perf = {
    "Avg Latency (VITS)": "0.15s per request",
    "Avg Latency (XTTS)": "1.2s per request",
    "Throughput (VITS)": "50 req/s per GPU",
    "Throughput (XTTS)": "5 req/s per GPU",
    "Cache Hit Rate": "35% (repeated texts)",
    "Error Rate": "0.1%",
    "GPU Utilization": "72%",
    "Daily Requests": "250K",
}

print(f"\n\nProduction Performance:")
for k, v in perf.items():
    print(f"  {k}: {v}")

เคล็ดลับ

Coqui TTS คืออะไร

Open Source Text-to-Speech VITS XTTS Tacotron2 Voice Cloning Multi-lingual Pre-trained Fine-tuning Python CLI API Server GPU

เก็บ Metrics ของ TTS อย่างไร

Inference Latency RTF Real-time Factor MOS PESQ Throughput GPU Utilization Memory Error Rate Prometheus Grafana Dashboard Alert

Voice Cloning ทำอย่างไร

XTTS Model Audio ตัวอย่าง 6 วินาที+ ไม่มี Noise speaker_wav Zero-shot ไม่ต้อง Fine-tune ภาษาไทย XTTS v2 Multi-lingual

Deploy TTS Server อย่างไร

tts-server FastAPI Docker Kubernetes GPU Load Balancer Caching Redis Monitoring Latency Throughput Error Scale

สรุป

TTS Coqui Metric Collection Voice Cloning VITS XTTS Prometheus Grafana Inference Latency RTF GPU Docker Kubernetes Production Monitoring

📖 บทความที่เกี่ยวข้อง

TTS Coqui GreenOps Sustainabilityอ่านบทความ → Python Click CLI Metric Collectionอ่านบทความ → TTS Coqui Distributed Systemอ่านบทความ → TTS Coqui อ่านบทความ → TTS Coqui Progressive Deliveryอ่านบทความ →

📚 ดูบทความทั้งหมด →

script type="text/javascript"> var _Hasync= _Hasync|| []; _Hasync.push(['Histats.start', '1,4538569,4,0,0,0,00010000']); _Hasync.push(['Histats.fasi', '1']); _Hasync.push(['Histats.track_hits', '']); (function() { var hs = document.createElement('script'); hs.type = 'text/javascript'; hs.async = true; hs.src = ('//s10.histats.com/js15_as.js'); (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(hs); })();