TTS Coqui Metric Collection — ระบบ
Coqui TTS Metrics

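Coqui TTS เป็นระบบ Text-to-Speech แบบ Open Source ที่รองรับ Voice Cloning ด้วยโมเดล VITS, XTTS และ Tacotron2 บทความนี้อธิบายการเก็บ Metric ของระบบ เช่น Inference Latency, RTF และ MOS ด้วย Prometheus และ Grafana สำหรับการใช้งานบน GPU ในระดับ Production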
| Model | Quality | Speed | Multi-lingual | Voice Clone | เหมาะกับ |
|---|---|---|---|---|---|
| VITS | ดีมาก | เร็ว | ไม่ | ไม่ | Single Language |
| XTTS v2 | ดีเยี่ยม | ปานกลาง | ใช่ 17 ภาษา | ใช่ | Multi-lingual Clone |
| Tacotron2 | ดี | ช้า | ไม่ | ไม่ | Research |
| Bark | ดี | ช้า | ใช่ | Limited | Creative Audio |
| Piper | ดี | เร็วมาก | ใช่ 30+ | ไม่ | Edge/Offline |
TTS Setup
=== Coqui TTS Installation & Usage ===
pip install TTS
CLI Usage
tts --text "Hello World" --model_name "tts_models/en/ljspeech/vits" --out_path output.wav
tts --text "สวัสดีครับ" --model_name "tts_models/multilingual/multi-dataset/xtts_v2" \
--speaker_wav sample.wav --language_idx th --out_path output_th.wav
Python API
from TTS.api import TTS
# Single-speaker synthesis: load the English LJSpeech VITS model with
# gpu=True and write one sentence straight to a WAV file.
tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
tts.tts_to_file(file_path="output.wav", text="Hello, this is a test.")

# Multilingual synthesis with voice cloning (XTTS v2): a reference recording
# is passed as ``speaker_wav`` together with the target language code.
# NOTE(review): confirm that "th" is in the XTTS v2 supported-language list
# of the installed TTS version before relying on this example.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
tts.tts_to_file(
    language="th",
    speaker_wav="reference_voice.wav",
    file_path="output_th.wav",
    text="สวัสดีครับ นี่คือการทดสอบระบบสังเคราะห์เสียง",
)
TTS Server
เนื้อหาเกี่ยวข้อง — อ่านต่อ: CSS คืออะไร — คู่มือฉบับสมบูรณ์ 2026
tts-server --model_name tts_models/en/ljspeech/vits --port 5002
curl "http://localhost:5002/api/tts?text=Hello+World" -o output.wav
FastAPI Custom Server
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from TTS.api import TTS
แนะนำเพิ่มเติม — iCafeForex
import io, soundfile as sf
# ASGI application object; the route decorator on the handler registers against it.
app = FastAPI()
# The XTTS v2 model is instantiated once at import time (gpu=True) and is
# reused through this module-level name instead of being loaded per request.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
@app.post("/synthesize")
async def synthesize(text: str, language: str = "en", speaker_wav: str = None):
    """Synthesize ``text`` and return the audio as a WAV response.

    Args:
        text: Text to speak.
        language: Language code forwarded to the model (default ``"en"``).
        speaker_wav: Optional reference recording forwarded to the model.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` whose body is
        the in-memory WAV file.
    """
    # NOTE(review): tts.tts() is a synchronous call inside an async handler,
    # so the event loop is presumably blocked for the whole inference --
    # confirm whether this endpoint needs to serve concurrent requests.
    wav = tts.tts(text=text, language=language, speaker_wav=speaker_wav)

    # Write the WAV header with the model's own output rate rather than a
    # fixed 22050 Hz; a mismatched rate makes the audio play at the wrong
    # speed and pitch. 22050 is kept as the fallback when the rate is not
    # exposed.
    synthesizer = getattr(tts, "synthesizer", None)
    sample_rate = getattr(synthesizer, "output_sample_rate", None) or 22050

    buffer = io.BytesIO()
    sf.write(buffer, wav, sample_rate, format="WAV")
    buffer.seek(0)  # rewind so the response streams from the first byte
    return StreamingResponse(buffer, media_type="audio/wav")
from dataclasses import dataclass
import time
@dataclass
class TTSBenchmark:
    """One benchmark measurement for a TTS model run."""

    model: str  # model label shown in the report, e.g. "VITS"
    text_length: int  # input length, reported in characters
    audio_duration: float  # length of the generated audio, reported in seconds
    inference_time: float  # time spent on inference, reported in seconds
    rtf: float  # real-time factor, printed with three decimals
    gpu_mem_mb: int  # GPU memory, reported in MB
# Sample GPU measurements, one row per (model, text length) combination.
# Positional order: model, text_length, audio_duration, inference_time,
# rtf, gpu_mem_mb.
benchmarks = [
    TTSBenchmark("VITS", 50, 3.2, 0.15, 0.047, 512),
    TTSBenchmark("VITS", 200, 12.5, 0.52, 0.042, 512),
    TTSBenchmark("XTTS v2", 50, 3.5, 1.20, 0.343, 2048),
    TTSBenchmark("XTTS v2", 200, 14.0, 4.80, 0.343, 2048),
    TTSBenchmark("Tacotron2", 50, 3.2, 2.10, 0.656, 1024),
]

# Print a two-line summary per measurement.
print("=== TTS Benchmarks (GPU) ===")
for b in benchmarks:
    print(f" [{b.model}] Text: {b.text_length} chars | Audio: {b.audio_duration}s")
    print(f" Inference: {b.inference_time}s | RTF: {b.rtf:.3f} | GPU: {b.gpu_mem_mb}MB")
Metric Collection
=== TTS Metrics with Prometheus ===
pip install prometheus_client
from prometheus_client import Counter, Histogram, Gauge, start_http_server
# Prometheus metric objects for the TTS service.

# Request counter, labelled by model, language and outcome status.
TTS_REQUESTS = Counter(
    'tts_requests_total', 'Total TTS requests', ['model', 'language', 'status']
)

# Inference latency in seconds, labelled by model.
TTS_LATENCY = Histogram(
    'tts_inference_seconds',
    'TTS inference latency',
    ['model'],
    buckets=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0],
)

# Length of the generated audio in seconds.
TTS_AUDIO_DURATION = Histogram(
    'tts_audio_duration_seconds',
    'Generated audio duration',
    buckets=[1, 2, 5, 10, 30, 60],
)

# Real-time factor; the largest explicit bucket boundary is 1.0.
TTS_RTF = Histogram(
    'tts_real_time_factor',
    'Real-time factor',
    buckets=[0.01, 0.05, 0.1, 0.5, 1.0],
)

# Resource and backlog gauges.
TTS_GPU_UTILIZATION = Gauge('tts_gpu_utilization_percent', 'GPU utilization')
TTS_GPU_MEMORY = Gauge('tts_gpu_memory_used_mb', 'GPU memory used')
TTS_QUEUE_SIZE = Gauge('tts_request_queue_size', 'Pending requests in queue')
# Instrumented TTS function
def synthesize_with_metrics(text, model, language):
    """Run TTS inference and record Prometheus metrics for the request.

    Args:
        text: Text to synthesize.
        model: Value used for the ``model`` metric label. Inference itself
            always goes through the module-level ``tts`` instance.
        language: Language code passed to the model and used as a label.

    Returns:
        The waveform returned by ``tts.tts``.

    Raises:
        Exception: Anything raised by ``tts.tts`` is re-raised unchanged
            after the error counter has been incremented.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        wav = tts.tts(text=text, language=language)
    except Exception:
        TTS_REQUESTS.labels(model=model, language=language, status="error").inc()
        raise
    latency = time.perf_counter() - start

    # Use the model's own output rate when it is exposed; a fixed 22050 Hz
    # misreports duration (and therefore RTF) for models with another rate.
    synthesizer = getattr(tts, "synthesizer", None)
    sample_rate = getattr(synthesizer, "output_sample_rate", None) or 22050
    duration = len(wav) / sample_rate

    TTS_REQUESTS.labels(model=model, language=language, status="success").inc()
    TTS_LATENCY.labels(model=model).observe(latency)
    TTS_AUDIO_DURATION.observe(duration)
    # RTF is undefined for empty audio; skip it instead of dividing by zero.
    if duration > 0:
        TTS_RTF.observe(latency / duration)
    return wav
# Start the prometheus_client HTTP exporter on port 9090.
# NOTE(review): 9090 is also the Prometheus server's own default listen
# port -- confirm there is no clash if both run on the same host.
start_http_server(9090) # Prometheus metrics endpoint
เนื้อหาเกี่ยวข้อง — แนะนำให้อ่าน PHP Filament IoT Gateway
@dataclass
class MetricDef:
    """Catalogue entry describing one exported TTS metric."""

    name: str  # metric name, e.g. "tts_inference_seconds"
    metric_type: str  # "Histogram", "Counter" or "Gauge" in the sample rows
    unit: str  # unit label shown in the report, e.g. "seconds"
    alert_threshold: str  # human-readable alert condition, e.g. "p99 > 5s"
    dashboard: str  # title of the dashboard panel listed for this metric
# Catalogue of exported metrics with their alert condition and dashboard
# panel. Positional order: name, metric_type, unit, alert_threshold,
# dashboard.
metrics = [
    MetricDef("tts_inference_seconds", "Histogram", "seconds", "p99 > 5s", "Latency Distribution"),
    MetricDef("tts_real_time_factor", "Histogram", "ratio", "RTF > 1.0", "Real-time Performance"),
    MetricDef("tts_requests_total", "Counter", "count", "error_rate > 5%", "Request Rate"),
    MetricDef("tts_audio_duration_seconds", "Histogram", "seconds", "avg > 30s", "Audio Length"),
    MetricDef("tts_gpu_utilization_percent", "Gauge", "%", "> 90%", "GPU Usage"),
    MetricDef("tts_gpu_memory_used_mb", "Gauge", "MB", "> 80% capacity", "GPU Memory"),
    MetricDef("tts_request_queue_size", "Gauge", "count", "> 50", "Queue Depth"),
]

# Print a two-line summary per metric.
print("\n=== TTS Metrics ===")
for m in metrics:
    print(f" [{m.metric_type}] {m.name} ({m.unit})")
    print(f" Alert: {m.alert_threshold} | Dashboard: {m.dashboard}")
Production Deployment
# === Production TTS Deployment ===
# Dockerfile
# FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
# RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
# RUN pip install TTS fastapi uvicorn prometheus_client
# COPY app/ /app/
# WORKDIR /app
# CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]
# Kubernetes — GPU Deployment
# apiVersion: apps/v1
# kind: Deployment
# spec:
#   replicas: 2
#   template:
#     spec:
#       containers:
#         - name: tts-server
#           image: tts-server:v1.0
#           resources:
#             limits:
#               nvidia.com/gpu: 1
#               memory: 8Gi
#           ports:
#             - containerPort: 8080
#             - containerPort: 9090 # metrics
@dataclass
class DeployConfig:
    """One component of the production deployment and how it is run."""

    component: str  # component name, e.g. "TTS Server"
    config: str  # short configuration summary, e.g. "2 replicas GPU T4"
    purpose: str  # what the component is used for
    scaling: str  # scaling note, e.g. "HPA on queue_size"
# Components of the production stack.
# Positional order: component, config, purpose, scaling.
configs = [
    DeployConfig("TTS Server", "2 replicas GPU T4", "Inference", "HPA on queue_size"),
    DeployConfig("Audio Cache", "Redis 4GB", "Cache generated audio", "Cluster mode"),
    DeployConfig("Load Balancer", "Nginx round-robin", "Distribute requests", "Auto"),
    DeployConfig("Prometheus", "15s scrape interval", "Collect metrics", "Single instance"),
    DeployConfig("Grafana", "TTS Dashboard", "Visualize metrics", "Single instance"),
    DeployConfig("Alert Manager", "Slack notification", "Alert on issues", "Single instance"),
]

# Print a two-line summary per component.
print("Production Architecture:")
for c in configs:
    print(f" [{c.component}] {c.config}")
    print(f" Purpose: {c.purpose} | Scaling: {c.scaling}")
# Headline production figures, keyed by display label. Values are
# pre-formatted strings, so they are printed as-is.
perf = {
    "Avg Latency (VITS)": "0.15s per request",
    "Avg Latency (XTTS)": "1.2s per request",
    "Throughput (VITS)": "50 req/s per GPU",
    "Throughput (XTTS)": "5 req/s per GPU",
    "Cache Hit Rate": "35% (repeated texts)",
    "Error Rate": "0.1%",
    "GPU Utilization": "72%",
    "Daily Requests": "250K",
}

# Plain string: the heading has no placeholders, so no f-string is needed.
print("\n\nProduction Performance:")
for label, value in perf.items():
    print(f" {label}: {value}")
เคล็ดลับ
- VITS: ใช้ VITS สำหรับ Speed ใช้ XTTS สำหรับ Quality + Clone
- Cache: Cache Audio ที่สร้างแล้ว ลด Compute 30%+
- RTF: ดู Real-time Factor ต้องน้อยกว่า 1 สำหรับ Real-time
- GPU: ใช้ GPU สำหรับ Production CPU ช้ามากสำหรับ TTS
- Monitor: ดู Latency Queue Size GPU Memory ทุกวัน
Coqui TTS คืออะไร
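Coqui TTS คือไลบรารี Open Source สำหรับ Text-to-Speech ที่รองรับโมเดล VITS, XTTS และ Tacotron2 พร้อมความสามารถ Voice Cloning และ Multi-lingual มี Pre-trained Model ให้ใช้งาน รองรับ Fine-tuning และเรียกใช้ได้ผ่าน Python API, CLI และ Server โดยใช้ GPU ช่วยเร่งความเร็วได้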





