TTS Coqui Developer Experience DX — พัฒนา
Coqui TTS Developer Experience
Coqui TTS Text-to-Speech Python Deep Learning VITS Tacotron2 Voice Cloning Fine-tune API Server ONNX Production Multi-speaker
| Model | Quality | Speed | Use Case |
|---|---|---|---|
| VITS | สูงมาก (End-to-end) | เร็ว (Real-time) | Production แนะนำ |
| Tacotron2 + HiFi-GAN | สูง | ปานกลาง | High-quality TTS |
| Glow-TTS + HiFi-GAN | สูง | เร็ว | Low-latency TTS |
| FastSpeech2 | ปานกลาง-สูง | เร็วมาก | Batch Processing |
| YourTTS | สูง | ปานกลาง | Voice Cloning Zero-shot |
Quick Start
# === Coqui TTS Quick Start ===
# pip install TTS
# pip install TTS[server] # สำหรับ Demo Server
# CLI Usage
# tts --list_models # ดู Models ทั้งหมด
# tts --text "Hello World" \
# --model_name tts_models/en/ljspeech/vits \
# --out_path output.wav
#
# # Multi-speaker
# tts --text "Hello" \
# --model_name tts_models/en/vctk/vits \
# --speaker_idx "p225" \
# --out_path output.wav
#
# # Voice Cloning
# tts --text "Hello" \
# --model_name tts_models/multilingual/multi-dataset/your_tts \
# --speaker_wav reference.wav \
# --language_idx "en" \
# --out_path cloned.wav
# Python API
# from TTS.api import TTS
#
# # Single Speaker
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
# tts.tts_to_file(text="Hello World", file_path="output.wav")
#
# # Multi Speaker
# tts = TTS("tts_models/en/vctk/vits", gpu=True)
# tts.tts_to_file(text="Hello", file_path="out.wav", speaker="p225")
#
# # Voice Cloning
# tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
# tts.tts_to_file(text="Hello", file_path="cloned.wav",
# speaker_wav="reference.wav", language="en")
from dataclasses import dataclass
@dataclass
class TTSModel:
    """Catalog entry describing one pre-trained Coqui TTS model.

    Attributes:
        name: Human-readable label shown in the listing.
        model_id: Model identifier string, e.g. "tts_models/en/ljspeech/vits".
        languages: Supported language names as a single display string.
        speakers: Short description of speaker support (single, multi, cloning).
        quality: Free-text quality summary shown in the listing.
    """

    name: str
    model_id: str
    languages: str
    speakers: str
    quality: str
# Catalog of example pre-trained models; each entry is rendered by the loop below.
models = [
    TTSModel(
        "VITS (LJSpeech)",
        "tts_models/en/ljspeech/vits",
        "English",
        "Single",
        "สูงมาก End-to-end Fast",
    ),
    TTSModel(
        "VITS (VCTK)",
        "tts_models/en/vctk/vits",
        "English",
        "Multi (109 speakers)",
        "สูง Multi-speaker",
    ),
    TTSModel(
        "YourTTS",
        "tts_models/multilingual/multi-dataset/your_tts",
        "English Portuguese French",
        "Voice Cloning",
        "สูง Zero-shot Cloning",
    ),
    TTSModel(
        "Tacotron2-DDC",
        "tts_models/en/ljspeech/tacotron2-DDC",
        "English",
        "Single",
        "สูง Classic Architecture",
    ),
]

# Print a short summary for every catalog entry: label, id, languages/speakers, quality.
print("=== Available Models ===")
for m in models:
    print(f" [{m.name}]")
    print(f" ID: {m.model_id}")
    print(f" Lang: {m.languages} | Speakers: {m.speakers}")
    print(f" Quality: {m.quality}")
Fine-tuning
# === Fine-tune Coqui TTS ===
# Dataset Structure
# dataset/
# wavs/
# audio_001.wav (22050Hz Mono 16-bit)
# audio_002.wav
# metadata.csv
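# (ljspeech formatter อ่าน Text จาก Column ที่ 3: file_id|raw text|normalized text)
# audio_001|Hello this is a test|Hello this is a test
# audio_002|Another sentence here|Another sentence here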
# Training Config (config.json)
# {
# "model": "vits",
# "audio": {"sample_rate": 22050},
# "datasets": [{
# "formatter": "ljspeech",
# "path": "./dataset/",
# "meta_file_train": "metadata.csv"
# }],
# "training": {
# "batch_size": 32,
# "epochs": 1000,
# "lr": 0.0002,
# "print_step": 50,
# "save_step": 1000
# }
# }
# python -m TTS.bin.train_tts --config_path config.json
@dataclass
class TrainingTip:
    """One fine-tuning guideline paired with its typical pitfall and remedy.

    Attributes:
        aspect: Area of training the tip concerns (e.g. "Dataset Size").
        recommendation: The recommended practice for that area.
        common_mistake: A frequent mistake related to that area.
        fix: How to correct the mistake.
    """

    aspect: str
    recommendation: str
    common_mistake: str
    fix: str
# Fine-tuning guidelines; each entry is rendered by the loop below.
tips = [
    TrainingTip(
        "Dataset Size",
        "อย่างน้อย 2-5 ชั่วโมง Audio สำหรับ Fine-tune",
        "Audio น้อยเกินไป Model ไม่เรียนรู้",
        "ใช้ Data Augmentation หรือหา Audio เพิ่ม",
    ),
    TrainingTip(
        "Audio Quality",
        "22050Hz Mono 16-bit WAV ไม่มี Noise Background",
        "Audio มี Noise Model เรียนรู้ Noise ด้วย",
        "ใช้ Noise Reduction ก่อน Train เช่น RNNoise",
    ),
    TrainingTip(
        "Transcription",
        "ต้องตรงกับ Audio 100% Punctuation ถูกต้อง",
        "Text ไม่ตรง Audio Model สับสน",
        "ใช้ Whisper Transcribe แล้ว Manual Review",
    ),
    TrainingTip(
        "Learning Rate",
        "เริ่ม 0.0002 สำหรับ Fine-tune",
        "LR สูงเกินไป Model Diverge",
        "ใช้ LR Scheduler ลด LR เมื่อ Loss ไม่ลด",
    ),
    TrainingTip(
        "Batch Size",
        "32 (GPU 16GB) ลดถ้า OOM",
        "Batch Size ใหญ่เกิน GPU Memory",
        "ลด Batch Size หรือใช้ Gradient Accumulation",
    ),
]

# Print each tip as a heading line followed by its mistake and fix.
print("=== Training Tips ===")
for t in tips:
    print(f"\n [{t.aspect}] {t.recommendation}")
    print(f" Mistake: {t.common_mistake}")
    print(f" Fix: {t.fix}")
Production API
# === Production TTS API ===
# # FastAPI TTS Server
# from fastapi import FastAPI, Response
# from TTS.api import TTS
# import io, soundfile as sf
#
# app = FastAPI()
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
#
# @app.post("/tts")
# async def text_to_speech(text: str, speaker: str = None):
# wav = tts.tts(text=text, speaker=speaker)
# buffer = io.BytesIO()
# sf.write(buffer, wav, 22050, format="WAV")
# buffer.seek(0)
# return Response(content=buffer.read(),
# media_type="audio/wav")
#
# # gunicorn main:app -w 1 -k uvicorn.workers.UvicornWorker
# # (1 worker per GPU)
@dataclass
class ProductionConfig:
    """One component of a production TTS deployment.

    Attributes:
        component: Name of the infrastructure component.
        config: Short description of how the component is configured.
        purpose: What the component is used for.
        scaling: Scaling guidance or target figure for the component.
    """

    component: str
    config: str
    purpose: str
    scaling: str
# Components of the example production deployment; rendered by the loop below.
production = [
    ProductionConfig(
        "FastAPI + Gunicorn",
        "1 Worker per GPU + Uvicorn",
        "Production HTTP API Server",
        "Horizontal: เพิ่ม Replicas ตาม GPU",
    ),
    ProductionConfig(
        "ONNX Runtime",
        "Export Model → ONNX → ONNXRuntime Inference",
        "Inference เร็วขึ้น 2-5x ลด Memory",
        "รองรับ CPU Inference ไม่ต้อง GPU",
    ),
    ProductionConfig(
        "Redis Queue",
        "Celery + Redis สำหรับ Async TTS",
        "Queue Request ไม่ Block API",
        "เพิ่ม Worker ตาม Backlog",
    ),
    ProductionConfig(
        "Audio Cache",
        "Redis/S3 Cache Generated Audio",
        "ลด Compute สำหรับ Text ซ้ำ",
        "Cache Hit Rate 30-50% ลด Cost",
    ),
    ProductionConfig(
        "Nginx + Rate Limit",
        "Nginx Reverse Proxy + limit_req",
        "ป้องกัน Abuse DDoS",
        "10 req/s per IP",
    ),
    ProductionConfig(
        "Monitoring",
        "Prometheus + Grafana",
        "Latency Request Rate GPU Utilization",
        "Alert เมื่อ Latency > 5s หรือ GPU > 90%",
    ),
]

# Print each component with its configuration, purpose and scaling note.
print("=== Production Config ===")
for p in production:
    print(f" [{p.component}] {p.config}")
    print(f" Purpose: {p.purpose}")
    print(f" Scaling: {p.scaling}")
เคล็ดลับ
- VITS: ใช้ VITS Model สำหรับ Production ดีที่สุด End-to-end
- ONNX: Export ONNX Inference เร็วขึ้น 2-5x รองรับ CPU
- Cache: Cache Audio ที่ Generate แล้ว ลด Compute 30-50%
- GPU: 1 Worker per GPU ไม่ Share GPU ระหว่าง Workers
- Dataset: Audio Quality สำคัญที่สุด ใช้ Noise Reduction ก่อน Train
Coqui TTS คืออะไร
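Coqui TTS คือ Open Source TTS Library ที่พัฒนาด้วย Python และ Deep Learning รองรับ Model อย่าง VITS และ Tacotron2 รวมถึง Multi-speaker, Voice Cloning, Fine-tune, API Server และ ONNX มี Pre-trained Models ให้ใช้งาน และเผยแพร่ภายใต้ License MPL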