Coqui TTS Developer Experience
Coqui TTS Text-to-Speech Python Deep Learning VITS Tacotron2 Voice Cloning Fine-tune API Server ONNX Production Multi-speaker
| Model | Quality | Speed | Use Case |
|---|---|---|---|
| VITS | สูงมาก (End-to-end) | เร็ว (Real-time) | Production แนะนำ |
| Tacotron2 + HiFi-GAN | สูง | ปานกลาง | High-quality TTS |
| Glow-TTS + HiFi-GAN | สูง | เร็ว | Low-latency TTS |
| FastSpeech2 | ปานกลาง-สูง | เร็วมาก | Batch Processing |
| YourTTS | สูง | ปานกลาง | Voice Cloning Zero-shot |
Quick Start
# === Coqui TTS Quick Start ===
# pip install TTS
# pip install "TTS[server]" # สำหรับ Demo Server (quote the extras so shells like zsh do not expand the brackets)
# CLI Usage
# tts --list_models # ดู Models ทั้งหมด
# tts --text "Hello World" \
# --model_name tts_models/en/ljspeech/vits \
# --out_path output.wav
#
# # Multi-speaker
# tts --text "Hello" \
# --model_name tts_models/en/vctk/vits \
# --speaker_idx "p225" \
# --out_path output.wav
#
# # Voice Cloning
# tts --text "Hello" \
# --model_name tts_models/multilingual/multi-dataset/your_tts \
# --speaker_wav reference.wav \
# --language_idx "en" \
# --out_path cloned.wav
# Python API
# from TTS.api import TTS
#
# # Single Speaker
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
# tts.tts_to_file(text="Hello World", file_path="output.wav")
#
# # Multi Speaker
# tts = TTS("tts_models/en/vctk/vits", gpu=True)
# tts.tts_to_file(text="Hello", file_path="out.wav", speaker="p225")
#
# # Voice Cloning
# tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
# tts.tts_to_file(text="Hello", file_path="cloned.wav",
# speaker_wav="reference.wav", language="en")
from dataclasses import dataclass


@dataclass
class TTSModel:
    """Catalogue entry describing one pre-trained Coqui TTS model."""

    name: str       # human-readable display name
    model_id: str   # identifier accepted by TTS(model_name=...)
    languages: str  # supported language(s), free-form text
    speakers: str   # single / multi-speaker / cloning description
    quality: str    # quality and use-case notes (Thai + English)


# Commonly used pre-trained models, printed by the loop below.
models = [
    TTSModel(
        "VITS (LJSpeech)",
        "tts_models/en/ljspeech/vits",
        "English",
        "Single",
        "สูงมาก End-to-end Fast",
    ),
    TTSModel(
        "VITS (VCTK)",
        "tts_models/en/vctk/vits",
        "English",
        "Multi (109 speakers)",
        "สูง Multi-speaker",
    ),
    TTSModel(
        "YourTTS",
        "tts_models/multilingual/multi-dataset/your_tts",
        "English Portuguese French",
        "Voice Cloning",
        "สูง Zero-shot Cloning",
    ),
    TTSModel(
        "Tacotron2-DDC",
        "tts_models/en/ljspeech/tacotron2-DDC",
        "English",
        "Single",
        "สูง Classic Architecture",
    ),
]

print("=== Available Models ===")
for entry in models:
    print(f" [{entry.name}]")
    print(f" ID: {entry.model_id}")
    print(f" Lang: {entry.languages} | Speakers: {entry.speakers}")
    print(f" Quality: {entry.quality}")
Fine-tuning
# === Fine-tune Coqui TTS ===
# Dataset Structure
# dataset/
# wavs/
# audio_001.wav (22050Hz Mono 16-bit)
# audio_002.wav
# metadata.csv
# audio_001|Hello this is a test
# audio_002|Another sentence here
# Training Config (config.json)
# {
# "model": "vits",
# "audio": {"sample_rate": 22050},
# "datasets": [{
# "formatter": "ljspeech",
# "path": "./dataset/",
# "meta_file_train": "metadata.csv"
# }],
# "training": {
# "batch_size": 32,
# "epochs": 1000,
# "lr": 0.0002,
# "print_step": 50,
# "save_step": 1000
# }
# }
# python -m TTS.bin.train_tts --config_path config.json
@dataclass
class TrainingTip:
    """One fine-tuning recommendation plus the mistake it prevents."""

    aspect: str          # which part of training the tip covers
    recommendation: str  # what to do
    common_mistake: str  # what typically goes wrong
    fix: str             # how to recover from the mistake


# Practical fine-tuning guidance, printed by the loop below.
tips = [
    TrainingTip(
        "Dataset Size",
        "อย่างน้อย 2-5 ชั่วโมง Audio สำหรับ Fine-tune",
        "Audio น้อยเกินไป Model ไม่เรียนรู้",
        "ใช้ Data Augmentation หรือหา Audio เพิ่ม",
    ),
    TrainingTip(
        "Audio Quality",
        "22050Hz Mono 16-bit WAV ไม่มี Noise Background",
        "Audio มี Noise Model เรียนรู้ Noise ด้วย",
        "ใช้ Noise Reduction ก่อน Train เช่น RNNoise",
    ),
    TrainingTip(
        "Transcription",
        "ต้องตรงกับ Audio 100% Punctuation ถูกต้อง",
        "Text ไม่ตรง Audio Model สับสน",
        "ใช้ Whisper Transcribe แล้ว Manual Review",
    ),
    TrainingTip(
        "Learning Rate",
        "เริ่ม 0.0002 สำหรับ Fine-tune",
        "LR สูงเกินไป Model Diverge",
        "ใช้ LR Scheduler ลด LR เมื่อ Loss ไม่ลด",
    ),
    TrainingTip(
        "Batch Size",
        "32 (GPU 16GB) ลดถ้า OOM",
        "Batch Size ใหญ่เกิน GPU Memory",
        "ลด Batch Size หรือใช้ Gradient Accumulation",
    ),
]

print("=== Training Tips ===")
for tip in tips:
    print(f"\n [{tip.aspect}] {tip.recommendation}")
    print(f" Mistake: {tip.common_mistake}")
    print(f" Fix: {tip.fix}")
Production API
# === Production TTS API ===
# # FastAPI TTS Server
# from fastapi import FastAPI, Response
# from TTS.api import TTS
# import io, soundfile as sf
#
# app = FastAPI()
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
#
# @app.post("/tts")
# async def text_to_speech(text: str, speaker: str | None = None):
# wav = tts.tts(text=text, speaker=speaker)
# buffer = io.BytesIO()
# sf.write(buffer, wav, 22050, format="WAV")
# buffer.seek(0)
# return Response(content=buffer.read(),
# media_type="audio/wav")
#
# # gunicorn main:app -w 1 -k uvicorn.workers.UvicornWorker
# # (1 worker per GPU)
@dataclass
class ProductionConfig:
    """One component of the production TTS deployment stack."""

    component: str  # infrastructure component name
    config: str     # how it is configured
    purpose: str    # why it is needed
    scaling: str    # how it scales / operational note


# Production deployment checklist, printed by the loop below.
production = [
    ProductionConfig(
        "FastAPI + Gunicorn",
        "1 Worker per GPU + Uvicorn",
        "Production HTTP API Server",
        "Horizontal: เพิ่ม Replicas ตาม GPU",
    ),
    ProductionConfig(
        "ONNX Runtime",
        "Export Model → ONNX → ONNXRuntime Inference",
        "Inference เร็วขึ้น 2-5x ลด Memory",
        "รองรับ CPU Inference ไม่ต้อง GPU",
    ),
    ProductionConfig(
        "Redis Queue",
        "Celery + Redis สำหรับ Async TTS",
        "Queue Request ไม่ Block API",
        "เพิ่ม Worker ตาม Backlog",
    ),
    ProductionConfig(
        "Audio Cache",
        "Redis/S3 Cache Generated Audio",
        "ลด Compute สำหรับ Text ซ้ำ",
        "Cache Hit Rate 30-50% ลด Cost",
    ),
    ProductionConfig(
        "Nginx + Rate Limit",
        "Nginx Reverse Proxy + limit_req",
        "ป้องกัน Abuse DDoS",
        "10 req/s per IP",
    ),
    ProductionConfig(
        "Monitoring",
        "Prometheus + Grafana",
        "Latency Request Rate GPU Utilization",
        "Alert เมื่อ Latency > 5s หรือ GPU > 90%",
    ),
]

print("=== Production Config ===")
for item in production:
    print(f" [{item.component}] {item.config}")
    print(f" Purpose: {item.purpose}")
    print(f" Scaling: {item.scaling}")
เคล็ดลับ
- VITS: ใช้ VITS Model สำหรับ Production ดีที่สุด End-to-end
- ONNX: Export ONNX Inference เร็วขึ้น 2-5x รองรับ CPU
- Cache: Cache Audio ที่ Generate แล้ว ลด Compute 30-50%
- GPU: 1 Worker per GPU ไม่ Share GPU ระหว่าง Workers
- Dataset: Audio Quality สำคัญที่สุด ใช้ Noise Reduction ก่อน Train
Coqui TTS คืออะไร
Open Source TTS Python Deep Learning VITS Tacotron2 Multi-speaker Voice Cloning Fine-tune API Server ONNX Pre-trained Models MPL
ติดตั้งอย่างไร
pip install TTS Python 3.8+ PyTorch CUDA CLI tts --text Python API TTS.api Demo Server tts-server Docker GPU Support
Fine-tune ทำอย่างไร
Dataset WAV 22050Hz metadata.csv Config JSON Training batch_size lr epochs Checkpoint TensorBoard Loss Noise Reduction Whisper Transcribe
Production Deploy อย่างไร
FastAPI Gunicorn ONNX Runtime Redis Queue Audio Cache Nginx Rate Limit Docker GPU Prometheus Grafana Monitoring Latency Alert
สรุป
Coqui TTS Developer Experience VITS Tacotron2 Voice Cloning Fine-tune FastAPI ONNX Redis Cache GPU Monitoring Production
อ่านเพิ่มเติม: สอนเทรด Forex | XM Signal | IT Hardware | อาชีพ IT