SiamCafe.net Blog
Technology

TTS Coqui Developer Experience DX

TTS Coqui Developer Experience DX | SiamCafe Blog
2026-04-20 · อ. บอม — SiamCafe.net · 11,309 คำ

Coqui TTS Developer Experience

Coqui TTS Text-to-Speech Python Deep Learning VITS Tacotron2 Voice Cloning Fine-tune API Server ONNX Production Multi-speaker

Model | Quality | Speed | Use Case
VITS | สูงมาก (End-to-end) | เร็ว (Real-time) | Production แนะนำ
Tacotron2 + HiFi-GAN | สูง | ปานกลาง | High-quality TTS
Glow-TTS + HiFi-GAN | สูง | เร็ว | Low-latency TTS
FastSpeech2 | ปานกลาง-สูง | เร็วมาก | Batch Processing
YourTTS | สูง | ปานกลาง | Voice Cloning Zero-shot

Quick Start

# === Coqui TTS Quick Start ===

# pip install TTS
# pip install TTS[server]  # สำหรับ Demo Server

# CLI Usage
# tts --list_models  # ดู Models ทั้งหมด
# tts --text "Hello World" \
#     --model_name tts_models/en/ljspeech/vits \
#     --out_path output.wav
#
# # Multi-speaker
# tts --text "Hello" \
#     --model_name tts_models/en/vctk/vits \
#     --speaker_idx "p225" \
#     --out_path output.wav
#
# # Voice Cloning
# tts --text "Hello" \
#     --model_name tts_models/multilingual/multi-dataset/your_tts \
#     --speaker_wav reference.wav \
#     --language_idx "en" \
#     --out_path cloned.wav

# Python API
# from TTS.api import TTS
#
# # Single Speaker
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
# tts.tts_to_file(text="Hello World", file_path="output.wav")
#
# # Multi Speaker
# tts = TTS("tts_models/en/vctk/vits", gpu=True)
# tts.tts_to_file(text="Hello", file_path="out.wav", speaker="p225")
#
# # Voice Cloning
# tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
# tts.tts_to_file(text="Hello", file_path="cloned.wav",
#                 speaker_wav="reference.wav", language="en")

from dataclasses import dataclass

@dataclass
class TTSModel:
    """Catalog entry describing one pretrained Coqui TTS model."""

    name: str       # human-readable label for display
    model_id: str   # Coqui model identifier (the string passed to TTS(...))
    languages: str  # supported language(s), free-form text
    speakers: str   # speaker mode: single, multi, or cloning
    quality: str    # short quality/feature summary

# Rows are (name, model_id, languages, speakers, quality).
_MODEL_ROWS = [
    ("VITS (LJSpeech)",
     "tts_models/en/ljspeech/vits",
     "English",
     "Single",
     "สูงมาก End-to-end Fast"),
    ("VITS (VCTK)",
     "tts_models/en/vctk/vits",
     "English",
     "Multi (109 speakers)",
     "สูง Multi-speaker"),
    ("YourTTS",
     "tts_models/multilingual/multi-dataset/your_tts",
     "English Portuguese French",
     "Voice Cloning",
     "สูง Zero-shot Cloning"),
    ("Tacotron2-DDC",
     "tts_models/en/ljspeech/tacotron2-DDC",
     "English",
     "Single",
     "สูง Classic Architecture"),
]

models = [TTSModel(*row) for row in _MODEL_ROWS]

print("=== Available Models ===")
for model in models:
    print(f"  [{model.name}]")
    print(f"    ID: {model.model_id}")
    print(f"    Lang: {model.languages} | Speakers: {model.speakers}")
    print(f"    Quality: {model.quality}")

Fine-tuning

# === Fine-tune Coqui TTS ===

# Dataset Structure
# dataset/
#   wavs/
#     audio_001.wav  (22050Hz Mono 16-bit)
#     audio_002.wav
#   metadata.csv
#     audio_001|Hello this is a test
#     audio_002|Another sentence here

# Training Config (config.json)
# {
#   "model": "vits",
#   "audio": {"sample_rate": 22050},
#   "datasets": [{
#     "formatter": "ljspeech",
#     "path": "./dataset/",
#     "meta_file_train": "metadata.csv"
#   }],
#   "training": {
#     "batch_size": 32,
#     "epochs": 1000,
#     "lr": 0.0002,
#     "print_step": 50,
#     "save_step": 1000
#   }
# }

# python -m TTS.bin.train_tts --config_path config.json

@dataclass
class TrainingTip:
    """A fine-tuning guideline paired with its common pitfall and remedy."""

    aspect: str          # the training aspect this tip covers
    recommendation: str  # what to do
    common_mistake: str  # what typically goes wrong
    fix: str             # how to recover from the mistake

tips = [
    TrainingTip(
        aspect="Dataset Size",
        recommendation="อย่างน้อย 2-5 ชั่วโมง Audio สำหรับ Fine-tune",
        common_mistake="Audio น้อยเกินไป Model ไม่เรียนรู้",
        fix="ใช้ Data Augmentation หรือหา Audio เพิ่ม",
    ),
    TrainingTip(
        aspect="Audio Quality",
        recommendation="22050Hz Mono 16-bit WAV ไม่มี Noise Background",
        common_mistake="Audio มี Noise Model เรียนรู้ Noise ด้วย",
        fix="ใช้ Noise Reduction ก่อน Train เช่น RNNoise",
    ),
    TrainingTip(
        aspect="Transcription",
        recommendation="ต้องตรงกับ Audio 100% Punctuation ถูกต้อง",
        common_mistake="Text ไม่ตรง Audio Model สับสน",
        fix="ใช้ Whisper Transcribe แล้ว Manual Review",
    ),
    TrainingTip(
        aspect="Learning Rate",
        recommendation="เริ่ม 0.0002 สำหรับ Fine-tune",
        common_mistake="LR สูงเกินไป Model Diverge",
        fix="ใช้ LR Scheduler ลด LR เมื่อ Loss ไม่ลด",
    ),
    TrainingTip(
        aspect="Batch Size",
        recommendation="32 (GPU 16GB) ลดถ้า OOM",
        common_mistake="Batch Size ใหญ่เกิน GPU Memory",
        fix="ลด Batch Size หรือใช้ Gradient Accumulation",
    ),
]

print("=== Training Tips ===")
for tip in tips:
    print(f"\n  [{tip.aspect}] {tip.recommendation}")
    print(f"    Mistake: {tip.common_mistake}")
    print(f"    Fix: {tip.fix}")

Production API

# === Production TTS API ===

# # FastAPI TTS Server
# from fastapi import FastAPI, Response
# from TTS.api import TTS
# import io, soundfile as sf
#
# app = FastAPI()
# tts = TTS("tts_models/en/ljspeech/vits", gpu=True)
#
# @app.post("/tts")
# async def text_to_speech(text: str, speaker: str = None):
#     wav = tts.tts(text=text, speaker=speaker)
#     buffer = io.BytesIO()
#     sf.write(buffer, wav, 22050, format="WAV")
#     buffer.seek(0)
#     return Response(content=buffer.read(),
#                     media_type="audio/wav")
#
# # gunicorn main:app -w 1 -k uvicorn.workers.UvicornWorker
# # (1 worker per GPU)

@dataclass
class ProductionConfig:
    """One component of the production deployment stack and how it scales."""

    component: str  # name of the infrastructure component
    config: str     # how the component is configured
    purpose: str    # why the component is in the stack
    scaling: str    # scaling strategy or operating threshold

production = [
    ProductionConfig(
        component="FastAPI + Gunicorn",
        config="1 Worker per GPU + Uvicorn",
        purpose="Production HTTP API Server",
        scaling="Horizontal: เพิ่ม Replicas ตาม GPU",
    ),
    ProductionConfig(
        component="ONNX Runtime",
        config="Export Model → ONNX → ONNXRuntime Inference",
        purpose="Inference เร็วขึ้น 2-5x ลด Memory",
        scaling="รองรับ CPU Inference ไม่ต้อง GPU",
    ),
    ProductionConfig(
        component="Redis Queue",
        config="Celery + Redis สำหรับ Async TTS",
        purpose="Queue Request ไม่ Block API",
        scaling="เพิ่ม Worker ตาม Backlog",
    ),
    ProductionConfig(
        component="Audio Cache",
        config="Redis/S3 Cache Generated Audio",
        purpose="ลด Compute สำหรับ Text ซ้ำ",
        scaling="Cache Hit Rate 30-50% ลด Cost",
    ),
    ProductionConfig(
        component="Nginx + Rate Limit",
        config="Nginx Reverse Proxy + limit_req",
        purpose="ป้องกัน Abuse DDoS",
        scaling="10 req/s per IP",
    ),
    ProductionConfig(
        component="Monitoring",
        config="Prometheus + Grafana",
        purpose="Latency Request Rate GPU Utilization",
        scaling="Alert เมื่อ Latency > 5s หรือ GPU > 90%",
    ),
]

print("=== Production Config ===")
for item in production:
    print(f"  [{item.component}] {item.config}")
    print(f"    Purpose: {item.purpose}")
    print(f"    Scaling: {item.scaling}")

เคล็ดลับ

Coqui TTS คืออะไร

Open Source TTS Python Deep Learning VITS Tacotron2 Multi-speaker Voice Cloning Fine-tune API Server ONNX Pre-trained Models MPL

ติดตั้งอย่างไร

pip install TTS Python 3.8+ PyTorch CUDA CLI tts --text Python API TTS.api Demo Server tts-server Docker GPU Support

Fine-tune ทำอย่างไร

Dataset WAV 22050Hz metadata.csv Config JSON Training batch_size lr epochs Checkpoint TensorBoard Loss Noise Reduction Whisper Transcribe

Production Deploy อย่างไร

FastAPI Gunicorn ONNX Runtime Redis Queue Audio Cache Nginx Rate Limit Docker GPU Prometheus Grafana Monitoring Latency Alert

สรุป

Coqui TTS Developer Experience VITS Tacotron2 Voice Cloning Fine-tune FastAPI ONNX Redis Cache GPU Monitoring Production

📖 บทความที่เกี่ยวข้อง