TTS Coqui Batch Processing Pipeline คืออะไร
Coqui TTS เป็น open source text-to-speech library ที่รองรับหลายภาษาและหลาย models เช่น Tacotron2, VITS, YourTTS สำหรับแปลงข้อความเป็นเสียงพูดคุณภาพสูง Batch Processing Pipeline คือระบบประมวลผลข้อมูลจำนวนมากเป็น batch แทนการทำทีละรายการ การรวมสองแนวคิดนี้ช่วยสร้างระบบ TTS ที่ประมวลผลข้อความหลายพันรายการเป็นไฟล์เสียงอัตโนมัติ เหมาะสำหรับ audiobook production, voice notification systems, accessibility tools และ content localization
Coqui TTS Fundamentals
# coqui_basics.py — Coqui TTS fundamentals
import json
class CoquiTTSBasics:
    """Quick reference for Coqui TTS: a model catalogue and a setup snippet.

    The class holds static data only. ``show_models`` prints the catalogue
    and ``show_setup`` prints a preview of the sample script; nothing here
    imports or runs the TTS library itself.
    """

    # Catalogue keyed by a short model id; each entry carries the display
    # name, architecture type, quality/speed notes and the typical use case.
    MODELS = {
        "tacotron2": {
            "name": "Tacotron2",
            "type": "Autoregressive",
            "quality": "สูง",
            "speed": "ช้า (sequential generation)",
            "use": "High-quality single speaker TTS",
        },
        "vits": {
            "name": "VITS (Variational Inference TTS)",
            "type": "End-to-end",
            "quality": "สูงมาก",
            "speed": "เร็ว (parallel generation)",
            "use": "Production TTS — quality + speed balance",
        },
        "yourtts": {
            "name": "YourTTS",
            "type": "Multi-speaker, Multi-lingual",
            "quality": "สูง",
            "speed": "ปานกลาง",
            "use": "Voice cloning, multi-language, zero-shot TTS",
        },
        "bark": {
            "name": "Bark (Suno)",
            "type": "Transformer-based",
            "quality": "สูงมาก (natural prosody)",
            "speed": "ช้า",
            "use": "Expressive speech, music, sound effects",
        },
    }

    # Sample script kept as text (it is only printed, never executed here).
    # Indentation inside the sample is significant: it must stay valid Python.
    SETUP_CODE = """
# Install Coqui TTS
# pip install TTS
from TTS.api import TTS
import torch
# List available models
print(TTS.list_models())
# Initialize model
tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=torch.cuda.is_available())
# Single text to speech
tts.tts_to_file(
    text="Hello, this is a test of Coqui TTS.",
    file_path="output.wav"
)
# Multi-speaker model
tts_multi = TTS(model_name="tts_models/en/vctk/vits")
tts_multi.tts_to_file(
    text="Hello world",
    speaker="p225",
    file_path="output_p225.wav"
)
# Voice cloning (YourTTS)
tts_clone = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
tts_clone.tts_to_file(
    text="สวัสดีครับ ทดสอบระบบ TTS",
    speaker_wav="reference_voice.wav",
    language="th",
    file_path="cloned_output.wav"
)
"""

    def show_models(self) -> None:
        """Print one three-line summary (plus a blank line) per MODELS entry."""
        print("=== Coqui TTS Models ===\n")
        # Only the entry dicts are needed; the short ids are not printed.
        for model in self.MODELS.values():
            print(f"[{model['name']}] ({model['type']})")
            print(f" Quality: {model['quality']} | Speed: {model['speed']}")
            print(f" Use: {model['use']}")
            print()

    def show_setup(self) -> None:
        """Print a header followed by the first 500 characters of SETUP_CODE."""
        print("=== Setup Code ===")
        print(self.SETUP_CODE[:500])


# Demo: runs at module load and prints the catalogue and the setup preview.
basics = CoquiTTSBasics()
basics.show_models()
basics.show_setup()
Batch Processing Pipeline
# batch_pipeline.py — TTS batch processing pipeline
import json
class TTSBatchPipeline:
    """Holds the sample batch-processing script and prints a preview of it.

    ``CODE`` is text only; this class never imports or runs the TTS library.
    """

    # Sample script: a TTSBatchProcessor that lazily loads the model, turns
    # each item into a wav file, and tracks processed/failed counters.
    # Indentation inside the sample is significant: it must stay valid Python.
    CODE = """
# tts_batch.py — Batch TTS processing pipeline
from TTS.api import TTS
import torch
import os
import json
import time
import logging
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class TTSBatchProcessor:
    def __init__(self, model_name="tts_models/en/ljspeech/vits",
                 output_dir="output", gpu=True):
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.gpu = gpu and torch.cuda.is_available()
        self.tts = None
        self.stats = {"processed": 0, "failed": 0, "total_time": 0}
    def _init_model(self):
        if self.tts is None:
            self.tts = TTS(model_name=self.model_name, gpu=self.gpu)
    def process_single(self, text, filename, speaker=None):
        '''Process single text to audio'''
        self._init_model()
        output_path = self.output_dir / filename
        try:
            start = time.time()
            kwargs = {"text": text, "file_path": str(output_path)}
            if speaker:
                kwargs["speaker"] = speaker
            self.tts.tts_to_file(**kwargs)
            duration = time.time() - start
            file_size = output_path.stat().st_size
            self.stats["processed"] += 1
            self.stats["total_time"] += duration
            return {
                "status": "success",
                "file": str(output_path),
                "duration": round(duration, 2),
                "size_kb": round(file_size / 1024, 1),
            }
        except Exception as e:
            self.stats["failed"] += 1
            logger.error(f"Failed: {filename} — {e}")
            return {"status": "failed", "error": str(e)}
    def process_batch(self, items):
        '''Process batch of (text, filename) tuples'''
        results = []
        total = len(items)
        for i, item in enumerate(items):
            text = item["text"]
            filename = item.get("filename", f"audio_{i:05d}.wav")
            speaker = item.get("speaker")
            logger.info(f"Processing {i+1}/{total}: {filename}")
            result = self.process_single(text, filename, speaker)
            results.append(result)
        return results
    def process_from_file(self, input_file):
        '''Process from JSON file'''
        with open(input_file) as f:
            items = json.load(f)
        return self.process_batch(items)
    def report(self):
        avg_time = (self.stats['total_time'] / max(self.stats['processed'], 1))
        return {
            **self.stats,
            "avg_time_per_item": round(avg_time, 2),
            "success_rate": f"{self.stats['processed'] / max(self.stats['processed'] + self.stats['failed'], 1) * 100:.1f}%",
        }
# Usage
processor = TTSBatchProcessor(
    model_name="tts_models/en/ljspeech/vits",
    output_dir="./tts_output",
    gpu=True
)
# Batch items
items = [
    {"text": "Welcome to our platform.", "filename": "welcome.wav"},
    {"text": "Your order has been confirmed.", "filename": "order_confirm.wav"},
    {"text": "Thank you for your purchase.", "filename": "thank_you.wav"},
]
results = processor.process_batch(items)
print(json.dumps(processor.report(), indent=2))
"""

    def show_code(self) -> None:
        """Print a header followed by the first 600 characters of CODE."""
        print("=== Batch Pipeline ===")
        print(self.CODE[:600])


# Demo: runs at module load and prints the pipeline preview.
pipeline = TTSBatchPipeline()
pipeline.show_code()
Queue-based Architecture
# queue_arch.py — Queue-based TTS pipeline
import json
class QueueArchitecture:
    """Holds the queue-based deployment samples and prints previews of them.

    Both attributes are plain text: a Celery worker module and a
    docker-compose file. Nothing here talks to Celery, Redis or Docker.
    """

    # Sample Celery worker: one lazily created model per worker process, a
    # retrying ``synthesize`` task, and a fan-out ``batch_synthesize`` task.
    # Indentation inside the sample is significant: it must stay valid Python.
    CELERY_WORKER = """
# tts_worker.py — Celery worker for TTS tasks
from celery import Celery
from TTS.api import TTS
import torch
import os
app = Celery('tts', broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/1')
# Initialize model once per worker
tts_model = None
def get_model():
    global tts_model
    if tts_model is None:
        tts_model = TTS(
            model_name="tts_models/en/ljspeech/vits",
            gpu=torch.cuda.is_available()
        )
    return tts_model
@app.task(bind=True, max_retries=3, default_retry_delay=30)
def synthesize(self, text, output_path, speaker=None):
    try:
        model = get_model()
        kwargs = {"text": text, "file_path": output_path}
        if speaker:
            kwargs["speaker"] = speaker
        model.tts_to_file(**kwargs)
        return {
            "status": "success",
            "file": output_path,
            "size": os.path.getsize(output_path),
        }
    except Exception as exc:
        raise self.retry(exc=exc)
@app.task
def batch_synthesize(items):
    results = []
    for item in items:
        result = synthesize.delay(
            item["text"],
            item["output_path"],
            item.get("speaker")
        )
        results.append(result.id)
    return results
"""

    # Sample compose file with four services: redis, tts-worker, tts-api and
    # flower. YAML nesting is expressed purely through indentation, so the
    # two-space levels below are part of the content.
    DOCKER_COMPOSE = """
# docker-compose.yml — TTS pipeline infrastructure
version: '3.8'
services:
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
  tts-worker:
    build: .
    command: celery -A tts_worker worker --loglevel=info --concurrency=2
    deploy:
      replicas: 2
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
    volumes:
      - ./output:/app/output
    depends_on:
      - redis
  tts-api:
    build: .
    command: uvicorn api:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"
    depends_on:
      - redis
  flower:
    image: mher/flower
    command: celery --broker=redis://redis:6379/0 flower
    ports:
      - "5555:5555"
    depends_on:
      - redis
"""

    def show_worker(self) -> None:
        """Print a header followed by the first 500 characters of CELERY_WORKER."""
        print("=== Celery Worker ===")
        print(self.CELERY_WORKER[:500])

    def show_docker(self) -> None:
        """Print a header followed by the first 500 characters of DOCKER_COMPOSE."""
        # The leading newline separates this section from the previous output.
        print("\n=== Docker Compose ===")
        print(self.DOCKER_COMPOSE[:500])


# Demo: runs at module load and prints both previews.
queue = QueueArchitecture()
queue.show_worker()
queue.show_docker()
REST API
# api.py — FastAPI for TTS batch processing
import json
class TTSAPI:
    """Holds the sample FastAPI service and prints it plus an endpoint table.

    ``CODE`` is text only; this class does not import FastAPI or start a
    server.
    """

    # Sample FastAPI module: request/response models, single and batch
    # synthesis endpoints, a batch status endpoint and a model listing.
    # Indentation inside the sample is significant: it must stay valid Python.
    CODE = """
# api.py — FastAPI TTS service
from fastapi import FastAPI, BackgroundTasks, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uuid
app = FastAPI(title="TTS Batch API")
class TTSRequest(BaseModel):
    text: str
    filename: Optional[str] = None
    speaker: Optional[str] = None
    language: Optional[str] = "en"
class BatchRequest(BaseModel):
    items: List[TTSRequest]
    model: Optional[str] = "tts_models/en/ljspeech/vits"
class BatchResponse(BaseModel):
    batch_id: str
    total_items: int
    status: str
@app.post("/api/tts/single")
async def synthesize_single(req: TTSRequest, background_tasks: BackgroundTasks):
    job_id = str(uuid.uuid4())[:8]
    filename = req.filename or f"{job_id}.wav"
    background_tasks.add_task(
        process_tts, req.text, filename, req.speaker
    )
    return {"job_id": job_id, "filename": filename, "status": "processing"}
@app.post("/api/tts/batch", response_model=BatchResponse)
async def synthesize_batch(req: BatchRequest):
    batch_id = str(uuid.uuid4())[:8]
    # Submit to Celery queue
    from tts_worker import batch_synthesize
    task_ids = batch_synthesize.delay([
        {"text": item.text, "output_path": f"output/{batch_id}_{i}.wav",
         "speaker": item.speaker}
        for i, item in enumerate(req.items)
    ])
    return BatchResponse(
        batch_id=batch_id,
        total_items=len(req.items),
        status="queued"
    )
@app.get("/api/tts/batch/{batch_id}/status")
async def batch_status(batch_id: str):
    # Check Celery task status
    return {"batch_id": batch_id, "completed": 0, "total": 0, "status": "processing"}
@app.get("/api/tts/models")
async def list_models():
    return {
        "models": [
            {"id": "vits-en", "name": "VITS English", "languages": ["en"]},
            {"id": "vits-multi", "name": "VITS Multi-speaker", "languages": ["en"]},
            {"id": "yourtts", "name": "YourTTS", "languages": ["en", "th", "ja"]},
        ]
    }
"""

    def show_api(self) -> None:
        """Print a header followed by the first 600 characters of CODE."""
        print("=== TTS API ===")
        print(self.CODE[:600])

    def endpoints(self) -> None:
        """Print an aligned table of HTTP method, path and description."""
        # The leading newline separates this section from the previous output.
        print("\n=== API Endpoints ===")
        eps = [
            ("POST", "/api/tts/single", "Single text → audio"),
            ("POST", "/api/tts/batch", "Batch text → audio (async)"),
            ("GET", "/api/tts/batch/{id}/status", "Check batch status"),
            ("GET", "/api/tts/models", "List available models"),
        ]
        for method, path, desc in eps:
            # Method is right-aligned to 4 columns, path left-aligned to 35.
            print(f" [{method:>4}] {path:<35} — {desc}")


# Demo: runs at module load and prints the API preview and endpoint table.
api = TTSAPI()
api.show_api()
api.endpoints()
Performance & Monitoring
# monitoring.py — TTS pipeline monitoring
import json
import random
class TTSMonitoring:
    """Prints a mock monitoring dashboard and a list of optimization tips.

    The dashboard figures are drawn from ``random`` on every call; they are
    illustrative values, not measurements from a running pipeline.
    """

    # Metric name -> short description of what the metric represents.
    METRICS = {
        "throughput": "Audio files generated per minute",
        "latency": "Time per audio generation (seconds)",
        "queue_depth": "Pending items in processing queue",
        "gpu_utilization": "GPU usage during synthesis",
        "error_rate": "Failed synthesis percentage",
        "audio_quality": "MOS (Mean Opinion Score) estimate",
    }

    def dashboard(self) -> None:
        """Print a header, a blank line and seven randomly generated figures."""
        print("=== TTS Pipeline Dashboard ===\n")
        # Each line draws exactly one random number, in this fixed order, so
        # output is reproducible when the caller seeds ``random`` first.
        print(f" Throughput: {random.randint(20, 60)} files/min")
        print(f" Latency (avg): {random.uniform(0.5, 3.0):.1f}s per file")
        print(f" Queue depth: {random.randint(0, 200)} pending")
        print(f" GPU utilization: {random.randint(60, 95)}%")
        print(f" Error rate: {random.uniform(0, 1.5):.2f}%")
        print(f" Workers: {random.randint(2, 4)} active")
        print(f" Today processed: {random.randint(1000, 5000):,} files")

    def optimization_tips(self) -> None:
        """Print a header followed by one bullet line per tip."""
        # The leading newline separates this section from the previous output.
        print("\n=== Optimization Tips ===")
        tips = [
            "ใช้ VITS model — เร็วกว่า Tacotron2 2-3x",
            "Enable GPU — เร็วกว่า CPU 10-50x",
            "Batch same-length texts — ลด padding waste",
            "Pre-load model — ลด cold start",
            "Use FP16 — เร็วขึ้น 1.5x, memory ลด 50%",
            "Cache repeated texts — ไม่ต้อง synthesize ซ้ำ",
        ]
        for tip in tips:
            print(f" • {tip}")


# Demo: runs at module load and prints the dashboard and the tips.
mon = TTSMonitoring()
mon.dashboard()
mon.optimization_tips()
FAQ - คำถามที่พบบ่อย
Q: Coqui TTS รองรับภาษาไทยไหม?
A: Coqui TTS มี models หลายภาษา แต่การรองรับภาษาไทยยังจำกัด — YourTTS เป็น multilingual model (รวม Thai ด้วย reference voice). ทางเลือก: Google Cloud TTS, Azure TTS (รองรับไทยดี) หรือ train custom model กับ Thai dataset. Fine-tune: ใช้ VITS + Thai speech dataset → custom Thai TTS model
Q: GPU จำเป็นไหม?
A: ไม่จำเป็นสำหรับการใช้งานทั่วไป แต่แนะนำอย่างยิ่ง — CPU: VITS ~2-5 วินาที/ประโยค (ช้า); GPU: VITS ~0.1-0.5 วินาที/ประโยค (เร็วกว่า 10-50x). สำหรับ batch processing: GPU จำเป็นในทางปฏิบัติ — เช่น 10,000 ไฟล์ → CPU: ~14 ชม. vs GPU: ~1 ชม. Minimum GPU: NVIDIA GTX 1060 6GB+, แนะนำ RTX 3060+
Q: Batch processing เร็วแค่ไหน?
A: ขึ้นกับ model + hardware — VITS on RTX 3090: ~100-200 files/minute (short sentences); Tacotron2: ~30-60 files/minute (slower, higher quality). Scale: 2 GPU workers → 200-400 files/minute; ใช้ Celery + Redis queue → horizontal scaling ได้ง่าย
Q: Audio quality ดีแค่ไหน?
A: VITS: MOS ~4.0-4.3 (ใกล้เสียงจริง); YourTTS (voice clone): MOS ~3.5-4.0 (ขึ้นกับ reference audio quality); Bark: เสียงธรรมชาติมาก แต่ช้า. เทียบ commercial: Google TTS MOS ~4.3, Azure TTS ~4.2 — Coqui competitive กับ commercial และข้อดีคือ free + self-hosted
