SiamCafe.net Blog
Technology

TTS Coqui Batch Processing Pipeline

tts coqui batch processing pipeline
TTS Coqui Batch Processing Pipeline | SiamCafe Blog
2025-07-02· อ. บอม — SiamCafe.net· 1,774 คำ

TTS Coqui Batch Processing Pipeline คืออะไร

Coqui TTS เป็น open source text-to-speech library ที่รองรับหลายภาษาและหลาย models เช่น Tacotron2, VITS, YourTTS สำหรับแปลงข้อความเป็นเสียงพูดคุณภาพสูง Batch Processing Pipeline คือระบบประมวลผลข้อมูลจำนวนมากเป็น batch แทนการทำทีละรายการ การรวมสองแนวคิดนี้ช่วยสร้างระบบ TTS ที่ประมวลผลข้อความหลายพันรายการเป็นไฟล์เสียงอัตโนมัติ เหมาะสำหรับ audiobook production, voice notification systems, accessibility tools และ content localization

Coqui TTS Fundamentals

# coqui_basics.py — Coqui TTS fundamentals
import json

class CoquiTTSBasics:
    """Reference catalog of Coqui TTS models plus an installation/usage snippet.

    MODELS maps a short key to a descriptor dict with display fields
    (name, type, quality, speed, use); the quality/speed values are the
    article's original Thai-language strings and are printed verbatim.
    """

    MODELS = {
        "tacotron2": {
            "name": "Tacotron2",
            "type": "Autoregressive",
            "quality": "สูง",
            "speed": "ช้า (sequential generation)",
            "use": "High-quality single speaker TTS",
        },
        "vits": {
            "name": "VITS (Variational Inference TTS)",
            "type": "End-to-end",
            "quality": "สูงมาก",
            "speed": "เร็ว (parallel generation)",
            "use": "Production TTS — quality + speed balance",
        },
        "yourtts": {
            "name": "YourTTS",
            "type": "Multi-speaker, Multi-lingual",
            "quality": "สูง",
            "speed": "ปานกลาง",
            "use": "Voice cloning, multi-language, zero-shot TTS",
        },
        "bark": {
            "name": "Bark (Suno)",
            "type": "Transformer-based",
            "quality": "สูงมาก (natural prosody)",
            "speed": "ช้า",
            "use": "Expressive speech, music, sound effects",
        },
    }

    # Verbatim sample code shown to the reader — never executed by this class.
    SETUP_CODE = """
# Install Coqui TTS
# pip install TTS

from TTS.api import TTS
import torch

# List available models
print(TTS.list_models())

# Initialize model
tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=torch.cuda.is_available())

# Single text to speech
tts.tts_to_file(
    text="Hello, this is a test of Coqui TTS.",
    file_path="output.wav"
)

# Multi-speaker model
tts_multi = TTS(model_name="tts_models/en/vctk/vits")
tts_multi.tts_to_file(
    text="Hello world",
    speaker="p225",
    file_path="output_p225.wav"
)

# Voice cloning (YourTTS)
tts_clone = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
tts_clone.tts_to_file(
    text="สวัสดีครับ ทดสอบระบบ TTS",
    speaker_wav="reference_voice.wav",
    language="th",
    file_path="cloned_output.wav"
)
"""

    def show_models(self):
        """Print a formatted summary of every entry in MODELS."""
        print("=== Coqui TTS Models ===\n")
        # The dict keys are never displayed — iterate the descriptors directly.
        for model in self.MODELS.values():
            print(f"[{model['name']}] ({model['type']})")
            print(f"  Quality: {model['quality']} | Speed: {model['speed']}")
            print(f"  Use: {model['use']}")
            print()

    def show_setup(self):
        """Print the first 500 characters of the setup snippet."""
        print("=== Setup Code ===")
        print(self.SETUP_CODE[:500])

# Demo: print the model catalog, then the setup snippet.
basics = CoquiTTSBasics()
for section in (basics.show_models, basics.show_setup):
    section()

Batch Processing Pipeline

# batch_pipeline.py — TTS batch processing pipeline
import json

class TTSBatchPipeline:
    """Holds the article's batch-processing sample code and prints an excerpt."""

    # Verbatim sample pipeline code shown to the reader (not executed here).
    # NOTE: the original article's two log f-strings printed the literal
    # placeholder "(unknown)" — fixed to interpolate the actual filename.
    CODE = """
# tts_batch.py — Batch TTS processing pipeline
from TTS.api import TTS
import torch
import os
import json
import time
import logging
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TTSBatchProcessor:
    def __init__(self, model_name="tts_models/en/ljspeech/vits", 
                 output_dir="output", gpu=True):
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.gpu = gpu and torch.cuda.is_available()
        self.tts = None
        self.stats = {"processed": 0, "failed": 0, "total_time": 0}
    
    def _init_model(self):
        if self.tts is None:
            self.tts = TTS(model_name=self.model_name, gpu=self.gpu)
    
    def process_single(self, text, filename, speaker=None):
        '''Process single text to audio'''
        self._init_model()
        output_path = self.output_dir / filename
        
        try:
            start = time.time()
            kwargs = {"text": text, "file_path": str(output_path)}
            if speaker:
                kwargs["speaker"] = speaker
            
            self.tts.tts_to_file(**kwargs)
            
            duration = time.time() - start
            file_size = output_path.stat().st_size
            
            self.stats["processed"] += 1
            self.stats["total_time"] += duration
            
            return {
                "status": "success",
                "file": str(output_path),
                "duration": round(duration, 2),
                "size_kb": round(file_size / 1024, 1),
            }
        except Exception as e:
            self.stats["failed"] += 1
            logger.error(f"Failed: {filename} — {e}")
            return {"status": "failed", "error": str(e)}
    
    def process_batch(self, items):
        '''Process batch of (text, filename) tuples'''
        results = []
        total = len(items)
        
        for i, item in enumerate(items):
            text = item["text"]
            filename = item.get("filename", f"audio_{i:05d}.wav")
            speaker = item.get("speaker")
            
            logger.info(f"Processing {i+1}/{total}: {filename}")
            result = self.process_single(text, filename, speaker)
            results.append(result)
        
        return results
    
    def process_from_file(self, input_file):
        '''Process from JSON file'''
        with open(input_file) as f:
            items = json.load(f)
        return self.process_batch(items)
    
    def report(self):
        avg_time = (self.stats['total_time'] / max(self.stats['processed'], 1))
        return {
            **self.stats,
            "avg_time_per_item": round(avg_time, 2),
            "success_rate": f"{self.stats['processed'] / max(self.stats['processed'] + self.stats['failed'], 1) * 100:.1f}%",
        }

# Usage
processor = TTSBatchProcessor(
    model_name="tts_models/en/ljspeech/vits",
    output_dir="./tts_output",
    gpu=True
)

# Batch items
items = [
    {"text": "Welcome to our platform.", "filename": "welcome.wav"},
    {"text": "Your order has been confirmed.", "filename": "order_confirm.wav"},
    {"text": "Thank you for your purchase.", "filename": "thank_you.wav"},
]

results = processor.process_batch(items)
print(json.dumps(processor.report(), indent=2))
"""

    def show_code(self):
        """Print the first 600 characters of the sample pipeline code."""
        print("=== Batch Pipeline ===")
        print(self.CODE[:600])

# Demo: print the batch-pipeline sample code excerpt.
pipeline = TTSBatchPipeline()
pipeline.show_code()

Queue-based Architecture

# queue_arch.py — Queue-based TTS pipeline
import json

class QueueArchitecture:
    """Sample Celery worker code and docker-compose config for a queued TTS pipeline."""

    # Verbatim Celery worker sample shown to the reader (not executed here).
    CELERY_WORKER = """
# tts_worker.py — Celery worker for TTS tasks
from celery import Celery
from TTS.api import TTS
import torch
import os

app = Celery('tts', broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/1')

# Initialize model once per worker
tts_model = None

def get_model():
    global tts_model
    if tts_model is None:
        tts_model = TTS(
            model_name="tts_models/en/ljspeech/vits",
            gpu=torch.cuda.is_available()
        )
    return tts_model

@app.task(bind=True, max_retries=3, default_retry_delay=30)
def synthesize(self, text, output_path, speaker=None):
    try:
        model = get_model()
        kwargs = {"text": text, "file_path": output_path}
        if speaker:
            kwargs["speaker"] = speaker
        model.tts_to_file(**kwargs)
        
        return {
            "status": "success",
            "file": output_path,
            "size": os.path.getsize(output_path),
        }
    except Exception as exc:
        raise self.retry(exc=exc)

@app.task
def batch_synthesize(items):
    results = []
    for item in items:
        result = synthesize.delay(
            item["text"],
            item["output_path"],
            item.get("speaker")
        )
        results.append(result.id)
    return results
"""

    # Verbatim docker-compose sample for the supporting infrastructure.
    DOCKER_COMPOSE = """
# docker-compose.yml — TTS pipeline infrastructure
version: '3.8'
services:
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"

  tts-worker:
    build: .
    command: celery -A tts_worker worker --loglevel=info --concurrency=2
    deploy:
      replicas: 2
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
    volumes:
      - ./output:/app/output
    depends_on:
      - redis

  tts-api:
    build: .
    command: uvicorn api:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"
    depends_on:
      - redis

  flower:
    image: mher/flower
    command: celery --broker=redis://redis:6379/0 flower
    ports:
      - "5555:5555"
    depends_on:
      - redis
"""

    def show_worker(self):
        """Print the first 500 characters of the Celery worker sample."""
        print("=== Celery Worker ===")
        print(self.CELERY_WORKER[:500])

    def show_docker(self):
        """Print the first 500 characters of the docker-compose sample."""
        # Plain string — the original used an f-string with no placeholders.
        print("\n=== Docker Compose ===")
        print(self.DOCKER_COMPOSE[:500])

# Demo: print the Celery worker sample, then the docker-compose sample.
queue = QueueArchitecture()
for section in (queue.show_worker, queue.show_docker):
    section()

REST API

# api.py — FastAPI for TTS batch processing
import json

class TTSAPI:
    """Holds the FastAPI TTS-service sample code and prints an endpoint table."""

    # Verbatim FastAPI sample shown to the reader (not executed here).
    CODE = """
# api.py — FastAPI TTS service
from fastapi import FastAPI, BackgroundTasks, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uuid

app = FastAPI(title="TTS Batch API")

class TTSRequest(BaseModel):
    text: str
    filename: Optional[str] = None
    speaker: Optional[str] = None
    language: Optional[str] = "en"

class BatchRequest(BaseModel):
    items: List[TTSRequest]
    model: Optional[str] = "tts_models/en/ljspeech/vits"

class BatchResponse(BaseModel):
    batch_id: str
    total_items: int
    status: str

@app.post("/api/tts/single")
async def synthesize_single(req: TTSRequest, background_tasks: BackgroundTasks):
    job_id = str(uuid.uuid4())[:8]
    filename = req.filename or f"{job_id}.wav"
    
    background_tasks.add_task(
        process_tts, req.text, filename, req.speaker
    )
    
    return {"job_id": job_id, "filename": filename, "status": "processing"}

@app.post("/api/tts/batch", response_model=BatchResponse)
async def synthesize_batch(req: BatchRequest):
    batch_id = str(uuid.uuid4())[:8]
    
    # Submit to Celery queue
    from tts_worker import batch_synthesize
    task_ids = batch_synthesize.delay([
        {"text": item.text, "output_path": f"output/{batch_id}_{i}.wav",
         "speaker": item.speaker}
        for i, item in enumerate(req.items)
    ])
    
    return BatchResponse(
        batch_id=batch_id,
        total_items=len(req.items),
        status="queued"
    )

@app.get("/api/tts/batch/{batch_id}/status")
async def batch_status(batch_id: str):
    # Check Celery task status
    return {"batch_id": batch_id, "completed": 0, "total": 0, "status": "processing"}

@app.get("/api/tts/models")
async def list_models():
    return {
        "models": [
            {"id": "vits-en", "name": "VITS English", "languages": ["en"]},
            {"id": "vits-multi", "name": "VITS Multi-speaker", "languages": ["en"]},
            {"id": "yourtts", "name": "YourTTS", "languages": ["en", "th", "ja"]},
        ]
    }
"""

    def show_api(self):
        """Print the first 600 characters of the FastAPI sample."""
        print("=== TTS API ===")
        print(self.CODE[:600])

    def endpoints(self):
        """Print a formatted table of the sample service's endpoints."""
        # Plain string — the original used an f-string with no placeholders.
        print("\n=== API Endpoints ===")
        eps = [
            ("POST", "/api/tts/single", "Single text → audio"),
            ("POST", "/api/tts/batch", "Batch text → audio (async)"),
            ("GET", "/api/tts/batch/{id}/status", "Check batch status"),
            ("GET", "/api/tts/models", "List available models"),
        ]
        for method, path, desc in eps:
            print(f"  [{method:>4}] {path:<35} — {desc}")

# Demo: print the API sample code, then the endpoint table.
api = TTSAPI()
for section in (api.show_api, api.endpoints):
    section()

Performance & Monitoring

# monitoring.py — TTS pipeline monitoring
import json
import random

class TTSMonitoring:
    """Mock monitoring dashboard and tuning tips for the TTS pipeline.

    All dashboard numbers are randomly generated demo values, not real metrics.
    """

    # Metric key -> human-readable description (reference table; not printed
    # by the methods below).
    METRICS = {
        "throughput": "Audio files generated per minute",
        "latency": "Time per audio generation (seconds)",
        "queue_depth": "Pending items in processing queue",
        "gpu_utilization": "GPU usage during synthesis",
        "error_rate": "Failed synthesis percentage",
        "audio_quality": "MOS (Mean Opinion Score) estimate",
    }

    def dashboard(self):
        """Print a mock dashboard populated with random demo values."""
        print("=== TTS Pipeline Dashboard ===\n")
        print(f"  Throughput: {random.randint(20, 60)} files/min")
        print(f"  Latency (avg): {random.uniform(0.5, 3.0):.1f}s per file")
        print(f"  Queue depth: {random.randint(0, 200)} pending")
        print(f"  GPU utilization: {random.randint(60, 95)}%")
        print(f"  Error rate: {random.uniform(0, 1.5):.2f}%")
        print(f"  Workers: {random.randint(2, 4)} active")
        print(f"  Today processed: {random.randint(1000, 5000):,} files")

    def optimization_tips(self):
        """Print a bullet list of performance-tuning tips (Thai text verbatim)."""
        # Plain string — the original used an f-string with no placeholders.
        print("\n=== Optimization Tips ===")
        tips = [
            "ใช้ VITS model — เร็วกว่า Tacotron2 2-3x",
            "Enable GPU — เร็วกว่า CPU 10-50x",
            "Batch same-length texts — ลด padding waste",
            "Pre-load model — ลด cold start",
            "Use FP16 — เร็วขึ้น 1.5x, memory ลด 50%",
            "Cache repeated texts — ไม่ต้อง synthesize ซ้ำ",
        ]
        for tip in tips:
            print(f"  • {tip}")

# Demo: print the mock dashboard, then the optimization tips.
mon = TTSMonitoring()
for section in (mon.dashboard, mon.optimization_tips):
    section()

FAQ - คำถามที่พบบ่อย

Q: Coqui TTS รองรับภาษาไทยไหม?

A: Coqui TTS มี models หลายภาษา แต่ภาษาไทยยังจำกัด — YourTTS รองรับ multilingual (รวม Thai ด้วย reference voice) ทางเลือกอื่น: Google Cloud TTS หรือ Azure TTS (รองรับไทยดี) หรือ train custom model กับ Thai dataset ส่วนการ fine-tune: ใช้ VITS + Thai speech dataset → ได้ custom Thai TTS model

Q: GPU จำเป็นไหม?

A: ไม่จำเป็น แต่แนะนำอย่างยิ่ง — CPU: VITS ~2-5 วินาที/ประโยค (ช้า); GPU: VITS ~0.1-0.5 วินาที/ประโยค (เร็วกว่า 10-50x) สำหรับ batch processing GPU จำเป็น — ถ้า 10,000 ไฟล์ → CPU: ~14 ชม. vs GPU: ~1 ชม. Minimum GPU: NVIDIA GTX 1060 6GB+, แนะนำ RTX 3060+

Q: Batch processing เร็วแค่ไหน?

A: ขึ้นกับ model + hardware: VITS on RTX 3090: ~100-200 files/minute (short sentences) Tacotron2: ~30-60 files/minute (slower, higher quality) Scale: 2 GPU workers → 200-400 files/minute ใช้ Celery + Redis queue → horizontal scaling ได้ง่าย

Q: Audio quality ดีแค่ไหน?

A: VITS: MOS ~4.0-4.3 (ใกล้เสียงจริง) YourTTS (voice clone): MOS ~3.5-4.0 (ขึ้นกับ reference audio quality) Bark: เสียงธรรมชาติมาก แต่ช้า เทียบ commercial: Google TTS MOS ~4.3, Azure TTS ~4.2 Coqui competitive กับ commercial — ข้อดีคือ free + self-hosted

📖 บทความที่เกี่ยวข้อง

TTS Coqui GreenOps Sustainabilityอ่านบทความ → TTS Coqui Multi-cloud Strategyอ่านบทความ → TTS Coqui Progressive Deliveryอ่านบทความ → TTS Coqui Capacity Planningอ่านบทความ → TTS Coqui Stream Processingอ่านบทความ →

📚 ดูบทความทั้งหมด →