Text Generation WebUI Chaos Engineering
Text Generation WebUI Chaos Engineering LLM oobabooga Resilience Circuit Breaker Retry Fallback GPU Queue Production Monitoring
| Chaos Experiment | Target | Expected Behavior | Severity |
|---|---|---|---|
| GPU Failure | Worker GPU | Fallback Model / Queue Hold | P1 Critical |
| Memory Pressure | VRAM/RAM | OOM Handler / Smaller Model | P1 Critical |
| High Concurrency | API Gateway | Queue + Rate Limit | P2 High |
| Network Partition | Worker Network | Timeout + Retry + Fallback | P2 High |
| Model Corruption | Model Files | Health Check + Alert + Reload | P1 Critical |
| Disk Full | Cache/Log Disk | Alert + Cleanup + Continue | P3 Medium |
Chaos Experiments
# === Chaos Engineering for LLM ===
# Experiment 1: GPU Failure Simulation
# nvidia-smi -i 0 --gpu-reset # Reset GPU 0
# OR: Kill text-gen-webui process
# kill -9 $(pgrep -f text-generation)
#
# Expected: Health check detects GPU down
# Queue holds requests
# Fallback to secondary GPU/Model
# Alert sent to PagerDuty
#
# Experiment 2: Memory Pressure
# stress-ng --vm 1 --vm-bytes 90% -t 60s # Fill RAM
# OR: Load model larger than VRAM
#
# Expected: OOM handler catches error
# Unload current model
# Load smaller quantized model
# Continue serving (degraded)
#
# Experiment 3: High Concurrency
# hey -n 1000 -c 100 http://localhost:5000/api/v1/generate
# OR: locust -f load_test.py --users 200 --spawn-rate 10
from dataclasses import dataclass
@dataclass
class ChaosExperiment:
    """One chaos-engineering experiment plan for the LLM service.

    Each field is a free-form description used for display/reporting only;
    no behavior is attached to these values.
    """

    name: str             # short experiment title, e.g. "GPU Failure"
    injection: str        # how the fault is injected (command or action)
    steady_state: str     # steady-state hypothesis / SLO that must hold
    expected_result: str  # resilient behavior expected under the fault
    rollback: str         # how to restore the system after the experiment
# Catalogue of chaos experiments: fault injection, steady-state hypothesis,
# expected resilient behavior, and rollback procedure for each scenario.
experiments = [
    ChaosExperiment("GPU Failure",
                    "Kill GPU process / nvidia-smi --gpu-reset",
                    "Response < 5s | Success Rate > 99%",
                    "Fallback Model activated | Queue holds | Alert sent",
                    "Restart text-gen-webui | Reload Model"),
    ChaosExperiment("VRAM Exhaustion",
                    "Load oversized model / stress-ng --vm",
                    "VRAM < 90% | Model loaded",
                    "OOM caught | Smaller model loaded | Degraded mode",
                    "Unload model | Free VRAM | Reload correct model"),
    ChaosExperiment("100 Concurrent Users",
                    "hey -n 1000 -c 100 / locust --users 200",
                    "P95 < 10s | Error Rate < 1%",
                    "Queue buffers | Rate limit kicks in | No crash",
                    "Scale workers | Increase queue size"),
    ChaosExperiment("Network Partition",
                    "iptables -A INPUT -p tcp --dport 5000 -j DROP",
                    "API reachable | Timeout < 30s",
                    "Client timeout + retry | Circuit breaker open",
                    "iptables -D INPUT -p tcp --dport 5000 -j DROP"),
    ChaosExperiment("Model File Corruption",
                    "truncate -s 1M model.safetensors",
                    "Model loaded | Health check pass",
                    "Load error caught | Alert | Redownload model",
                    "Restore from backup | Reload model"),
]

# Print the experiment catalogue as a human-readable run sheet.
print("=== Chaos Experiments ===")
for e in experiments:
    print(f"\n [{e.name}]")
    print(f" Inject: {e.injection}")
    print(f" Steady: {e.steady_state}")
    print(f" Expected: {e.expected_result}")
    print(f" Rollback: {e.rollback}")
Resilience Patterns
# === Resilience Patterns for LLM Service ===
# Circuit Breaker (Python)
# from circuitbreaker import circuit
#
# @circuit(failure_threshold=3, recovery_timeout=30)
# def generate_text(prompt, max_tokens=512):
# response = requests.post("http://localhost:5000/api/v1/generate",
# json={"prompt": prompt, "max_new_tokens": max_tokens},
# timeout=30)
# response.raise_for_status()
# return response.json()
#
# # Fallback
# def generate_with_fallback(prompt):
# try:
# return generate_text(prompt) # Primary (Local LLM)
# except CircuitBreakerError:
# return generate_cloud_fallback(prompt) # Fallback (Cloud API)
@dataclass
class ResiliencePattern:
    """One resilience pattern entry for the LLM service catalogue.

    All fields are descriptive strings used purely for display.
    """

    pattern: str         # pattern name, e.g. "Circuit Breaker"
    implementation: str  # library/tool used to implement the pattern
    config: str          # representative configuration values
    benefit: str         # why the pattern helps (free-form text)
# Catalogue of resilience patterns applied to the text-generation service.
# Benefit strings are intentionally kept in Thai (user-facing content).
patterns = [
    ResiliencePattern("Circuit Breaker",
                      "circuitbreaker library / Istio Circuit Breaker",
                      "failure_threshold=3 recovery_timeout=30s",
                      "ป้องกัน Cascade Failure หยุดส่ง Request ไป Service ที่ล้ม"),
    ResiliencePattern("Retry with Backoff",
                      "tenacity library / urllib3 Retry",
                      "max_retries=3 backoff=exponential(1,2,4,8)",
                      "จัดการ Transient Error (Network Timeout)"),
    ResiliencePattern("Timeout",
                      "requests.post(timeout=30) / aiohttp timeout",
                      "connect=5s read=30s (Generation) total=60s",
                      "ป้องกัน Hang ไม่รอ Response ตลอดไป"),
    ResiliencePattern("Fallback",
                      "try/except → Fallback Model/API",
                      "Primary: Local 13B → Fallback: Local 7B → Cloud API",
                      "Service Available เสมอ แม้ Primary ล้มเหลว"),
    ResiliencePattern("Bulkhead",
                      "ThreadPoolExecutor / Kubernetes Resource Limits",
                      "Chat: 4 workers | API: 2 workers | Batch: 1 worker",
                      "แยก Resource Pool ป้องกัน 1 Feature กิน Resource ทั้งหมด"),
    ResiliencePattern("Rate Limiting",
                      "FastAPI SlowAPI / Nginx limit_req",
                      "10 req/min/user | 100 req/min/total",
                      "ป้องกัน Overload GPU จาก User เดียว"),
    ResiliencePattern("Health Check",
                      "FastAPI /health endpoint / K8s liveness probe",
                      "Check: Model loaded + GPU available + VRAM < 90%",
                      "ตรวจจับปัญหาเร็ว Auto-restart ถ้า Unhealthy"),
]

# Print the pattern catalogue as a human-readable summary.
print("=== Resilience Patterns ===")
for p in patterns:
    print(f" [{p.pattern}] {p.implementation}")
    print(f" Config: {p.config}")
    print(f" Benefit: {p.benefit}")
Production Monitoring
# === LLM Production Monitoring ===
@dataclass
class MonitorMetric:
    """One production-monitoring metric entry for the LLM service.

    All fields are descriptive strings used purely for display.
    """

    metric: str  # metric name, e.g. "GPU Utilization"
    source: str  # where the metric is collected from
    target: str  # healthy/target range for the metric
    alert: str   # alerting threshold and severity/action
# Catalogue of production metrics with healthy targets and alert thresholds.
# Some target/alert strings contain Thai text (user-facing content).
metrics = [
    MonitorMetric("GPU Utilization",
                  "nvidia-smi / DCGM Exporter → Prometheus",
                  "40-80% (Healthy) ไม่ควร 100% ตลอด",
                  "> 95% sustained → P2 Scale Worker"),
    MonitorMetric("VRAM Usage",
                  "nvidia-smi / DCGM Exporter → Prometheus",
                  "< 90% ของ Total VRAM",
                  "> 90% → P1 OOM Risk ลด Batch/Context"),
    MonitorMetric("Inference Latency",
                  "Application Metrics → Prometheus",
                  "P50 < 3s | P95 < 10s | P99 < 30s",
                  "P95 > 15s → P2 Check GPU Load Queue"),
    MonitorMetric("Tokens/Second",
                  "Application Metrics",
                  "> 20 tok/s (7B) > 10 tok/s (13B)",
                  "< 50% baseline → P2 Check GPU Thermal"),
    MonitorMetric("Queue Length",
                  "Redis/RabbitMQ Metrics",
                  "< 10 requests in queue",
                  "> 50 → P2 Scale Workers | > 100 → P1"),
    MonitorMetric("Error Rate",
                  "Application Logs / Metrics",
                  "< 1% of total requests",
                  "> 1% → P2 | > 5% → P1 Check Model/GPU"),
]

# Print the monitoring catalogue as a human-readable summary.
print("=== Production Monitoring ===")
for m in metrics:
    print(f" [{m.metric}]")
    print(f" Source: {m.source}")
    print(f" Target: {m.target}")
    print(f" Alert: {m.alert}")
เคล็ดลับ
- Fallback: มี Fallback Model เสมอ 13B → 7B → Cloud API
- Queue: ใช้ Queue จัดการ Request ไม่ส่งตรงไป GPU
- VRAM: Monitor VRAM < 90% ป้องกัน OOM Kill
- Chaos: ทดสอบ GPU Failure เดือนละครั้ง ตรวจ Fallback ทำงาน
- Rate Limit: จำกัด Request ต่อ User ป้องกัน GPU Overload
Text Generation WebUI คืออะไร
oobabooga Open Source LLM WebUI LLaMA Mistral GPTQ GGUF Chat API Extensions GPU VRAM Local Privacy
Chaos Engineering สำหรับ AI คืออะไร
ทดสอบความล้มเหลว GPU Failure Memory Pressure Concurrency Network Model Corruption Steady State Blast Radius Automate
Resilience Patterns มีอะไร
Circuit Breaker Retry Backoff Timeout Fallback Bulkhead Rate Limiting Health Check Queue Management Worker Pool Scale
Production Setup ทำอย่างไร
LB Gateway Queue Worker GPU Docker Kubernetes Monitor VRAM Latency Tokens/s Error Rate Canary Deploy Rollback Scale
สรุป
Text Generation WebUI Chaos Engineering LLM Resilience Circuit Breaker Fallback Queue GPU VRAM Monitoring Production Scale Deploy
