Text Generation WebUI Chaos Engineering — ทดสอบ
Text Generation WebUI Chaos Engineering
Text Generation WebUI Chaos Engineering LLM oobabooga Resilience Circuit Breaker Retry Fallback GPU Queue Production Monitoring
| Chaos Experiment | Target | Expected Behavior | Severity |
|---|---|---|---|
| GPU Failure | Worker GPU | Fallback Model / Queue Hold | P1 Critical |
| Memory Pressure | VRAM/RAM | OOM Handler / Smaller Model | P1 Critical |
| High Concurrency | API Gateway | Queue + Rate Limit | P2 High |
| Network Partition | Worker Network | Timeout + Retry + Fallback | P2 High |
| Model Corruption | Model Files | Health Check + Alert + Reload | P1 Critical |
| Disk Full | Cache/Log Disk | Alert + Cleanup + Continue | P3 Medium |
Chaos Experiments
# === Chaos Engineering for LLM ===
# Experiment 1: GPU Failure Simulation
# nvidia-smi -i 0 --gpu-reset # Reset GPU 0
# OR: Kill text-gen-webui process
# kill -9 $(pgrep -f text-generation)
#
# Expected: Health check detects GPU down
#           Queue holds requests
#           Fallback to secondary GPU/Model
#           Alert sent to PagerDuty
#
# Experiment 2: Memory Pressure
# stress-ng --vm 1 --vm-bytes 90% -t 60s # Fill RAM
# OR: Load model larger than VRAM
#
# Expected: OOM handler catches error
#           Unload current model
#           Load smaller quantized model
#           Continue serving (degraded)
#
# Experiment 3: High Concurrency
# hey -n 1000 -c 100 http://localhost:5000/api/v1/generate
# OR: locust -f load_test.py --users 200 --spawn-rate 10
from dataclasses import dataclass
@dataclass
class ChaosExperiment:
    """Describe one chaos experiment as five free-text fields.

    Instances are plain records; the fields are printed verbatim by the
    report loop that follows the ``experiments`` list.
    """

    name: str  # heading shown in brackets for the experiment
    injection: str  # fault-injection command(s), printed under "Inject"
    steady_state: str  # steady-state conditions, printed under "Steady"
    expected_result: str  # behavior expected under the fault ("Expected")
    rollback: str  # recovery steps, printed under "Rollback"
# Catalogue of chaos experiments. Each entry is positional:
# (name, injection, steady_state, expected_result, rollback).
experiments = [
    ChaosExperiment(
        "GPU Failure",
        "Kill GPU process / nvidia-smi --gpu-reset",
        "Response < 5s | Success Rate > 99%",
        "Fallback Model activated | Queue holds | Alert sent",
        "Restart text-gen-webui | Reload Model",
    ),
    ChaosExperiment(
        "VRAM Exhaustion",
        "Load oversized model / stress-ng --vm",
        "VRAM < 90% | Model loaded",
        "OOM caught | Smaller model loaded | Degraded mode",
        "Unload model | Free VRAM | Reload correct model",
    ),
    ChaosExperiment(
        "100 Concurrent Users",
        "hey -n 1000 -c 100 / locust --users 200",
        "P95 < 10s | Error Rate < 1%",
        "Queue buffers | Rate limit kicks in | No crash",
        "Scale workers | Increase queue size",
    ),
    ChaosExperiment(
        "Network Partition",
        "iptables -A INPUT -p tcp --dport 5000 -j DROP",
        "API reachable | Timeout < 30s",
        "Client timeout + retry | Circuit breaker open",
        # Rollback deletes exactly the rule that the injection appended.
        "iptables -D INPUT -p tcp --dport 5000 -j DROP",
    ),
    ChaosExperiment(
        "Model File Corruption",
        "truncate -s 1M model.safetensors",
        "Model loaded | Health check pass",
        "Load error caught | Alert | Redownload model",
        "Restore from backup | Reload model",
    ),
]

# Print a plain-text report: one bracketed heading per experiment,
# followed by its four detail lines.
print("=== Chaos Experiments ===")
for experiment in experiments:
    print(f"\n [{experiment.name}]")
    print(f" Inject: {experiment.injection}")
    print(f" Steady: {experiment.steady_state}")
    print(f" Expected: {experiment.expected_result}")
    print(f" Rollback: {experiment.rollback}")
Resilience Patterns
# === Resilience Patterns for LLM Service ===
# Circuit Breaker (Python)
# import requests
# from circuitbreaker import circuit, CircuitBreakerError
#
# @circuit(failure_threshold=3, recovery_timeout=30)
# def generate_text(prompt, max_tokens=512):
#     response = requests.post("http://localhost:5000/api/v1/generate",
#                              json={"prompt": prompt, "max_new_tokens": max_tokens},
#                              timeout=30)
#     response.raise_for_status()
#     return response.json()
#
# # Fallback
# def generate_with_fallback(prompt):
#     try:
#         return generate_text(prompt)  # Primary (Local LLM)
#     except CircuitBreakerError:
#         return generate_cloud_fallback(prompt)  # Fallback (Cloud API)
@dataclass
class ResiliencePattern:
    """Describe one resilience pattern as four free-text fields.

    Instances are plain records; the fields are printed verbatim by the
    report loop that follows the ``patterns`` list.
    """

    pattern: str  # pattern name, shown in brackets in the report
    implementation: str  # libraries/tools listed next to the pattern name
    config: str  # example settings, printed under "Config"
    benefit: str  # what the pattern protects against ("Benefit")
# Catalogue of resilience patterns. Each entry is positional:
# (pattern, implementation, config, benefit).
patterns = [
    ResiliencePattern(
        "Circuit Breaker",
        "circuitbreaker library / Istio Circuit Breaker",
        "failure_threshold=3 recovery_timeout=30s",
        "ป้องกัน Cascade Failure หยุดส่ง Request ไป Service ที่ล้ม",
    ),
    ResiliencePattern(
        "Retry with Backoff",
        "tenacity library / urllib3 Retry",
        "max_retries=3 backoff=exponential(1,2,4,8)",
        "จัดการ Transient Error (Network Timeout)",
    ),
    ResiliencePattern(
        "Timeout",
        "requests.post(timeout=30) / aiohttp timeout",
        "connect=5s read=30s (Generation) total=60s",
        "ป้องกัน Hang ไม่รอ Response ตลอดไป",
    ),
    ResiliencePattern(
        "Fallback",
        "try/except → Fallback Model/API",
        "Primary: Local 13B → Fallback: Local 7B → Cloud API",
        "Service Available เสมอ แม้ Primary ล้มเหลว",
    ),
    ResiliencePattern(
        "Bulkhead",
        "ThreadPoolExecutor / Kubernetes Resource Limits",
        "Chat: 4 workers | API: 2 workers | Batch: 1 worker",
        "แยก Resource Pool ป้องกัน 1 Feature กิน Resource ทั้งหมด",
    ),
    ResiliencePattern(
        "Rate Limiting",
        "FastAPI SlowAPI / Nginx limit_req",
        "10 req/min/user | 100 req/min/total",
        "ป้องกัน Overload GPU จาก User เดียว",
    ),
    ResiliencePattern(
        "Health Check",
        "FastAPI /health endpoint / K8s liveness probe",
        "Check: Model loaded + GPU available + VRAM < 90%",
        "ตรวจจับปัญหาเร็ว Auto-restart ถ้า Unhealthy",
    ),
]

# Print a plain-text report: pattern name with its implementation on one
# line, followed by the config and benefit lines.
print("=== Resilience Patterns ===")
for entry in patterns:
    print(f" [{entry.pattern}] {entry.implementation}")
    print(f" Config: {entry.config}")
    print(f" Benefit: {entry.benefit}")
Production Monitoring
# === LLM Production Monitoring ===
@dataclass
class MonitorMetric:
    """Describe one production monitoring metric as four free-text fields.

    Instances are plain records; the fields are printed verbatim by the
    report loop that follows the ``metrics`` list.
    """

    metric: str  # metric name, shown in brackets in the report
    source: str  # where the metric comes from, printed under "Source"
    target: str  # healthy range or goal, printed under "Target"
    alert: str  # alert threshold and response, printed under "Alert"
# Catalogue of production metrics. Each entry is positional:
# (metric, source, target, alert).
metrics = [
    MonitorMetric(
        "GPU Utilization",
        "nvidia-smi / DCGM Exporter → Prometheus",
        "40-80% (Healthy) ไม่ควร 100% ตลอด",
        "> 95% sustained → P2 Scale Worker",
    ),
    MonitorMetric(
        "VRAM Usage",
        "nvidia-smi / DCGM Exporter → Prometheus",
        "< 90% ของ Total VRAM",
        "> 90% → P1 OOM Risk ลด Batch/Context",
    ),
    MonitorMetric(
        "Inference Latency",
        "Application Metrics → Prometheus",
        "P50 < 3s | P95 < 10s | P99 < 30s",
        "P95 > 15s → P2 Check GPU Load Queue",
    ),
    MonitorMetric(
        "Tokens/Second",
        "Application Metrics",
        "> 20 tok/s (7B) > 10 tok/s (13B)",
        "< 50% baseline → P2 Check GPU Thermal",
    ),
    MonitorMetric(
        "Queue Length",
        "Redis/RabbitMQ Metrics",
        "< 10 requests in queue",
        "> 50 → P2 Scale Workers | > 100 → P1",
    ),
    MonitorMetric(
        "Error Rate",
        "Application Logs / Metrics",
        "< 1% of total requests",
        "> 1% → P2 | > 5% → P1 Check Model/GPU",
    ),
]

# Print a plain-text report: one bracketed heading per metric, followed
# by its source, target and alert lines.
print("=== Production Monitoring ===")
for item in metrics:
    print(f" [{item.metric}]")
    print(f" Source: {item.source}")
    print(f" Target: {item.target}")
    print(f" Alert: {item.alert}")
เคล็ดลับ
- Fallback: มี Fallback Model เสมอ 13B → 7B → Cloud API
- Queue: ใช้ Queue จัดการ Request ไม่ส่งตรงไป GPU
- VRAM: Monitor VRAM < 90% ป้องกัน OOM Kill
- Chaos: ทดสอบ GPU Failure เดือนละครั้ง ตรวจ Fallback ทำงาน
- Rate Limit: จำกัด Request ต่อ User ป้องกัน GPU Overload
Text Generation WebUI คืออะไร
oobabooga Open Source LLM WebUI LLaMA Mistral GPTQ GGUF Chat API Extensions GPU VRAM Local Privacy