LocalAI Self-hosted Progressive Delivery คืออะไร
LocalAI เป็น open-source AI inference server ที่รันบนเครื่องตัวเอง (self-hosted) รองรับ LLMs, Image Generation, Text-to-Speech, Speech-to-Text โดยไม่ต้องพึ่ง cloud APIs เข้ากันได้กับ OpenAI API format Progressive Delivery คือการปล่อย software version ใหม่แบบค่อยๆ เพิ่มจำนวนผู้ใช้ ใช้ feature flags, canary releases และ A/B testing เพื่อลดความเสี่ยง การรวม LocalAI กับ Progressive Delivery ช่วยให้อัพเดท AI models อย่างปลอดภัย ทดสอบ model ใหม่กับ traffic จริงก่อน rollout เต็ม
LocalAI Architecture
# localai_arch.py — LocalAI architecture
import json
class LocalAIArchitecture:
    """Overview of LocalAI: a feature catalog plus an example Docker deployment.

    Pure display class — the constants hold human-readable strings and the
    ``show_*`` methods print them to stdout; nothing here talks to a server.
    """

    # Feature catalog keyed by a short identifier; each value is a dict of
    # display strings (name/description plus feature-specific extras).
    FEATURES = {
        "openai_compatible": {
            "name": "OpenAI API Compatible",
            "description": "ใช้ endpoint เดียวกับ OpenAI — เปลี่ยน base URL เป็น LocalAI ได้เลย",
            "endpoints": "/v1/chat/completions, /v1/embeddings, /v1/images/generations",
        },
        "multi_model": {
            "name": "Multi-Model Support",
            "description": "รัน LLM, Stable Diffusion, Whisper, TTS พร้อมกันบน server เดียว",
            "models": "LLaMA, Mistral, Phi, SDXL, Whisper, Bark, Piper",
        },
        "gpu_cpu": {
            "name": "GPU + CPU Inference",
            "description": "รันได้ทั้ง GPU (CUDA, ROCm) และ CPU (AVX2, AVX512)",
            "quantization": "GGUF quantization: Q4_K_M, Q5_K_M, Q8_0",
        },
        "docker": {
            "name": "Docker Deployment",
            "description": "Deploy ง่ายด้วย Docker — GPU support ผ่าน nvidia-docker",
        },
    }

    # Example docker-compose manifest — printed for reference, never executed here.
    DEPLOYMENT = """
# docker-compose.yaml — LocalAI deployment
version: '3.8'
services:
localai:
image: localai/localai:latest-aio-gpu-nvidia-cuda-12
ports:
- "8080:8080"
volumes:
- ./models:/build/models
environment:
- THREADS=4
- CONTEXT_SIZE=4096
- GALLERIES=[{"name":"model-gallery","url":"github:mudler/LocalAI/gallery/index.yaml@master"}]
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
"""

    def show_features(self):
        """Print each feature's name and description to stdout."""
        print("=== LocalAI Features ===\n")
        # Iterate .values() — the catalog key never appears in the output.
        for feat in self.FEATURES.values():
            print(f"[{feat['name']}]")
            print(f" {feat['description']}")
            print()

    def show_deployment(self):
        """Print the first 400 characters of the example compose file."""
        print("=== Docker Deployment ===")
        print(self.DEPLOYMENT[:400])
# Demo: print the feature summary and the deployment snippet.
arch = LocalAIArchitecture()
arch.show_features()
arch.show_deployment()
Progressive Delivery for AI Models
# progressive.py — Progressive delivery patterns for AI models
import json
class AIProgressiveDelivery:
    """Catalog of progressive-delivery patterns and metrics for AI models.

    Pure display class — PATTERNS and MODEL_METRICS are reference data and
    the ``show_*`` methods print them to stdout.
    """

    # Delivery patterns keyed by a short identifier; values are display strings.
    PATTERNS = {
        "canary_model": {
            "name": "Canary Model Release",
            "description": "ปล่อย model ใหม่ให้ 5-10% ของ traffic ก่อน — วัด quality metrics",
            "flow": "Model v1 (90% traffic) + Model v2 (10% traffic) → compare metrics → promote/rollback",
        },
        "shadow_mode": {
            "name": "Shadow Mode Testing",
            "description": "ส่ง request ไปทั้ง model เก่าและใหม่ — ใช้ผลจาก model เก่า แต่ log ผลจาก model ใหม่",
            "benefit": "ทดสอบ model ใหม่กับ production traffic จริง โดยไม่กระทบ users",
        },
        "ab_testing": {
            "name": "A/B Testing Models",
            "description": "แบ่ง users เป็น 2 กลุ่ม — กลุ่ม A ใช้ model เก่า กลุ่ม B ใช้ model ใหม่",
            "metrics": "วัด quality score, latency, user satisfaction, cost per request",
        },
        "blue_green": {
            "name": "Blue-Green Model Swap",
            "description": "โหลด model ใหม่ใน instance ใหม่ → สลับ traffic ทีเดียว",
            "benefit": "Instant rollback — สลับกลับ model เก่าได้ทันที",
        },
        "gradual_rollout": {
            "name": "Gradual Rollout",
            "description": "เพิ่ม traffic ทีละ stage: 1% → 5% → 25% → 50% → 100%",
            "automation": "Auto-promote ถ้า metrics ดี > threshold สำหรับ 24 ชม.",
        },
    }

    # Metric names grouped by category (quality/performance/cost/user).
    MODEL_METRICS = {
        "quality": ["BLEU score", "Perplexity", "Human eval score", "Toxicity score"],
        "performance": ["P50/P95/P99 latency", "Tokens per second", "Time to first token"],
        "cost": ["GPU utilization", "Memory usage", "Cost per 1K tokens"],
        "user": ["User satisfaction (thumbs up/down)", "Completion rate", "Error rate"],
    }

    def show_patterns(self):
        """Print each pattern's name and description to stdout."""
        print("=== Progressive Delivery Patterns ===\n")
        # Iterate .values() — the pattern key never appears in the output.
        for p in self.PATTERNS.values():
            print(f"[{p['name']}]")
            print(f" {p['description']}")
            print()

    def show_metrics(self):
        """Print every metric grouped under its category name."""
        print("=== Model Metrics ===")
        for category, metrics in self.MODEL_METRICS.items():
            print(f"\n[{category}]")
            for m in metrics:
                print(f" • {m}")
# Demo: print the delivery patterns and the model-metric catalog.
pd = AIProgressiveDelivery()
pd.show_patterns()
pd.show_metrics()
Python Implementation
# implementation.py — Python progressive delivery for LocalAI
import json
class LocalAIProgressiveDelivery:
    """Carries an example canary model-router implementation as a printable string.

    The router sample is stored verbatim in CODE and printed (truncated) by
    show_code — it is never executed from here.
    """

    # Runtime string: example routing code shown to the reader byte-for-byte.
    # NOTE(review): the sample text uses datetime.utcnow(), which is deprecated
    # in recent Python — consider refreshing the sample in a separate change.
    CODE = """
# model_router.py — Route requests between model versions
import requests
import random
import hashlib
import time
from datetime import datetime
class ModelRouter:
def __init__(self, localai_url="http://localhost:8080"):
self.base_url = localai_url
self.models = {
'stable': {'name': 'llama-3-8b-q4', 'weight': 90},
'canary': {'name': 'llama-3.1-8b-q4', 'weight': 10},
}
self.metrics = {'stable': [], 'canary': []}
def route_request(self, user_id, prompt):
'''Route request based on canary weight'''
# Deterministic routing based on user_id
hash_val = int(hashlib.md5(user_id.encode()).hexdigest(), 16)
bucket = hash_val % 100
canary_weight = self.models['canary']['weight']
variant = 'canary' if bucket < canary_weight else 'stable'
model_name = self.models[variant]['name']
# Send request
start = time.time()
try:
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json={
'model': model_name,
'messages': [{'role': 'user', 'content': prompt}],
'max_tokens': 512,
},
timeout=30,
)
elapsed = (time.time() - start) * 1000
result = resp.json()
tokens = result.get('usage', {}).get('total_tokens', 0)
self.metrics[variant].append({
'timestamp': datetime.utcnow().isoformat(),
'latency_ms': round(elapsed),
'tokens': tokens,
'status': resp.status_code,
'error': resp.status_code != 200,
})
return {
'variant': variant,
'model': model_name,
'response': result,
'latency_ms': round(elapsed),
}
except Exception as e:
self.metrics[variant].append({
'timestamp': datetime.utcnow().isoformat(),
'error': True,
'message': str(e),
})
return {'variant': variant, 'error': str(e)}
def get_metrics_summary(self):
'''Compare metrics between stable and canary'''
summary = {}
for variant in ['stable', 'canary']:
data = self.metrics[variant]
if not data:
continue
latencies = [d['latency_ms'] for d in data if 'latency_ms' in d]
errors = sum(1 for d in data if d.get('error'))
summary[variant] = {
'total_requests': len(data),
'error_rate': round(errors / max(len(data), 1) * 100, 2),
'avg_latency_ms': round(sum(latencies) / max(len(latencies), 1)),
'p95_latency_ms': round(sorted(latencies)[int(len(latencies) * 0.95)] if latencies else 0),
}
return summary
def should_promote(self, threshold=None):
'''Check if canary should be promoted'''
if threshold is None:
threshold = {
'max_error_rate_diff': 2.0,
'max_latency_diff_pct': 20,
'min_requests': 100,
}
summary = self.get_metrics_summary()
stable = summary.get('stable', {})
canary = summary.get('canary', {})
if canary.get('total_requests', 0) < threshold['min_requests']:
return {'ready': False, 'reason': 'Not enough canary requests'}
error_diff = canary.get('error_rate', 0) - stable.get('error_rate', 0)
latency_diff = (canary.get('avg_latency_ms', 0) - stable.get('avg_latency_ms', 1)) / max(stable.get('avg_latency_ms', 1), 1) * 100
promote = (
error_diff <= threshold['max_error_rate_diff'] and
latency_diff <= threshold['max_latency_diff_pct']
)
return {
'ready': promote,
'error_diff': round(error_diff, 2),
'latency_diff_pct': round(latency_diff, 1),
'action': 'PROMOTE' if promote else 'HOLD or ROLLBACK',
}
def update_weights(self, canary_weight):
'''Update canary traffic weight'''
self.models['canary']['weight'] = canary_weight
self.models['stable']['weight'] = 100 - canary_weight
# router = ModelRouter("http://localhost:8080")
# result = router.route_request("user123", "Hello, how are you?")
# summary = router.get_metrics_summary()
# decision = router.should_promote()
"""

    def show_code(self):
        """Print the first 600 characters of the example router code."""
        print("=== Model Router ===")
        print(self.CODE[:600])
# Demo: print a truncated preview of the example router code.
impl = LocalAIProgressiveDelivery()
impl.show_code()
Kubernetes Deployment
# k8s.py — Kubernetes deployment with progressive delivery
import json
class K8sDeployment:
    """Example Kubernetes manifests for progressive delivery with Argo Rollouts.

    Both constants are verbatim display text; the ``show_*`` methods print
    truncated previews to stdout.
    """

    # Rollout manifest sample (runtime string — printed, not applied here).
    MANIFESTS = """
# localai-deployment.yaml — Kubernetes with Argo Rollouts
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
name: localai
spec:
replicas: 3
strategy:
canary:
steps:
- setWeight: 5
- pause: {duration: 1h}
- setWeight: 25
- pause: {duration: 2h}
- setWeight: 50
- pause: {duration: 4h}
- setWeight: 100
canaryService: localai-canary
stableService: localai-stable
analysis:
templates:
- templateName: localai-analysis
startingStep: 1
selector:
matchLabels:
app: localai
template:
metadata:
labels:
app: localai
spec:
containers:
- name: localai
image: localai/localai:latest-aio-gpu-nvidia-cuda-12
ports:
- containerPort: 8080
env:
- name: THREADS
value: "4"
- name: CONTEXT_SIZE
value: "4096"
resources:
limits:
nvidia.com/gpu: 1
memory: 16Gi
requests:
memory: 8Gi
volumeMounts:
- name: models
mountPath: /build/models
volumes:
- name: models
persistentVolumeClaim:
claimName: localai-models
"""

    # AnalysisTemplate sample driving automated promote/rollback decisions.
    ANALYSIS = """
# analysis-template.yaml — Argo Rollouts Analysis
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
name: localai-analysis
spec:
metrics:
- name: error-rate
interval: 5m
successCondition: result[0] < 0.05
provider:
prometheus:
address: http://prometheus:9090
query: |
sum(rate(http_requests_total{service="localai-canary",status=~"5.."}[5m]))
/ sum(rate(http_requests_total{service="localai-canary"}[5m]))
- name: latency-p95
interval: 5m
successCondition: result[0] < 2000
provider:
prometheus:
address: http://prometheus:9090
query: |
histogram_quantile(0.95, rate(http_request_duration_ms_bucket{service="localai-canary"}[5m]))
"""

    def show_manifests(self):
        """Print a 500-character preview of the Rollout manifest."""
        preview = self.MANIFESTS[:500]
        print("=== Kubernetes Rollout ===")
        print(preview)

    def show_analysis(self):
        """Print a 400-character preview of the analysis template."""
        preview = self.ANALYSIS[:400]
        print("\n=== Analysis Template ===")
        print(preview)
# Demo: print truncated previews of the Rollout and analysis manifests.
k8s = K8sDeployment()
k8s.show_manifests()
k8s.show_analysis()
Monitoring & Alerting
# monitoring.py — Monitoring for AI model progressive delivery
import json
class AIModelMonitoring:
    """Monitoring metrics and alert rules for AI-model progressive delivery.

    Pure display class — METRICS and ALERTS are reference data and the
    ``show_*`` methods print them to stdout.
    """

    # Metric catalog keyed by category id; each value has a display name
    # and a list of "metric — meaning" strings.
    METRICS = {
        "inference": {
            "name": "Inference Metrics",
            "items": [
                "request_duration_ms — latency per request",
                "tokens_per_second — throughput",
                "time_to_first_token — responsiveness",
                "total_tokens — usage tracking",
            ],
        },
        "quality": {
            "name": "Quality Metrics",
            "items": [
                "response_quality_score — automated quality check",
                "user_feedback — thumbs up/down ratio",
                "safety_score — toxicity/harmful content detection",
                "hallucination_rate — factual accuracy",
            ],
        },
        "system": {
            "name": "System Metrics",
            "items": [
                "gpu_utilization_pct — GPU usage",
                "gpu_memory_used_mb — VRAM usage",
                "model_load_time_ms — time to load model",
                "queue_depth — pending requests",
            ],
        },
    }

    # Alert rules: condition to watch, action to take, and severity level.
    ALERTS = {
        "canary_error_spike": {
            "condition": "canary error_rate > stable error_rate * 2",
            "action": "Auto-rollback canary",
            "severity": "critical",
        },
        "canary_latency_high": {
            "condition": "canary p95_latency > stable p95_latency * 1.5",
            "action": "Pause rollout, investigate",
            "severity": "warning",
        },
        "quality_degradation": {
            "condition": "canary quality_score < stable quality_score - 10%",
            "action": "Auto-rollback, review model",
            "severity": "critical",
        },
    }

    def show_metrics(self):
        """Print each category's name and its first two metric items."""
        print("=== Monitoring Metrics ===\n")
        # Iterate .values() — the category key never appears in the output.
        for category in self.METRICS.values():
            print(f"[{category['name']}]")
            for item in category['items'][:2]:
                print(f" • {item}")
            print()

    def show_alerts(self):
        """Print every alert with its key, severity, condition and action."""
        print("=== Alerts ===")
        for key, alert in self.ALERTS.items():
            print(f"\n[{key}] ({alert['severity']})")
            print(f" Condition: {alert['condition']}")
            print(f" Action: {alert['action']}")
# Demo: print the metric catalog and the alert rules.
monitor = AIModelMonitoring()
monitor.show_metrics()
monitor.show_alerts()
FAQ - คำถามที่พบบ่อย
Q: ทำไมต้อง Progressive Delivery สำหรับ AI models?
A: AI models มีความเสี่ยงสูงกว่า code ทั่วไป: Model ใหม่อาจ hallucinate มากขึ้น, ตอบผิด, หรือมี bias ที่ไม่คาดคิด Latency อาจเปลี่ยน — model ใหญ่กว่า = ช้ากว่า Quality ยากวัดอัตโนมัติ — ต้อง human eval + automated checks Progressive Delivery: ปล่อย model ใหม่ให้คนส่วนน้อยก่อน → วัด metrics → promote หรือ rollback
Q: LocalAI กับ vLLM อันไหนดีกว่า?
A: LocalAI: multi-modal (LLM + Image + TTS + STT), OpenAI-compatible, ง่าย deploy vLLM: เน้น LLM อย่างเดียว, throughput สูงกว่า (PagedAttention), production-grade เลือก LocalAI: ถ้าต้องการ all-in-one (LLM + Image + Audio) เลือก vLLM: ถ้าเน้น LLM performance สูงสุด, high-throughput serving ใช้ร่วมกัน: vLLM สำหรับ LLM + LocalAI สำหรับ Image/Audio
Q: ต้องใช้ GPU ไหม?
A: ไม่จำเป็น — LocalAI รันบน CPU ได้ (ใช้ GGUF quantized models) CPU: ช้ากว่า (1-5 tokens/sec สำหรับ 7B model) แต่ใช้ได้สำหรับ low-traffic GPU: เร็วกว่ามาก (20-50 tokens/sec สำหรับ 7B model) — จำเป็นสำหรับ production แนะนำ: NVIDIA RTX 4090 (24GB) สำหรับ dev, A100/H100 สำหรับ production Quantization: Q4_K_M ให้ quality ดีโดยที่ memory ลดลง 75%
Q: Shadow Mode Testing ทำอย่างไร?
A: 1) ส่ง request ไป model เก่า (ใช้ผลนี้ตอบ user) 2) ส่ง request เดียวกันไป model ใหม่ (async, ไม่ตอบ user) 3) Log ผลทั้งสองและเปรียบเทียบ 4) วัด: quality, latency, token usage ข้อดี: ทดสอบกับ production traffic จริง ไม่กระทบ users ข้อเสีย: ใช้ resources 2 เท่า (รัน 2 models พร้อมกัน)
