LLM LoRA Zero Downtime
LLM Fine-tuning LoRA Zero Downtime Deployment Blue-Green Canary Rolling Update Hot-swap PEFT QLoRA Adapter Monitor Rollback
| Strategy | Downtime | Risk | Complexity | Rollback |
|---|---|---|---|---|
| Blue-Green | 0 (Switch) | ต่ำ (Full Test ก่อน Switch) | ปานกลาง (2x Resource) | ทันที (Switch Back) |
| Canary | 0 (Gradual) | ต่ำมาก (5% Traffic ก่อน) | สูง (Traffic Splitting) | ทันที (Route 100% Old) |
| Rolling Update | 0 (K8s) | ปานกลาง | ต่ำ (K8s Native) | Auto (K8s Rollback) |
| LoRA Hot-swap | 0 (In-memory) | ต่ำ (Same Base Model) | ต่ำ (Pointer Swap) | ทันที (Swap Back) |
| Shadow Mode | 0 (No Impact) | ไม่มี (Log Only) | ปานกลาง | ไม่ต้อง (ไม่ Serve) |
LoRA Fine-tuning
# === LoRA Fine-tuning with PEFT ===
# pip install transformers peft datasets accelerate bitsandbytes trl
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# from trl import SFTTrainer, SFTConfig
# import torch
#
# # QLoRA: 4-bit Quantization + LoRA
# bnb_config = BitsAndBytesConfig(
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.bfloat16,
# bnb_4bit_use_double_quant=True,
# )
#
# model = AutoModelForCausalLM.from_pretrained(
# "mistralai/Mistral-7B-v0.1",
# quantization_config=bnb_config,
# device_map="auto",
# )
# model = prepare_model_for_kbit_training(model)
#
# lora_config = LoraConfig(
# r=16, # Rank
# lora_alpha=32, # Scaling factor
# target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
# lora_dropout=0.05,
# bias="none",
# task_type="CAUSAL_LM",
# )
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# # trainable params: 13.6M || all params: 3.7B || trainable%: 0.37%
#
# # Training
# training_args = SFTConfig(
# output_dir="./lora-adapter",
# num_train_epochs=3,
# per_device_train_batch_size=4,
# gradient_accumulation_steps=4,
# learning_rate=2e-4,
# logging_steps=10,
# save_steps=500,
# bf16=True,  # match bnb_4bit_compute_dtype=torch.bfloat16 above (fp16 would mix precisions)
# )
# trainer = SFTTrainer(model=model, args=training_args, train_dataset=dataset)
# trainer.train()
# model.save_pretrained("./lora-adapter") # Save only adapter (~50MB)
from dataclasses import dataclass
@dataclass
class LoRAConfig:
    """One row of LoRA hyper-parameter guidance.

    Each instance pairs a tunable LoRA parameter with a recommended
    value range, its effect on training, and a practical tuning tip.
    All fields are free-text display strings (some in Thai).
    """
    param: str        # hyper-parameter name, e.g. "rank (r)", "lora_alpha"
    recommended: str  # suggested starting value / range
    effect: str       # what increasing or decreasing this parameter does
    tip: str          # practical tuning advice
# Guidance table for the key LoRA hyper-parameters: (param, recommended,
# effect, tip) rows, turned into LoRAConfig records and printed below.
_LORA_ROWS = (
    ("rank (r)",
     "8-32 (เริ่ม 16)",
     "สูง = Capacity มาก Memory มาก",
     "เริ่ม r=16 ถ้า Loss ไม่ลด เพิ่มเป็น 32"),
    ("lora_alpha",
     "2x rank (เช่น r=16 alpha=32)",
     "Scaling Factor สำหรับ LoRA Output",
     "alpha/r = effective learning rate ของ LoRA"),
    ("target_modules",
     "q_proj v_proj (เริ่มต้น) + k_proj o_proj (เพิ่ม)",
     "Modules ที่จะเพิ่ม LoRA Adapter",
     "เพิ่ม Module = ดีขึ้น แต่ Memory มาก"),
    ("dropout",
     "0.05-0.1",
     "ป้องกัน Overfitting",
     "Dataset น้อย ใช้ Dropout สูง 0.1"),
    ("learning_rate",
     "1e-4 ถึง 3e-4",
     "เร็ว = Diverge ช้า = ไม่ Converge",
     "ใช้ Cosine Schedule + Warmup 10%"),
)
configs = [LoRAConfig(*row) for row in _LORA_ROWS]

print("=== LoRA Config ===")
for cfg in configs:
    # One banner line per parameter, then its effect and tuning tip.
    print("\n".join((
        f" [{cfg.param}] → {cfg.recommended}",
        f" Effect: {cfg.effect}",
        f" Tip: {cfg.tip}",
    )))
Zero Downtime Deployment
# === Zero Downtime Deployment Strategies ===
# LoRA Hot-swap (Fastest - No Restart)
# from peft import PeftModel
#
# # Load base model once at startup
# base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
#
# # Load adapter v1
# model = PeftModel.from_pretrained(base_model, "./lora-adapter-v1")
#
# # Hot-swap to adapter v2 (no restart!)
# model.load_adapter("./lora-adapter-v2", adapter_name="v2")
# model.set_adapter("v2") # Switch instantly
#
# # Rollback to v1
# model.set_adapter("default") # Back to v1
# Kubernetes Rolling Update
# apiVersion: apps/v1
# kind: Deployment
# spec:
# replicas: 3
# strategy:
# type: RollingUpdate
# rollingUpdate:
# maxSurge: 1
# maxUnavailable: 0 # Zero downtime
# template:
# spec:
# containers:
# - name: llm-server
# image: llm-server:v2
# env:
# - name: ADAPTER_PATH
# value: "s3://models/lora-adapter-v2"
# readinessProbe:
# httpGet:
# path: /health
# port: 8080
# initialDelaySeconds: 60 # Model load time
@dataclass
class DeployStrategy:
    """One row of the zero-downtime deployment strategy comparison.

    Summarizes a rollout strategy: its step sequence, typical duration,
    extra resource cost, and the situation it fits best.
    All fields are free-text display strings (some in Thai).
    """
    strategy: str           # strategy name, e.g. "Blue-Green", "LoRA Hot-swap"
    steps: str              # numbered step sequence as a single string
    duration: str           # typical end-to-end rollout time
    resource_overhead: str  # extra capacity needed during the rollout
    best_for: str           # scenario where this strategy is the best fit
# Comparison table of zero-downtime rollout strategies: (strategy, steps,
# duration, resource_overhead, best_for) rows built into records and printed.
_STRATEGY_ROWS = (
    ("LoRA Hot-swap",
     "1.Upload Adapter 2.Load in Memory 3.set_adapter() 4.Done",
     "วินาที (Load Adapter ~50MB)",
     "ต่ำ (Same Base Model)",
     "Adapter Update บ่อย Same Base Model"),
    ("Blue-Green",
     "1.Deploy Green 2.Load Model 3.Test 4.Switch LB 5.Shutdown Blue",
     "5-15 นาที",
     "2x (ใช้ทั้ง Blue + Green)",
     "Major Version Update Base Model Change"),
    ("Canary (Istio/Nginx)",
     "1.Deploy Canary 2.Route 5% 3.Monitor 4.Increase 5.100%",
     "30-60 นาที (Gradual)",
     "1.05-1.5x",
     "Risk-sensitive Production High Traffic"),
    ("Rolling Update (K8s)",
     "1.Update Deployment 2.K8s Roll Pods 3.Readiness Check 4.Done",
     "5-30 นาที (ตาม Replicas)",
     "1.3x (maxSurge)",
     "Standard K8s Deployment"),
)
strategies = [DeployStrategy(*row) for row in _STRATEGY_ROWS]

print("=== Deploy Strategies ===")
for plan in strategies:
    # Blank line before each strategy header, then its detail lines.
    print("\n".join((
        f"\n [{plan.strategy}]",
        f" Steps: {plan.steps}",
        f" Duration: {plan.duration}",
        f" Overhead: {plan.resource_overhead}",
        f" Best for: {plan.best_for}",
    )))
Monitoring & Rollback
# === Monitoring & Auto-rollback ===
@dataclass
class MonitorMetric:
    """One row of the post-deployment monitoring / auto-rollback table.

    Pairs a serving metric with its healthy baseline, the threshold that
    should raise an alert, and the threshold that should trigger rollback.
    All fields are free-text display strings.
    """
    metric: str            # metric name, e.g. "Latency P99", "Error Rate (5xx)"
    baseline: str          # expected healthy value/range
    alert_threshold: str   # value at which to alert operators
    rollback_trigger: str  # value at which to (auto-)rollback the deployment
# Monitoring table: (metric, baseline, alert_threshold, rollback_trigger)
# rows built into MonitorMetric records and printed below.
_METRIC_ROWS = (
    ("Latency P99",
     "< 500ms",
     "> 750ms (1.5x baseline)",
     "> 1000ms (2x baseline) → Auto Rollback"),
    ("Error Rate (5xx)",
     "< 0.1%",
     "> 1%",
     "> 5% → Auto Rollback"),
    ("Throughput (req/s)",
     "100 req/s",
     "< 80 req/s (20% drop)",
     "< 50 req/s (50% drop) → Auto Rollback"),
    ("GPU Memory",
     "< 80%",
     "> 85%",
     "> 95% → OOM Risk → Rollback"),
    ("Quality Score",
     "BLEU > 0.7",
     "BLEU < 0.65 (7% drop)",
     "BLEU < 0.6 (14% drop) → Rollback"),
    ("Hallucination Rate",
     "< 5%",
     "> 8%",
     "> 15% → Auto Rollback"),
)
metrics = [MonitorMetric(*row) for row in _METRIC_ROWS]

print("=== Monitoring Metrics ===")
for row in metrics:
    # Header line with baseline, then the alert and rollback thresholds.
    print("\n".join((
        f" [{row.metric}] Baseline: {row.baseline}",
        f" Alert: {row.alert_threshold}",
        f" Rollback: {row.rollback_trigger}",
    )))
เคล็ดลับ
- Hot-swap: ใช้ LoRA Hot-swap สำหรับ Adapter Update เร็วที่สุด
- QLoRA: ใช้ QLoRA 4-bit Fine-tune 7B Model ด้วย GPU 24GB
- Canary: เริ่ม 5% Traffic ก่อน Monitor 30 นาที
- Rollback: ตั้ง Auto-rollback ทุก Metric ป้องกัน Degradation
- Shadow: ใช้ Shadow Mode ทดสอบก่อน Deploy จริง
LoRA คืออะไร
Low-Rank Adaptation Fine-tune LLM Adapter 10-100MB QLoRA 4-bit PEFT HuggingFace rank alpha target_modules Hot-swap Multiple Adapters
Fine-tune ทำอย่างไร
PEFT Library QLoRA BitsAndBytesConfig LoraConfig r=16 alpha=32 SFTTrainer Dataset JSONL Save Adapter 50MB Merge Inference
Zero Downtime Deploy ทำอย่างไร
Hot-swap set_adapter() Blue-Green Switch Canary 5% Rolling Update K8s Shadow Mode Readiness Probe Rollback ทันที
Monitor อย่างไร
Latency P99 Error Rate Throughput GPU Memory Quality BLEU Hallucination Drift Auto-rollback Threshold Prometheus Grafana
สรุป
LLM Fine-tuning LoRA QLoRA PEFT Zero Downtime Blue-Green Canary Hot-swap Rolling Update Monitor Rollback Production
