SiamCafe · Blog
LLM Fine-tuning LoRA Zero Downtime Deployment —
บทความ

LLM Fine-tuning LoRA Zero Downtime Deployment —

เผยแพร่ 28 พฤษภาคม 2569

LLM LoRA Zero Downtime

LLM Fine-tuning LoRA Zero Downtime Deployment —

บทความนี้อธิบายการ Deploy LLM ที่ Fine-tune ด้วย LoRA แบบ Zero Downtime ครอบคลุม Blue-Green, Canary, Rolling Update และ LoRA Hot-swap การใช้ PEFT/QLoRA Adapter รวมถึงการ Monitor และ Rollback

Strategy | Downtime | Risk | Complexity | Rollback
Blue-Green | 0 (Switch) | ต่ำ (Full Test ก่อน Switch) | ปานกลาง (2x Resource) | ทันที (Switch Back)
Canary | 0 (Gradual) | ต่ำมาก (5% Traffic ก่อน) | สูง (Traffic Splitting) | ทันที (Route 100% Old)
Rolling Update | 0 (K8s) | ปานกลาง | ต่ำ (K8s Native) | Auto (K8s Rollback)
LoRA Hot-swap | 0 (In-memory) | ต่ำ (Same Base Model) | ต่ำ (Pointer Swap) | ทันที (Swap Back)
Shadow Mode | 0 (No Impact) | ไม่มี (Log Only) | ปานกลาง | ไม่ต้อง (ไม่ Serve)

LoRA Fine-tuning

# === LoRA Fine-tuning with PEFT ===

# pip install transformers peft datasets accelerate bitsandbytes trl

# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# from trl import SFTTrainer, SFTConfig
# import torch
#
# # QLoRA: 4-bit Quantization + LoRA
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
# )
#
# model = AutoModelForCausalLM.from_pretrained(
#     "mistralai/Mistral-7B-v0.1",
#     quantization_config=bnb_config,
#     device_map="auto",
# )
# model = prepare_model_for_kbit_training(model)
#
# lora_config = LoraConfig(
#     r=16,                    # Rank
#     lora_alpha=32,           # Scaling factor
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# # trainable params: 13.6M || all params: 3.7B || trainable%: 0.37%
#
# # Training
# training_args = SFTConfig(
#     output_dir="./lora-adapter",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     learning_rate=2e-4,
#     logging_steps=10,
#     save_steps=500,
#     bf16=True,  # match bnb_4bit_compute_dtype=torch.bfloat16 above
# )
# trainer = SFTTrainer(model=model, args=training_args, train_dataset=dataset)
# trainer.train()
# model.save_pretrained("./lora-adapter")  # Save only adapter (~50MB)

from dataclasses import dataclass

@dataclass
class LoRAConfig:
    """One row of the LoRA hyperparameter reference table.

    Every field is free-form display text; nothing is parsed or validated.
    """

    param: str  # hyperparameter name, e.g. "rank (r)"
    recommended: str  # suggested value or range
    effect: str  # what changing the parameter does
    tip: str  # practical tuning advice

# LoRA hyperparameter cheat sheet: one entry per tunable setting.
configs = [
    LoRAConfig(
        param="rank (r)",
        recommended="8-32 (เริ่ม 16)",
        effect="สูง = Capacity มาก Memory มาก",
        tip="เริ่ม r=16 ถ้า Loss ไม่ลด เพิ่มเป็น 32",
    ),
    LoRAConfig(
        param="lora_alpha",
        recommended="2x rank (เช่น r=16 alpha=32)",
        effect="Scaling Factor สำหรับ LoRA Output",
        tip="alpha/r = effective learning rate ของ LoRA",
    ),
    LoRAConfig(
        param="target_modules",
        recommended="q_proj v_proj (เริ่มต้น) + k_proj o_proj (เพิ่ม)",
        effect="Modules ที่จะเพิ่ม LoRA Adapter",
        tip="เพิ่ม Module = ดีขึ้น แต่ Memory มาก",
    ),
    LoRAConfig(
        param="dropout",
        recommended="0.05-0.1",
        effect="ป้องกัน Overfitting",
        tip="Dataset น้อย ใช้ Dropout สูง 0.1",
    ),
    LoRAConfig(
        param="learning_rate",
        recommended="1e-4 ถึง 3e-4",
        effect="เร็ว = Diverge ช้า = ไม่ Converge",
        tip="ใช้ Cosine Schedule + Warmup 10%",
    ),
]

# Print the table: a header line per entry followed by two indented detail lines.
print("=== LoRA Config ===")
for c in configs:
    print(
        f"  [{c.param}] → {c.recommended}",
        f"    Effect: {c.effect}",
        f"    Tip: {c.tip}",
        sep="\n",
    )

Zero Downtime Deployment

LLM Fine-tuning LoRA Zero Downtime Deployment —
# === Zero Downtime Deployment Strategies ===

# LoRA Hot-swap (Fastest - No Restart)
# from peft import PeftModel
#
# # Load base model once at startup
# base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
#
# # Load adapter v1
# model = PeftModel.from_pretrained(base_model, "./lora-adapter-v1")
#
# # Hot-swap to adapter v2 (no restart!)
# model.load_adapter("./lora-adapter-v2", adapter_name="v2")
# model.set_adapter("v2")  # Switch instantly
#
# # Rollback to v1
# model.set_adapter("default")  # Back to v1

# Kubernetes Rolling Update
# apiVersion: apps/v1
# kind: Deployment
# spec:
#   replicas: 3
#   strategy:
#     type: RollingUpdate
#     rollingUpdate:
#       maxSurge: 1
#       maxUnavailable: 0  # Zero downtime
#   template:
#     spec:
#       containers:
#         - name: llm-server
#           image: llm-server:v2
#           env:
#             - name: ADAPTER_PATH
#               value: "s3://models/lora-adapter-v2"
#           readinessProbe:
#             httpGet:
#               path: /health
#               port: 8080
#             initialDelaySeconds: 60  # Model load time

@dataclass
class DeployStrategy:
    """One row of the zero-downtime deployment strategy comparison.

    Every field is free-form display text; nothing is parsed or validated.
    """

    strategy: str  # strategy name, e.g. "Blue-Green"
    steps: str  # numbered rollout steps written as a single line
    duration: str  # typical time a rollout takes
    resource_overhead: str  # extra capacity needed while rolling out
    best_for: str  # situations the strategy suits

# Comparison of the deployment strategies covered in this article.
strategies = [
    DeployStrategy(
        strategy="LoRA Hot-swap",
        steps="1.Upload Adapter 2.Load in Memory 3.set_adapter() 4.Done",
        duration="วินาที (Load Adapter ~50MB)",
        resource_overhead="ต่ำ (Same Base Model)",
        best_for="Adapter Update บ่อย Same Base Model",
    ),
    DeployStrategy(
        strategy="Blue-Green",
        steps="1.Deploy Green 2.Load Model 3.Test 4.Switch LB 5.Shutdown Blue",
        duration="5-15 นาที",
        resource_overhead="2x (ใช้ทั้ง Blue + Green)",
        best_for="Major Version Update Base Model Change",
    ),
    DeployStrategy(
        strategy="Canary (Istio/Nginx)",
        steps="1.Deploy Canary 2.Route 5% 3.Monitor 4.Increase 5.100%",
        duration="30-60 นาที (Gradual)",
        resource_overhead="1.05-1.5x",
        best_for="Risk-sensitive Production High Traffic",
    ),
    DeployStrategy(
        strategy="Rolling Update (K8s)",
        steps="1.Update Deployment 2.K8s Roll Pods 3.Readiness Check 4.Done",
        duration="5-30 นาที (ตาม Replicas)",
        resource_overhead="1.3x (maxSurge)",
        best_for="Standard K8s Deployment",
    ),
]

# Print each strategy as a blank-line-separated section with four detail lines.
print("=== Deploy Strategies ===")
for s in strategies:
    print(
        f"\n  [{s.strategy}]",
        f"    Steps: {s.steps}",
        f"    Duration: {s.duration}",
        f"    Overhead: {s.resource_overhead}",
        f"    Best for: {s.best_for}",
        sep="\n",
    )

Monitoring & Rollback

# === Monitoring & Auto-rollback ===

@dataclass
class MonitorMetric:
    """One row of the post-deployment monitoring and rollback table.

    Every field is free-form display text; thresholds are not evaluated here.
    """

    metric: str  # metric name, e.g. "Latency P99"
    baseline: str  # expected value under normal operation
    alert_threshold: str  # level at which an alert is raised
    rollback_trigger: str  # level at which a rollback is triggered

# Metrics to watch after a rollout, with alert and rollback levels for each.
metrics = [
    MonitorMetric(
        metric="Latency P99",
        baseline="< 500ms",
        alert_threshold="> 750ms (1.5x baseline)",
        rollback_trigger="> 1000ms (2x baseline) → Auto Rollback",
    ),
    MonitorMetric(
        metric="Error Rate (5xx)",
        baseline="< 0.1%",
        alert_threshold="> 1%",
        rollback_trigger="> 5% → Auto Rollback",
    ),
    MonitorMetric(
        metric="Throughput (req/s)",
        baseline="100 req/s",
        alert_threshold="< 80 req/s (20% drop)",
        rollback_trigger="< 50 req/s (50% drop) → Auto Rollback",
    ),
    MonitorMetric(
        metric="GPU Memory",
        baseline="< 80%",
        alert_threshold="> 85%",
        rollback_trigger="> 95% → OOM Risk → Rollback",
    ),
    MonitorMetric(
        metric="Quality Score",
        baseline="BLEU > 0.7",
        alert_threshold="BLEU < 0.65 (7% drop)",
        rollback_trigger="BLEU < 0.6 (14% drop) → Rollback",
    ),
    MonitorMetric(
        metric="Hallucination Rate",
        baseline="< 5%",
        alert_threshold="> 8%",
        rollback_trigger="> 15% → Auto Rollback",
    ),
]

# Print each metric as a header line followed by its alert and rollback levels.
print("=== Monitoring Metrics ===")
for m in metrics:
    print(
        f"  [{m.metric}] Baseline: {m.baseline}",
        f"    Alert: {m.alert_threshold}",
        f"    Rollback: {m.rollback_trigger}",
        sep="\n",
    )

เคล็ดลับ

  • Hot-swap: ใช้ LoRA Hot-swap สำหรับ Adapter Update เร็วที่สุด
  • QLoRA: ใช้ QLoRA 4-bit Fine-tune 7B Model ด้วย GPU 24GB
  • Canary: เริ่ม 5% Traffic ก่อน Monitor 30 นาที
  • Rollback: ตั้ง Auto-rollback ทุก Metric ป้องกัน Degradation
  • Shadow: ใช้ Shadow Mode ทดสอบก่อน Deploy จริง

LoRA คืออะไร

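LoRA (Low-Rank Adaptation) คือเทคนิค Fine-tune LLM ที่ฝึกเฉพาะ Adapter ขนาดเล็ก (ประมาณ 10-100MB) แทนการปรับ Weight ทั้งโมเดล ใช้ร่วมกับ QLoRA (4-bit) ผ่านไลบรารี PEFT ของ HuggingFace ได้ ค่าหลักที่ต้องตั้งคือ rank, alpha และ target_modules และรองรับการ Hot-swap ระหว่าง Multiple Adapters บน Base Model เดียวกัน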