SiamCafe · Blog
LLM Fine-tuning LoRA High Availability HA Setup
บทความ

LLM Fine-tuning LoRA High Availability HA Setup

เผยแพร่ 28 พฤษภาคม 2569

LLM LoRA HA

LLM Fine-tuning LoRA High Availability Low-Rank Adaptation Training Pipeline Model Serving vLLM TGI Kubernetes GPU Auto-scale Production Deployment

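| Method | Trainable Params | GPU Memory | Speed | Quality | เหมาะกับ |
|---|---|---|---|---|---|
| Full Fine-tune | 100% | สูงมาก | ช้า | ดีที่สุด | มี Resource มาก |
| LoRA | 0.1-1% | ต่ำ | เร็ว | ดีมาก | ทั่วไป |
| QLoRA | 0.1-1% | ต่ำมาก | เร็ว | ดี | GPU น้อย |
| Prefix Tuning | 0.01% | ต่ำมาก | เร็วมาก | ปานกลาง | Simple Task |
| Prompt Tuning | 0.001% | ต่ำมาก | เร็วมาก | ปานกลาง | Classification |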

LoRA Training

=== LoRA Fine-tuning Pipeline ===

pip install transformers peft datasets accelerate bitsandbytes trl

from peft import LoraConfig, get_peft_model, TaskType

from transformers import (

AutoModelForCausalLM, AutoTokenizer,

TrainingArguments, BitsAndBytesConfig

)

from trl import SFTTrainer

from datasets import load_dataset

import torch

# QLoRA — 4-bit Quantization
# The base model is loaded with 4-bit NF4 weights; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load Base Model
model_name = "meta-llama/Llama-3.1-8B"

# trust_remote_code is intentionally not set: Llama is implemented inside
# transformers, so the flag is unnecessary and would only permit Python code
# shipped in the model repository to run on this machine.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the EOS token (presumably because the base tokenizer defines no
# dedicated pad token — confirm for the checkpoint in use).
tokenizer.pad_token = tokenizer.eos_token

# LoRA Configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,  # Alpha = 2x Rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# NOTE(review): PEFT's quantization guide recommends calling
# prepare_model_for_kbit_training(model) before get_peft_model when the base
# model is loaded in 4-bit — consider adding it (requires importing it from peft).
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Example output (illustrative; exact counts depend on the model and PEFT version):
# trainable params: 13,107,200 || all params: 8,030,261,248 || trainable%: 0.163

# Training Arguments
training_args = TrainingArguments(
    output_dir="./lora-llama-8b",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per device per optimizer step
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    # `evaluation_strategy` is the deprecated spelling of this argument and is
    # rejected by current transformers releases; `eval_strategy` is the
    # supported name.
    eval_strategy="steps",
    eval_steps=500,  # kept equal to save_steps so every checkpoint has an eval score
    bf16=True,
    optim="paged_adamw_8bit",
)

from dataclasses import dataclass

@dataclass
class LoRAExperiment:
    """One row of the LoRA hyperparameter comparison printed below."""

    rank: int  # LoRA rank, shown as "r=" in the report
    alpha: int  # LoRA alpha, shown as "α=" in the report
    targets: str  # target modules as display text, e.g. "q_proj v_proj"
    params: str  # trainable parameter count as display text, e.g. "13.1M"
    gpu_mem: str  # GPU memory as display text, e.g. "14GB"
    train_time: str  # training time as display text, e.g. "3h"
    eval_loss: float

# Hard-coded sample results, one entry per LoRA configuration, ordered by rank.
experiments = [
    LoRAExperiment(8, 16, "q_proj v_proj", "6.5M", "12GB", "2h", 1.45),
    LoRAExperiment(16, 32, "q_proj k_proj v_proj o_proj", "13.1M", "14GB", "3h", 1.32),
    LoRAExperiment(32, 64, "q_proj k_proj v_proj o_proj", "26.2M", "16GB", "5h", 1.28),
    LoRAExperiment(64, 128, "all linear", "52.4M", "20GB", "8h", 1.25),
]

# Print a three-line summary for each configuration.
print("=== LoRA Experiments (LLaMA 8B) ===")
for e in experiments:
    print(f" [r={e.rank} α={e.alpha}] Targets: {e.targets}")
    print(f" Params: {e.params} | GPU: {e.gpu_mem} | Time: {e.train_time}")
    print(f" Eval Loss: {e.eval_loss}")

HA Architecture

=== High Availability LLM Serving ===

vLLM — High Performance Serving

pip install vllm

# vLLM only serves LoRA adapters when --enable-lora is set, so it must
# accompany --lora-modules. Each continuation line must directly follow the
# trailing backslash; a blank line in between ends the command early.
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-3.1-8B \
    --enable-lora \
    --lora-modules my-lora=./lora-adapter \
    --max-loras 4 \
    --port 8000 \
    --tensor-parallel-size 1

Kubernetes Deployment — Multi-replica

apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-serving
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-serving
  template:
    metadata:
      # The pod template labels must match spec.selector.matchLabels;
      # the API server rejects a Deployment whose selector matches no template labels.
      labels:
        app: llm-serving
    spec:
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          args:
            - --model=meta-llama/Llama-3.1-8B
            # vLLM only serves LoRA adapters when --enable-lora is set.
            - --enable-lora
            # NOTE(review): no volume or volumeMount is declared in this manifest,
            # so /models/lora-adapter must come from the image or a mount added here.
            - --lora-modules=my-lora=/models/lora-adapter
            - --max-model-len=4096
          ports: [{containerPort: 8000}]
          resources:
            limits:
              nvidia.com/gpu: 1
          # Readiness starts checking after 120s, liveness after 180s, so a pod
          # still loading the model is held out of service before it can be restarted.
          readinessProbe:
            httpGet: {path: /health, port: 8000}
            initialDelaySeconds: 120
            periodSeconds: 10
          livenessProbe:
            httpGet: {path: /health, port: 8000}
            initialDelaySeconds: 180
            periodSeconds: 30

HPA — Auto-scale on GPU

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-serving
  minReplicas: 2
  maxReplicas: 8
  metrics:
    # NOTE(review): gpu_utilization is a per-pod custom metric, not a built-in
    # resource metric; it presumably has to be published through the custom
    # metrics API (e.g. a GPU exporter plus a metrics adapter) — confirm the
    # metric name matches what the cluster exposes.
    - type: Pods
      pods:
        metric: {name: gpu_utilization}
        target: {type: AverageValue, averageValue: "70"}

@dataclass
class HAComponent:
    """One component of the high-availability serving stack, as display text."""

    component: str  # role in the stack, e.g. "Load Balancer"
    technology: str  # candidate products, e.g. "Nginx / Traefik"
    purpose: str
    replicas: str  # free-form replica description, e.g. "3-8 (GPU nodes)"
    failover: str  # how the component recovers from a failure

# Hard-coded description of each layer in the HA serving stack.
components = [
    HAComponent("Load Balancer", "Nginx / Traefik", "Route requests", "2 (active-passive)", "Auto failover"),
    HAComponent("Model Server", "vLLM / TGI", "Inference serving", "3-8 (GPU nodes)", "Health check + replace"),
    HAComponent("Model Registry", "MLflow / S3", "Store adapters", "Replicated storage", "Multi-region"),
    HAComponent("Cache", "Redis Cluster", "Cache responses", "3 nodes", "Sentinel failover"),
    HAComponent("Queue", "RabbitMQ / Redis", "Buffer requests", "3 nodes", "Mirror queues"),
    HAComponent("Monitoring", "Prometheus + Grafana", "GPU metrics latency", "2 replicas", "Alert on failure"),
]

# Print a three-line summary for each component.
print("\n=== HA Architecture ===")
for c in components:
    print(f" [{c.component}] {c.technology}")
    print(f" Purpose: {c.purpose} | Replicas: {c.replicas}")
    print(f" Failover: {c.failover}")

Production Monitoring

# === Production LLM Metrics ===

@dataclass
class LLMMetric:
    """One line of the production metrics report; every field is display text."""
    metric: str  # metric name, e.g. "Inference Latency (p50)"
    value: str  # current reading, e.g. "180ms"
    target: str  # desired range, e.g. "<200ms"
    alert: str  # condition that should raise an alert, e.g. "> 500ms"

# Sample readings; each row is (metric, value, target, alert).
metrics = [
    LLMMetric(*row)
    for row in (
        ("Inference Latency (p50)", "180ms", "<200ms", "> 500ms"),
        ("Inference Latency (p99)", "850ms", "<1000ms", "> 2000ms"),
        ("Throughput", "45 req/s", ">30 req/s", "< 15 req/s"),
        ("GPU Utilization", "72%", "60-80%", "> 90%"),
        ("GPU Memory", "85%", "<90%", "> 95%"),
        ("Error Rate", "0.1%", "<0.5%", "> 1%"),
        ("Cache Hit Rate", "35%", ">25%", "< 10%"),
        ("Model Load Time", "45s", "<60s", "> 120s"),
        ("Active Replicas", "3/3", "All healthy", "Any unhealthy"),
        ("Token Cost/day", "$120", "<$200", "> $300"),
    )
]

# Emit the heading, then one formatted line per metric.
print("Production Metrics:")
print(
    *(
        f"  {m.metric}: {m.value} (Target: {m.target} | Alert: {m.alert})"
        for m in metrics
    ),
    sep="\n",
)

# Release gate: checklist item -> what has to be in place before rollout.
deployment_checklist = {
    "Model Tested": "Eval loss < threshold, A/B test passed",
    "LoRA Adapter": "Uploaded to Model Registry with version tag",
    "Health Check": "Readiness + Liveness probes configured",
    "Auto-scale": "HPA on GPU utilization 70%",
    "Rollback": "Previous adapter version tagged for rollback",
    "Monitoring": "Prometheus metrics + Grafana dashboard",
    "Alerting": "PagerDuty for latency and error rate",
    "Blue-green": "New adapter deployed to canary first",
}

# Heading is preceded by two blank lines to separate it from earlier output.
print("\n\nDeployment Checklist:")
print(
    *(f"  [{item}]: {detail}" for item, detail in deployment_checklist.items()),
    sep="\n",
)

เคล็ดลับ

  • QLoRA: ใช้ QLoRA 4-bit ถ้า GPU จำกัด ประหยัด Memory 4x
  • Rank 16: เริ่มที่ Rank 16 Alpha 32 ปรับตามผลลัพธ์
  • vLLM: ใช้ vLLM สำหรับ Production Serving เร็วกว่า HF 3-5x
  • Replicas: Deploy อย่างน้อย 2 Replicas สำหรับ HA
  • Canary: Deploy Adapter ใหม่แบบ Canary ก่อน Full Rollout

LoRA Fine-tuning คืออะไร

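LoRA (Low-Rank Adaptation) คือเทคนิค Fine-tuning ที่ไม่อัปเดต Weight ทั้งหมดของโมเดล แต่เพิ่ม Low-rank Matrices ขนาดเล็กเข้าไปใน Attention Layers แล้วฝึกเฉพาะส่วนนั้น จึงลดจำนวน Parameters ที่ต้องฝึก ใช้ GPU น้อยลง และได้ไฟล์ Adapter ขนาดเล็ก ใช้ได้กับโมเดลอย่าง LLaMA และ Mistral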