LLM Fine-tuning LoRA
LLM Fine-tuning LoRA Low-Rank Adaptation QLoRA Memory Efficient Trainable Parameters Progressive Delivery Canary Feature Flags A/B Testing Model Deployment
| Method | Parameters | VRAM (7B) | Speed | Quality |
|---|---|---|---|---|
| Full Fine-tune | 100% | ~60GB | ช้า | สูงสุด |
| LoRA | ~1% | ~24GB | เร็ว | สูง |
| QLoRA | ~1% | ~12GB | เร็ว | สูง |
| Prompt Tuning | ~0.01% | ~16GB | เร็วมาก | ปานกลาง |
| Prefix Tuning | ~0.1% | ~18GB | เร็ว | ปานกลาง-สูง |
LoRA Fine-tuning Implementation
# === LoRA Fine-tuning with PEFT ===
# pip install transformers peft datasets accelerate bitsandbytes
# from transformers import (
# AutoModelForCausalLM, AutoTokenizer,
# TrainingArguments, Trainer, BitsAndBytesConfig,
# )
# from peft import LoraConfig, get_peft_model, TaskType
# from datasets import load_dataset
# import torch
#
# # QLoRA: 4-bit quantization
# bnb_config = BitsAndBytesConfig(
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.bfloat16,
# bnb_4bit_use_double_quant=True,
# )
#
# # Load Base Model
# model = AutoModelForCausalLM.from_pretrained(
# "meta-llama/Llama-2-7b-hf",
# quantization_config=bnb_config,
# device_map="auto",
# trust_remote_code=True,
# )
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# tokenizer.pad_token = tokenizer.eos_token
#
# # LoRA Config
# lora_config = LoraConfig(
# r=16, # Rank
# lora_alpha=32, # Alpha
# target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
# lora_dropout=0.05,
# bias="none",
# task_type=TaskType.CAUSAL_LM,
# )
#
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# # trainable params: 16,777,216 || all params: 6,742,609,920 || 0.25%
#
# # Training
# training_args = TrainingArguments(
# output_dir="./lora-output",
# num_train_epochs=3,
# per_device_train_batch_size=4,
# gradient_accumulation_steps=4,
# learning_rate=2e-4,
# warmup_steps=100,
# logging_steps=10,
# save_strategy="epoch",
# bf16=True,  # match bnb_4bit_compute_dtype=torch.bfloat16 above
# optim="paged_adamw_8bit",
# )
#
# trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=train_dataset,
# eval_dataset=eval_dataset,
# )
# trainer.train()
# model.save_pretrained("./lora-adapter")
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class LoRAExperiment:
    """One LoRA fine-tuning run and its headline results."""

    name: str          # experiment label, e.g. "v1-baseline"
    base_model: str    # pretrained checkpoint the adapter was trained on
    rank: int          # LoRA rank r (width of the low-rank update matrices)
    alpha: int         # LoRA scaling hyperparameter (lora_alpha)
    dataset_size: int  # number of training examples
    epochs: int        # number of training epochs
    train_loss: float  # final training loss
    eval_loss: float   # final evaluation loss
    vram_gb: float     # peak GPU memory used, in GB
    time_hours: float  # wall-clock training time, in hours
# Experiment log — each tuple mirrors the LoRAExperiment field order:
# (name, base_model, rank, alpha, dataset_size, epochs,
#  train_loss, eval_loss, vram_gb, time_hours)
_runs = [
    ("v1-baseline", "Llama-2-7B", 8, 16, 5000, 3, 1.25, 1.32, 14.5, 2.1),
    ("v2-rank16", "Llama-2-7B", 16, 32, 5000, 3, 1.18, 1.24, 15.2, 2.5),
    ("v3-more-data", "Llama-2-7B", 16, 32, 15000, 3, 1.05, 1.12, 15.2, 6.8),
    ("v4-rank32", "Llama-2-7B", 32, 64, 15000, 5, 0.95, 1.08, 16.8, 11.2),
    ("v5-mistral", "Mistral-7B", 16, 32, 15000, 3, 0.98, 1.05, 15.5, 7.0),
]
experiments = [LoRAExperiment(*run) for run in _runs]

# Human-readable summary of every run.
print("=== LoRA Experiments ===")
for exp in experiments:
    print(f"\n [{exp.name}] {exp.base_model}")
    print(f" Rank: {exp.rank} | Alpha: {exp.alpha} | Data: {exp.dataset_size:,}")
    print(
        f" Loss: {exp.train_loss:.2f}/{exp.eval_loss:.2f} | "
        f"VRAM: {exp.vram_gb}GB | Time: {exp.time_hours}h"
    )
Progressive Delivery for ML Models
# === Progressive Delivery Pipeline ===
# Canary Deployment for LLM
# 1. Deploy new model as canary (5% traffic)
# 2. Compare metrics: latency, quality, error rate
# 3. If OK: increase to 25%, 50%, 100%
# 4. If bad: rollback to previous model
# Kubernetes Canary with Istio
# apiVersion: networking.istio.io/v1beta1
# kind: VirtualService
# metadata:
# name: llm-service
# spec:
# hosts:
# - llm-service
# http:
# - route:
# - destination:
# host: llm-service
# subset: stable
# weight: 95
# - destination:
# host: llm-service
# subset: canary
# weight: 5
#
# ---
# apiVersion: networking.istio.io/v1beta1
# kind: DestinationRule
# metadata:
# name: llm-service
# spec:
# host: llm-service
# subsets:
# - name: stable
# labels:
# version: v3-lora
# - name: canary
# labels:
# version: v4-lora
@dataclass
class DeliveryStage:
    """One step of a progressive (canary) rollout."""

    stage: str                # human-readable stage name, e.g. "Canary 5%"
    traffic_pct: int          # share of traffic routed to the new model (0-100)
    duration_min: int         # soak time at this stage before promoting
    metrics_check: List[str]  # gate conditions that must hold during the stage
    status: str               # rollout state, e.g. "Passed" / "In Progress"
# The same quality gates apply at every canary step; only the traffic share
# and the soak time grow as confidence builds.
_CANARY_GATES = ["Latency P99 < 500ms", "Error rate < 0.1%", "Quality score > 0.85"]
canary_stages = [
    DeliveryStage("Canary 5%", 5, 30, list(_CANARY_GATES), "Passed"),
    DeliveryStage("Canary 25%", 25, 60, list(_CANARY_GATES), "Passed"),
    DeliveryStage("Canary 50%", 50, 120, list(_CANARY_GATES), "Passed"),
    DeliveryStage("Full Rollout", 100, 0, ["All metrics stable for 24h"], "In Progress"),
]

# Print the rollout plan with each stage's gates.
print("\n=== Progressive Delivery ===")
for stage in canary_stages:
    print(f" [{stage.status}] {stage.stage} ({stage.traffic_pct}%)")
    print(f" Duration: {stage.duration_min}min")
    for gate in stage.metrics_check:
        print(f" - {gate}")
# A/B Testing for Models
# Side-by-side production metrics for the stable (A) and canary (B) adapters.
ab_results = {
    "Model A (v3-lora)": {
        "latency_p99": "380ms",
        "quality_score": 0.87,
        "user_satisfaction": "4.2/5",
        "cost_per_1k": "$0.12",
    },
    "Model B (v4-lora)": {
        "latency_p99": "420ms",
        "quality_score": 0.91,
        "user_satisfaction": "4.5/5",
        "cost_per_1k": "$0.14",
    },
}
# Fix: plain string — the original used an f-string with no placeholders (F541).
print("\n\nA/B Test Results:")
for model, metrics in ab_results.items():
    print(f"\n [{model}]")
    for k, v in metrics.items():
        print(f" {k}: {v}")
Evaluation และ Monitoring
# === Model Evaluation & Monitoring ===
# Evaluation Script
# from transformers import pipeline
# import json
#
# # Load fine-tuned model
# pipe = pipeline("text-generation",
# model="./merged-model",
# tokenizer="./merged-model",
# max_new_tokens=256,
# temperature=0.7,
# )
#
# # Evaluate on test set
# with open("test_data.jsonl") as f:
# test_data = [json.loads(line) for line in f]
#
# correct = 0
# for item in test_data:
# output = pipe(item["prompt"])[0]["generated_text"]
# if evaluate_quality(output, item["expected"]) > 0.8:
# correct += 1
#
# accuracy = correct / len(test_data)
# print(f"Accuracy: {accuracy:.2%}")
# Stable (v3) vs. canary (v4) evaluation results; "improvement" is the signed
# relative change of v4 over v3 as a preformatted string.
eval_metrics = {
    "Perplexity": {"v3": 8.5, "v4": 7.2, "improvement": "-15.3%"},
    "BLEU Score": {"v3": 0.42, "v4": 0.48, "improvement": "+14.3%"},
    "ROUGE-L": {"v3": 0.55, "v4": 0.61, "improvement": "+10.9%"},
    "Human Eval": {"v3": "4.2/5", "v4": "4.5/5", "improvement": "+7.1%"},
    "Latency P99": {"v3": "380ms", "v4": "420ms", "improvement": "+10.5%"},
    "Cost/1K req": {"v3": "$0.12", "v4": "$0.14", "improvement": "+16.7%"},
}

# Render a fixed-width comparison table.
print("Model Evaluation:")
header = f" {'Metric':<16} {'v3 (stable)':>12} {'v4 (canary)':>12} {'Change':>10}"
print(header)
for metric, vals in eval_metrics.items():
    row = (
        f" {metric:<16} {str(vals['v3']):>12} {str(vals['v4']):>12} "
        f"{vals['improvement']:>10}"
    )
    print(row)
# Monitoring Alerts
# Runbook: alert condition → action to take when the deployment degrades.
alerts = [
    "Latency P99 > 500ms for 5 min → Rollback",
    "Error rate > 1% for 3 min → Rollback",
    "Quality score < 0.80 for 10 min → Pause & Investigate",
    "VRAM usage > 90% → Scale up or Quantize",
    "Throughput drop > 30% → Check batch size / scaling",
]
# Fix: plain string — the original used an f-string with no placeholders (F541).
print("\n\nProduction Alerts:")
for i, alert in enumerate(alerts, 1):
    print(f" {i}. {alert}")
เคล็ดลับ
- QLoRA: ใช้ QLoRA ประหยัด VRAM ครึ่งหนึ่ง คุณภาพใกล้เคียง LoRA
- Rank: เริ่มที่ r=16 ถ้าไม่พอค่อยเพิ่มเป็น 32 หรือ 64
- Data: คุณภาพข้อมูลสำคัญกว่าปริมาณ ตรวจสอบก่อน Train
- Canary: เริ่ม 5% Traffic ดู 30 นาที ก่อนเพิ่ม
- Merge: Merge LoRA adapter เข้า Base Model ก่อน Deploy ลด Latency
LoRA คืออะไร
Low-Rank Adaptation Fine-tuning LLM ประหยัด Memory Low-rank Matrices Attention Layers Trainable Parameters น้อย GPU เดียว Merge ได้
Progressive Delivery คืออะไร
ค่อยๆปล่อย Version ใหม่ Canary Release Traffic เล็กน้อย Metrics Feature Flags A/B Testing ลดความเสี่ยง Model Regression
Fine-tune LLM ต้องใช้ GPU อะไร
LoRA 7B RTX 4090 24GB QLoRA 12GB Model 13B A100 40GB Model 70B A100 80GB หลายใบ Cloud $1-3/hr
Dataset สำหรับ Fine-tuning ต้องมีเท่าไหร่
เริ่ม 1,000-5,000 ตัวอย่าง คุณภาพสูง 10,000-50,000 คุณภาพสำคัญกว่าปริมาณ Instruction Format Data Quality
สรุป
LLM Fine-tuning LoRA QLoRA Low-Rank Adaptation PEFT Progressive Delivery Canary Feature Flags A/B Testing Evaluation Perplexity BLEU ROUGE Monitoring Rollback GPU VRAM Dataset Quality
