LoRA Fine-Tuning คืออะไร
LoRA (Low-Rank Adaptation) เป็นเทคนิค parameter-efficient fine-tuning สำหรับ Large Language Models (LLMs) แทนที่จะ train ทุก parameters ของ model (ซึ่งอาจมีหลายพันล้าน parameters) LoRA เพิ่ม trainable low-rank matrices เข้าไปใน attention layers ทำให้ train เฉพาะ parameters ใหม่ที่เพิ่มเข้ามา (ปกติ 0.1-1% ของ original parameters)
ข้อดีของ LoRA ได้แก่ ใช้ VRAM น้อยกว่า full fine-tuning 60-80%, training เร็วกว่า 2-3 เท่า, LoRA adapters มีขนาดเล็ก (10-100MB แทน model เต็มหลาย GB), สามารถ swap adapters ได้ (ใช้ base model เดียวกับหลาย adapters), ไม่ทำให้ base model เสียหาย (catastrophic forgetting น้อยกว่า)
QLoRA เป็น extension ของ LoRA ที่ quantize base model เป็น 4-bit ก่อน แล้ว train LoRA adapters บน quantized model ลด VRAM ได้อีก 50% ทำให้ fine-tune 7B model ได้บน GPU 16GB และ 70B model ได้บน GPU 48GB
Capacity Planning สำคัญเพราะ VRAM เป็น bottleneck หลัก ต้องรู้ว่า model size, batch size, sequence length และ LoRA rank ต้องการ VRAM เท่าไหร่ เพื่อเลือก hardware ที่เหมาะสมและ optimize training configuration
Capacity Planning สำหรับ LoRA Training
คำนวณ resource requirements
#!/usr/bin/env python3
# lora_capacity_planner.py — LoRA Training Resource Calculator
import json
import math
import logging
from typing import Dict
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
@dataclass
class ModelConfig:
    """Architecture facts needed to size a LoRA training run."""
    name: str               # human-readable display name
    params_billion: float   # total parameter count, in billions
    hidden_size: int        # transformer hidden dimension
    num_layers: int         # number of transformer blocks
    num_heads: int          # attention heads (kept for reference, not used below)
    vocab_size: int         # tokenizer vocabulary size (kept for reference)


# Known base models the planner can size.
MODELS: Dict[str, ModelConfig] = {
    "llama-3-8b": ModelConfig("Llama 3 8B", 8.0, 4096, 32, 32, 128256),
    "mistral-7b": ModelConfig("Mistral 7B", 7.2, 4096, 32, 32, 32000),
    "llama-3-70b": ModelConfig("Llama 3 70B", 70.0, 8192, 80, 64, 128256),
    "phi-3-mini": ModelConfig("Phi-3 Mini 3.8B", 3.8, 3072, 32, 32, 32064),
    "qwen-2-7b": ModelConfig("Qwen 2 7B", 7.0, 3584, 28, 28, 152064),
}


class LoRACapacityPlanner:
    """Back-of-envelope VRAM / time / cost estimates for LoRA fine-tuning."""

    def __init__(self):
        # Storage cost per parameter at each precision / quantization level.
        self.bytes_per_param = {
            "fp32": 4, "fp16": 2, "bf16": 2,
            "int8": 1, "int4": 0.5, "nf4": 0.5,
        }

    def estimate_vram(self, model_name, lora_rank=16, batch_size=4,
                      seq_length=2048, precision="bf16", quantization=None):
        """Estimate training-time VRAM for a model/config and suggest GPUs.

        Returns a dict with a per-component memory breakdown (GB), the total
        VRAM (with 10% overhead), trainable-parameter stats, and up to three
        GPUs that fit — or ``{"error": ...}`` for an unknown model name.
        """
        model = MODELS.get(model_name)
        if not model:
            return {"error": f"Unknown model: {model_name}"}
        params = model.params_billion * 1e9
        # Base model weights (optionally quantized to 8-bit or 4-bit).
        if quantization == "4bit":
            model_bytes = params * self.bytes_per_param["int4"]
        elif quantization == "8bit":
            model_bytes = params * self.bytes_per_param["int8"]
        else:
            model_bytes = params * self.bytes_per_param[precision]
        # LoRA adapters: per layer, two rank-r matrices (hidden x r and
        # r x hidden) for each of the Q, K, V, O attention projections.
        lora_params_per_layer = 4 * 2 * model.hidden_size * lora_rank
        total_lora_params = lora_params_per_layer * model.num_layers
        lora_bytes = total_lora_params * self.bytes_per_param[precision]
        # AdamW keeps two fp32 moment estimates per trainable parameter.
        optimizer_bytes = total_lora_params * 2 * self.bytes_per_param["fp32"]
        # Gradients exist only for the (small) LoRA parameter set.
        gradient_bytes = total_lora_params * self.bytes_per_param[precision]
        # Rough activation estimate; the 0.5 factor assumes partial
        # recomputation (gradient-checkpointing-style savings).
        activation_bytes = (
            batch_size * seq_length * model.hidden_size *
            model.num_layers * self.bytes_per_param[precision] * 0.5
        )
        # FIX: an earlier revision also computed a KV-cache term but never
        # added it to the total or the breakdown; KV cache is an
        # autoregressive-inference cost, not a training cost, so the dead
        # computation was removed rather than included.
        total_bytes = (model_bytes + lora_bytes + optimizer_bytes +
                       gradient_bytes + activation_bytes)
        # Add 10% overhead for allocator fragmentation / CUDA context.
        total_bytes *= 1.1
        total_gb = total_bytes / 1e9
        # Candidate GPUs (name, VRAM in GB), smallest first.
        gpus = [
            ("RTX 3060 12GB", 12), ("RTX 4060 Ti 16GB", 16),
            ("RTX 3090 24GB", 24), ("RTX 4090 24GB", 24),
            ("A100 40GB", 40), ("A100 80GB", 80),
            ("H100 80GB", 80),
        ]
        gpu_recs = [name for name, vram_gb in gpus if vram_gb >= total_gb]
        return {
            "model": model.name,
            "model_params": f"{model.params_billion}B",
            "lora_rank": lora_rank,
            "batch_size": batch_size,
            "seq_length": seq_length,
            "precision": precision,
            "quantization": quantization or "none",
            "memory_breakdown_gb": {
                "base_model": round(model_bytes / 1e9, 1),
                "lora_adapters": round(lora_bytes / 1e9, 3),
                "optimizer_states": round(optimizer_bytes / 1e9, 3),
                "gradients": round(gradient_bytes / 1e9, 3),
                "activations": round(activation_bytes / 1e9, 1),
            },
            "total_vram_gb": round(total_gb, 1),
            "lora_trainable_params": total_lora_params,
            "trainable_pct": round(total_lora_params / params * 100, 2),
            "recommended_gpus": gpu_recs[:3],
        }

    def compare_configurations(self, model_name):
        """Size four preset configs (minimal -> maximal) for one model."""
        configs = [
            {"lora_rank": 8, "batch_size": 2, "seq_length": 1024, "quantization": "4bit"},
            {"lora_rank": 16, "batch_size": 4, "seq_length": 2048, "quantization": "4bit"},
            {"lora_rank": 32, "batch_size": 4, "seq_length": 2048, "quantization": None},
            {"lora_rank": 64, "batch_size": 8, "seq_length": 4096, "quantization": None},
        ]
        results = []
        for cfg in configs:
            est = self.estimate_vram(model_name, **cfg)
            # FIX: propagate the error instead of KeyError-ing on the
            # {"error": ...} dict returned for unknown models.
            if "error" in est:
                results.append({"config": cfg, "error": est["error"]})
                continue
            results.append({
                "config": cfg,
                "vram_gb": est["total_vram_gb"],
                "trainable_params": est["lora_trainable_params"],
                "gpus": est["recommended_gpus"],
            })
        return results

    def estimate_training_time(self, dataset_size, batch_size, seq_length,
                               epochs=3, tokens_per_second=1000):
        """Estimate wall-clock time and A100 rental cost for a training run.

        ``tokens_per_second`` is the assumed training throughput; the cost
        estimate uses a flat $3.50/hour A100 rate.
        """
        total_tokens = dataset_size * seq_length * epochs
        steps = math.ceil(dataset_size * epochs / batch_size)
        time_seconds = total_tokens / tokens_per_second
        return {
            "dataset_examples": dataset_size,
            "total_tokens": total_tokens,
            "total_steps": steps,
            "estimated_hours": round(time_seconds / 3600, 1),
            "estimated_cost_a100_usd": round(time_seconds / 3600 * 3.5, 2),
        }
# Demo: size two common setups and one training-time estimate.
planner = LoRACapacityPlanner()

# QLoRA 7B model — 4-bit quantized base, fits on consumer GPUs.
print("=== QLoRA Mistral 7B ===")
print(json.dumps(planner.estimate_vram("mistral-7b", lora_rank=16, batch_size=4,
                                       seq_length=2048, quantization="4bit"), indent=2))

# Full precision 8B model — unquantized bf16 base, needs more VRAM.
print("\n=== LoRA Llama 3 8B (bf16) ===")
print(json.dumps(planner.estimate_vram("llama-3-8b", lora_rank=32, batch_size=2,
                                       seq_length=2048, precision="bf16"), indent=2))

# Training time estimate — 10k examples, batch 4, 2048-token sequences.
print("\n=== Training Time ===")
print(json.dumps(planner.estimate_training_time(10000, 4, 2048), indent=2))
ติดตั้งและเตรียม Training Environment
เตรียมสภาพแวดล้อมสำหรับ LoRA training
# === Install Training Environment ===

# 1. System Requirements Check
# ===================================
# Check NVIDIA GPU
nvidia-smi
# You should see a GPU with enough VRAM for your target model.

# Check CUDA version
nvcc --version
# CUDA 11.8+ or 12.x is required.

# 2. Create Python Environment
# ===================================
python -m venv lora-env
source lora-env/bin/activate   # Linux/macOS
# lora-env\Scripts\activate    # Windows

# 3. Install Dependencies
# ===================================
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# FIX: version specifiers must be quoted — an unquoted ">=" is parsed by the
# shell as output redirection (it would run `pip install transformers` and
# write a file literally named "=4.40.0").
pip install "transformers>=4.40.0"
pip install "datasets>=2.18.0"
pip install "accelerate>=0.28.0"
pip install "peft>=0.10.0"          # Parameter-Efficient Fine-Tuning
pip install "trl>=0.8.0"            # Transformer Reinforcement Learning
pip install "bitsandbytes>=0.43.0"  # Quantization
pip install wandb                   # Experiment tracking
pip install flash-attn --no-build-isolation  # Flash Attention (optional)

# 4. Verify Installation
# ===================================
python -c "
import torch
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'GPU: {torch.cuda.get_device_name(0)}')
# FIX: the attribute is total_memory — 'total_mem' raises AttributeError
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
import transformers, peft, trl, bitsandbytes
print(f'Transformers: {transformers.__version__}')
print(f'PEFT: {peft.__version__}')
print(f'TRL: {trl.__version__}')
"

# 5. Download Base Model
# ===================================
# Using Hugging Face CLI
pip install huggingface-hub
huggingface-cli login   # a token is required for gated models

# Download Mistral 7B
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto')
print(f'Model loaded: {sum(p.numel() for p in model.parameters()):,} parameters')
"

# 6. Prepare Training Data
# ===================================
# Data format for instruction fine-tuning:
# {"instruction": "...", "input": "...", "output": "..."}
# or chat format:
# {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

# Create sample dataset
python -c "
import json
data = [
    {'instruction': 'What is LoRA?', 'output': 'LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning technique...'},
    {'instruction': 'Explain QLoRA', 'output': 'QLoRA combines 4-bit quantization with LoRA adapters...'},
]
with open('train_data.json', 'w') as f:
    json.dump(data, f, indent=2)
print(f'Created {len(data)} training examples')
"
echo "Training environment ready"
Fine-Tune LLM ด้วย LoRA
Training script สำหรับ LoRA fine-tuning
#!/usr/bin/env python3
# train_lora.py — LoRA Fine-Tuning Script
import torch
from transformers import (
AutoModelForCausalLM, AutoTokenizer,
TrainingArguments, BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import json
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("train")
def setup_model(model_id, use_qlora=True):
    """Load a causal LM and its tokenizer, then attach LoRA adapters.

    Args:
        model_id: Hugging Face model id or local path of the base model.
        use_qlora: when True, load the base model NF4-quantized (QLoRA);
            otherwise load it in bf16.

    Returns:
        (model, tokenizer) — the PEFT-wrapped model and its tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the pad token — many decoder-only models ship without one.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    if use_qlora:
        # QLoRA recipe: NF4 4-bit weights, bf16 compute, double quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        # Must run BEFORE get_peft_model so the quantized model is made
        # trainable. NOTE(review): per PEFT docs this upcasts norm layers and
        # enables input gradients — confirm against the pinned peft version.
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    # LoRA configuration
    lora_config = LoraConfig(
        r=16,              # rank of the low-rank update matrices
        lora_alpha=32,     # scaling factor (effective scale = alpha / r)
        target_modules=[   # apply LoRA to attention AND MLP projections
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # Log the trainable fraction — typically well under 1% with LoRA.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(f"Trainable: {trainable:,} / {total:,} ({trainable/total*100:.2f}%)")
    return model, tokenizer
def format_instruction(example):
    """Render one dataset record into a single Mistral-style prompt string.

    Supports two record shapes:
      * chat: {"messages": [{"role": ..., "content": ...}, ...]}
      * instruction: {"instruction": ..., "input": optional, "output": ...}

    Returns a {"text": prompt} dict suitable for ``dataset.map``.
    """
    if "messages" in example:
        # Chat format: wrap each user turn in [INST] tags, append
        # assistant turns verbatim.
        pieces = []
        for turn in example["messages"]:
            role = turn["role"]
            if role == "user":
                pieces.append(f"[INST] {turn['content']} [/INST] ")
            elif role == "assistant":
                pieces.append(turn["content"])
        return {"text": "".join(pieces)}
    # Instruction format: optional "input" context goes on its own line
    # inside the [INST] block; empty/missing input is skipped.
    segments = [f"[INST] {example['instruction']}"]
    if example.get("input"):
        segments.append(f"\n{example['input']}")
    segments.append(f" [/INST] {example['output']}")
    return {"text": "".join(segments)}
def train(model_id="mistralai/Mistral-7B-Instruct-v0.3",
          dataset_path="train_data.json",
          output_dir="./lora-output",
          epochs=3, batch_size=4, learning_rate=2e-4):
    """Run QLoRA supervised fine-tuning and save the LoRA adapter.

    Args:
        model_id: base model to fine-tune.
        dataset_path: JSON file of instruction/chat records (see
            ``format_instruction`` for the accepted shapes).
        output_dir: where the adapter weights and tokenizer are written.
        epochs, batch_size, learning_rate: standard training knobs.

    Side effects: writes checkpoints + final adapter under ``output_dir``
    and reports metrics to wandb (see ``report_to``).
    """
    model, tokenizer = setup_model(model_id, use_qlora=True)
    # Load and format dataset: each record becomes {"text": prompt}.
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    dataset = dataset.map(format_instruction)
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=4,   # effective batch = batch_size * 4
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        fp16=False,
        bf16=True,                       # matches bf16 compute dtype in setup_model
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=3,
        optim="paged_adamw_8bit",        # paged 8-bit AdamW cuts optimizer VRAM
        max_grad_norm=0.3,
        report_to="wandb",  # or "none"
    )
    # Trainer
    # NOTE(review): passing tokenizer/dataset_text_field/max_seq_length
    # directly matches trl < 0.9 SFTTrainer; newer trl moved these into
    # SFTConfig — confirm against the pinned trl version.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        dataset_text_field="text",
        max_seq_length=2048,
        packing=True,   # pack multiple short examples per sequence
    )
    # Train
    logger.info("Starting training...")
    trainer.train()
    # Save LoRA adapter (adapter weights only — not a merged full model).
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"LoRA adapter saved to {output_dir}")
# if __name__ == "__main__":
# train()
Monitoring และ Resource Optimization
Monitor training และ optimize resources
#!/usr/bin/env python3
# training_monitor.py — LoRA Training Monitor
import json
import time
import logging
from datetime import datetime
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
class TrainingMonitor:
    """Collects per-step training metrics and flags common failure modes."""

    def __init__(self):
        # Chronological list of per-step metric dicts (see log_step).
        self.metrics_history = []
        # Wall-clock start, set by start(); None until then.
        self.start_time = None

    def start(self):
        """Mark the beginning of the run for elapsed-time accounting."""
        self.start_time = time.time()
        logger.info("Training monitor started")

    def log_step(self, step, loss, learning_rate, gpu_memory_gb=None, tokens_per_sec=None):
        """Record one training step; emits an INFO line every 50 steps."""
        elapsed = time.time() - self.start_time if self.start_time else 0
        metric = {
            "step": step,
            "loss": round(loss, 4),
            "learning_rate": learning_rate,
            "elapsed_seconds": round(elapsed, 1),
            "gpu_memory_gb": gpu_memory_gb,
            "tokens_per_sec": tokens_per_sec,
            # NOTE: datetime.utcnow() is deprecated since Python 3.12;
            # kept for output compatibility (naive UTC, no offset suffix).
            "timestamp": datetime.utcnow().isoformat(),
        }
        self.metrics_history.append(metric)
        if step % 50 == 0:
            logger.info(
                f"Step {step}: loss={loss:.4f} lr={learning_rate:.2e} "
                f"gpu={gpu_memory_gb}GB tokens/s={tokens_per_sec}"
            )

    def get_summary(self):
        """Summarize the run: loss trajectory, duration, throughput.

        Returns {} when no steps have been logged.
        """
        if not self.metrics_history:
            return {}
        losses = [m["loss"] for m in self.metrics_history]
        initial = losses[0]
        # FIX: guard against ZeroDivisionError when the first logged loss is 0.
        reduction_pct = round((1 - min(losses) / initial) * 100, 1) if initial else 0.0
        return {
            "total_steps": len(self.metrics_history),
            "initial_loss": initial,
            "final_loss": losses[-1],
            "best_loss": min(losses),
            "loss_reduction_pct": reduction_pct,
            "total_time_hours": round(
                self.metrics_history[-1]["elapsed_seconds"] / 3600, 2
            ),
            "avg_tokens_per_sec": round(
                sum(m.get("tokens_per_sec", 0) for m in self.metrics_history
                    if m.get("tokens_per_sec"))
                / max(sum(1 for m in self.metrics_history if m.get("tokens_per_sec")), 1),
                0
            ),
        }

    def detect_issues(self, gpu_vram_gb=24.0):
        """Heuristic checks over the last 10 steps; returns warning strings.

        Args:
            gpu_vram_gb: total VRAM of the training GPU, used for the
                near-limit check (previously hard-coded to 24 GB; the default
                preserves the old behavior).
        """
        issues = []
        if len(self.metrics_history) < 10:
            return issues
        recent_losses = [m["loss"] for m in self.metrics_history[-10:]]
        # Loss monotonically non-decreasing over the window.
        if all(recent_losses[i] >= recent_losses[i - 1] for i in range(1, len(recent_losses))):
            issues.append("Loss not decreasing — consider reducing learning rate")
        # Sudden >50% jump between the last two steps.
        if len(recent_losses) >= 2 and recent_losses[-1] > recent_losses[-2] * 1.5:
            issues.append("Loss spike detected — possible gradient explosion")
        # NaN never equals itself — cheap stdlib-free NaN detection.
        if any(m["loss"] != m["loss"] for m in self.metrics_history):
            issues.append("NaN loss detected — reduce learning rate or check data")
        # Peak GPU memory within 5% of capacity.
        gpu_mems = [m.get("gpu_memory_gb") for m in self.metrics_history if m.get("gpu_memory_gb")]
        if gpu_mems and max(gpu_mems) > 0.95 * gpu_vram_gb:
            issues.append("GPU memory near limit — reduce batch size or seq length")
        return issues
class LoRAOptimizer:
    """Heuristic LoRA hyperparameter recommendation for a given VRAM budget."""

    @staticmethod
    def optimize_config(available_vram_gb, model_params_b, target_quality="balanced"):
        """Pick a LoRA config preset and downgrade it if VRAM is too tight.

        Args:
            available_vram_gb: VRAM budget of the target GPU, in GB.
            model_params_b: base model size in billions of parameters.
            target_quality: one of "minimal"/"balanced"/"quality"/"maximum";
                unknown values fall back to "balanced".

        Returns a dict with the (possibly downgraded) config, the VRAM
        estimate for that config, and whether it fits the budget.
        """
        configs = {
            "minimal": {"rank": 8, "alpha": 16, "batch": 1, "seq": 1024, "quant": "4bit"},
            "balanced": {"rank": 16, "alpha": 32, "batch": 4, "seq": 2048, "quant": "4bit"},
            "quality": {"rank": 32, "alpha": 64, "batch": 4, "seq": 4096, "quant": "4bit"},
            "maximum": {"rank": 64, "alpha": 128, "batch": 8, "seq": 4096, "quant": None},
        }
        config = configs.get(target_quality, configs["balanced"])

        def _estimate(cfg):
            # 4-bit weights ~0.5 bytes/param, bf16 ~2 bytes/param.
            model_gb = model_params_b * (0.5 if cfg["quant"] == "4bit" else 2)
            overhead_gb = cfg["batch"] * cfg["seq"] * 0.001  # rough activation estimate
            return model_gb + overhead_gb + 2  # +2 GB safety margin

        total_gb = _estimate(config)
        if total_gb > available_vram_gb:
            # Downgrade batch/seq, then re-estimate.
            # FIX: the estimate and "fits" flag were previously computed from
            # the ORIGINAL config, so the downgraded recommendation reported a
            # stale (too large) VRAM figure.
            config["batch"] = max(1, config["batch"] // 2)
            config["seq"] = max(512, config["seq"] // 2)
            total_gb = _estimate(config)
        return {
            "recommended_config": config,
            "estimated_vram_gb": round(total_gb, 1),
            "available_vram_gb": available_vram_gb,
            "fits": total_gb <= available_vram_gb,
        }
# Demo: step-by-step monitor usage (commented out — needs a real training loop).
# monitor = TrainingMonitor()
# monitor.start()
# for step in range(100):
# monitor.log_step(step, loss=2.5 - step*0.02, learning_rate=2e-4, gpu_memory_gb=15.3)
# print(json.dumps(monitor.get_summary(), indent=2))

# Recommend a balanced config for a 7B model on a 24 GB GPU.
optimizer = LoRAOptimizer()
print(json.dumps(optimizer.optimize_config(24, 7, "balanced"), indent=2))
Production Deployment และ Serving
Deploy LoRA model สำหรับ production
# === LoRA Model Deployment ===
# Reference walkthrough: merge adapter -> (optional) GGUF convert -> serve.
# Most commands are commented out; uncomment the path you need.

# 1. Merge LoRA Adapter with Base Model
# ===================================
# Merging bakes the adapter into the base weights so any server that loads a
# plain HF model can serve it (no PEFT required at inference time).
#!/usr/bin/env python3
# merge_and_export.py
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
# import torch
#
# base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# lora_path = "./lora-output"
# merged_path = "./merged-model"
#
# # Load base model
# model = AutoModelForCausalLM.from_pretrained(
# base_model_id, torch_dtype=torch.bfloat16, device_map="cpu"
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model_id)
#
# # Load and merge LoRA
# model = PeftModel.from_pretrained(model, lora_path)
# model = model.merge_and_unload()
#
# # Save merged model
# model.save_pretrained(merged_path)
# tokenizer.save_pretrained(merged_path)
# print(f"Merged model saved to {merged_path}")

# 2. Convert to GGUF for llama.cpp
# ===================================
# git clone https://github.com/ggerganov/llama.cpp
# cd llama.cpp
# pip install -r requirements.txt
#
# python convert_hf_to_gguf.py ../merged-model --outfile model.gguf --outtype f16
#
# # Quantize for smaller size
# ./llama-quantize model.gguf model-q4_k_m.gguf Q4_K_M

# 3. Serve with vLLM
# ===================================
pip install vllm

# Serve merged model
# python -m vllm.entrypoints.openai.api_server \
# --model ./merged-model \
# --dtype bfloat16 \
# --max-model-len 4096 \
# --port 8000

# Or serve base model + LoRA adapter (dynamic loading)
# python -m vllm.entrypoints.openai.api_server \
# --model mistralai/Mistral-7B-Instruct-v0.3 \
# --enable-lora \
# --lora-modules my-lora=./lora-output \
# --max-lora-rank 32

# Test API
# NOTE: requires one of the (commented) vLLM servers above to be running on
# port 8000; "model": "my-lora" matches the --lora-modules name.
curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "my-lora",
"messages": [{"role": "user", "content": "Hello"}],
"max_tokens": 100
}' | python -m json.tool

# 4. Docker Deployment
# ===================================
# Dockerfile
# FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
# RUN pip install vllm transformers peft
# COPY merged-model /app/model
# EXPOSE 8000
# CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
# "--model", "/app/model", "--port", "8000"]

# docker build -t lora-server .
# docker run --gpus all -p 8000:8000 lora-server

# 5. Kubernetes Deployment
# ===================================
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: lora-server
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: lora-server
# template:
# spec:
# containers:
# - name: vllm
# image: lora-server:latest
# ports:
# - containerPort: 8000
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: 32Gi
# volumeMounts:
# - name: model-storage
# mountPath: /app/model
# volumes:
# - name: model-storage
# persistentVolumeClaim:
# claimName: model-pvc
echo "Deployment configured"
FAQ คำถามที่พบบ่อย
Q: LoRA rank ตั้งเท่าไหร่ดี?
A: rank 8-16 เพียงพอสำหรับ simple tasks เช่น classification, sentiment ใช้ VRAM น้อย training เร็ว rank 32-64 สำหรับ complex tasks เช่น instruction following, code generation ใช้ VRAM มากขึ้น แต่ quality ดีกว่า rank 128+ สำหรับ domain-specific knowledge injection ต้อง GPU ใหญ่ เริ่มจาก rank 16 แล้วเพิ่มถ้า quality ไม่พอ อย่าลืมปรับ alpha ตาม (alpha = 2x rank เป็น starting point ที่ดี)
Q: ต้องการ training data เท่าไหร่?
A: ขั้นต่ำ 100-500 examples สำหรับ simple task adaptation, 1,000-5,000 examples สำหรับ instruction following ที่ดี, 10,000+ examples สำหรับ domain-specific fine-tuning ที่ quality สูง คุณภาพสำคัญกว่าปริมาณ 1,000 examples ที่ clean และ high-quality ดีกว่า 10,000 examples ที่มี noise ใช้ format ที่ consistent และ diverse examples
Q: QLoRA กับ LoRA ต่างกันอย่างไร?
A: LoRA train LoRA adapters บน base model ที่เป็น fp16/bf16 ใช้ VRAM ตาม model size (7B model ~14GB) QLoRA quantize base model เป็น 4-bit ก่อน แล้ว train LoRA adapters ใช้ VRAM น้อยกว่ามาก (7B model ~6GB) Quality ของ QLoRA ใกล้เคียง LoRA (ต่างกัน 1-2% ใน benchmarks) สำหรับ GPU 16GB ขึ้นไป ใช้ QLoRA กับ 7B models สำหรับ GPU 24GB+ ใช้ LoRA ตรงๆ ได้
Q: Training ใช้เวลาเท่าไหร่?
A: ขึ้นกับหลายปัจจัย dataset size, sequence length, batch size, GPU speed, number of epochs สำหรับ 7B model กับ QLoRA rank 16, batch 4, 1,000 examples, 3 epochs บน RTX 4090 ใช้เวลาประมาณ 15-30 นาที บน A100 80GB เร็วกว่า 2-3 เท่า สำหรับ 70B model ใช้เวลา 5-10 เท่าของ 7B Cloud GPU (A100/H100) คิดค่าใช้จ่ายประมาณ $2-5/ชั่วโมง สำหรับ training ง่ายๆ ไม่กี่ดอลลาร์
