LoRA Fine-Tuning คืออะไร
LoRA (Low-Rank Adaptation) เป็นเทคนิค parameter-efficient fine-tuning สำหรับ Large Language Models (LLMs) แทนที่จะ train ทุก parameters ของ model (ซึ่งอาจมีหลายพันล้าน parameters) LoRA เพิ่ม trainable low-rank matrices เข้าไปใน attention layers ทำให้ train เฉพาะ parameters ใหม่ที่เพิ่มเข้ามา (ปกติ 0.1-1% ของ original parameters)
ข้อดีของ LoRA ได้แก่ ใช้ VRAM น้อยกว่า full fine-tuning 60-80%, training เร็วกว่า 2-3 เท่า, LoRA adapters มีขนาดเล็ก (10-100MB แทน model เต็มหลาย GB), สามารถ swap adapters ได้ (ใช้ base model เดียวกับหลาย adapters), ไม่ทำให้ base model เสียหาย (catastrophic forgetting น้อยกว่า)
QLoRA เป็น extension ของ LoRA ที่ quantize base model เป็น 4-bit ก่อน แล้ว train LoRA adapters บน quantized model ลด VRAM ได้อีก 50% ทำให้ fine-tune 7B model ได้บน GPU 16GB และ 70B model ได้บน GPU 48GB
Capacity Planning สำคัญเพราะ VRAM เป็น bottleneck หลัก ต้องรู้ว่า model size, batch size, sequence length และ LoRA rank ต้องการ VRAM เท่าไหร่ เพื่อเลือก hardware ที่เหมาะสมและ optimize training configuration
Capacity Planning สำหรับ LoRA Training
คำนวณ resource requirements
#!/usr/bin/env python3
# lora_capacity_planner.py — LoRA Training Resource Calculator
import json
import math
import logging
from typing import Dict
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
@dataclass
class ModelConfig:
    """Architecture facts needed to size a LoRA training run."""
    name: str               # human-readable display name
    params_billion: float   # total parameter count, in billions
    hidden_size: int        # transformer hidden dimension
    num_layers: int         # number of transformer blocks
    num_heads: int          # attention heads (kept for reference, not used below)
    vocab_size: int         # tokenizer vocabulary size (kept for reference)


# Known base models the planner can size.
MODELS: Dict[str, ModelConfig] = {
    "llama-3-8b": ModelConfig("Llama 3 8B", 8.0, 4096, 32, 32, 128256),
    "mistral-7b": ModelConfig("Mistral 7B", 7.2, 4096, 32, 32, 32000),
    "llama-3-70b": ModelConfig("Llama 3 70B", 70.0, 8192, 80, 64, 128256),
    "phi-3-mini": ModelConfig("Phi-3 Mini 3.8B", 3.8, 3072, 32, 32, 32064),
    "qwen-2-7b": ModelConfig("Qwen 2 7B", 7.0, 3584, 28, 28, 152064),
}


class LoRACapacityPlanner:
    """Back-of-envelope VRAM / time / cost estimates for LoRA fine-tuning."""

    def __init__(self):
        # Storage cost per parameter at each precision / quantization level.
        self.bytes_per_param = {
            "fp32": 4, "fp16": 2, "bf16": 2,
            "int8": 1, "int4": 0.5, "nf4": 0.5,
        }

    def estimate_vram(self, model_name, lora_rank=16, batch_size=4,
                      seq_length=2048, precision="bf16", quantization=None):
        """Estimate training-time VRAM for a model/config and suggest GPUs.

        Returns a dict with a per-component memory breakdown (GB), the total
        VRAM (with 10% overhead), trainable-parameter stats, and up to three
        GPUs that fit — or ``{"error": ...}`` for an unknown model name.
        """
        model = MODELS.get(model_name)
        if not model:
            return {"error": f"Unknown model: {model_name}"}
        params = model.params_billion * 1e9
        # Base model weights (optionally quantized to 8-bit or 4-bit).
        if quantization == "4bit":
            model_bytes = params * self.bytes_per_param["int4"]
        elif quantization == "8bit":
            model_bytes = params * self.bytes_per_param["int8"]
        else:
            model_bytes = params * self.bytes_per_param[precision]
        # LoRA adapters: per layer, two rank-r matrices (hidden x r and
        # r x hidden) for each of the Q, K, V, O attention projections.
        lora_params_per_layer = 4 * 2 * model.hidden_size * lora_rank
        total_lora_params = lora_params_per_layer * model.num_layers
        lora_bytes = total_lora_params * self.bytes_per_param[precision]
        # AdamW keeps two fp32 moment estimates per trainable parameter.
        optimizer_bytes = total_lora_params * 2 * self.bytes_per_param["fp32"]
        # Gradients exist only for the (small) LoRA parameter set.
        gradient_bytes = total_lora_params * self.bytes_per_param[precision]
        # Rough activation estimate; the 0.5 factor assumes partial
        # recomputation (gradient-checkpointing-style savings).
        activation_bytes = (
            batch_size * seq_length * model.hidden_size *
            model.num_layers * self.bytes_per_param[precision] * 0.5
        )
        # FIX: an earlier revision also computed a KV-cache term but never
        # added it to the total or the breakdown; KV cache is an
        # autoregressive-inference cost, not a training cost, so the dead
        # computation was removed rather than included.
        total_bytes = (model_bytes + lora_bytes + optimizer_bytes +
                       gradient_bytes + activation_bytes)
        # Add 10% overhead for allocator fragmentation / CUDA context.
        total_bytes *= 1.1
        total_gb = total_bytes / 1e9
        # Candidate GPUs (name, VRAM in GB), smallest first.
        gpus = [
            ("RTX 3060 12GB", 12), ("RTX 4060 Ti 16GB", 16),
            ("RTX 3090 24GB", 24), ("RTX 4090 24GB", 24),
            ("A100 40GB", 40), ("A100 80GB", 80),
            ("H100 80GB", 80),
        ]
        gpu_recs = [name for name, vram_gb in gpus if vram_gb >= total_gb]
        return {
            "model": model.name,
            "model_params": f"{model.params_billion}B",
            "lora_rank": lora_rank,
            "batch_size": batch_size,
            "seq_length": seq_length,
            "precision": precision,
            "quantization": quantization or "none",
            "memory_breakdown_gb": {
                "base_model": round(model_bytes / 1e9, 1),
                "lora_adapters": round(lora_bytes / 1e9, 3),
                "optimizer_states": round(optimizer_bytes / 1e9, 3),
                "gradients": round(gradient_bytes / 1e9, 3),
                "activations": round(activation_bytes / 1e9, 1),
            },
            "total_vram_gb": round(total_gb, 1),
            "lora_trainable_params": total_lora_params,
            "trainable_pct": round(total_lora_params / params * 100, 2),
            "recommended_gpus": gpu_recs[:3],
        }

    def compare_configurations(self, model_name):
        """Size four preset configs (minimal -> maximal) for one model."""
        configs = [
            {"lora_rank": 8, "batch_size": 2, "seq_length": 1024, "quantization": "4bit"},
            {"lora_rank": 16, "batch_size": 4, "seq_length": 2048, "quantization": "4bit"},
            {"lora_rank": 32, "batch_size": 4, "seq_length": 2048, "quantization": None},
            {"lora_rank": 64, "batch_size": 8, "seq_length": 4096, "quantization": None},
        ]
        results = []
        for cfg in configs:
            est = self.estimate_vram(model_name, **cfg)
            # FIX: propagate the error instead of KeyError-ing on the
            # {"error": ...} dict returned for unknown models.
            if "error" in est:
                results.append({"config": cfg, "error": est["error"]})
                continue
            results.append({
                "config": cfg,
                "vram_gb": est["total_vram_gb"],
                "trainable_params": est["lora_trainable_params"],
                "gpus": est["recommended_gpus"],
            })
        return results

    def estimate_training_time(self, dataset_size, batch_size, seq_length,
                               epochs=3, tokens_per_second=1000):
        """Estimate wall-clock time and A100 rental cost for a training run.

        ``tokens_per_second`` is the assumed training throughput; the cost
        estimate uses a flat $3.50/hour A100 rate.
        """
        total_tokens = dataset_size * seq_length * epochs
        steps = math.ceil(dataset_size * epochs / batch_size)
        time_seconds = total_tokens / tokens_per_second
        return {
            "dataset_examples": dataset_size,
            "total_tokens": total_tokens,
            "total_steps": steps,
            "estimated_hours": round(time_seconds / 3600, 1),
            "estimated_cost_a100_usd": round(time_seconds / 3600 * 3.5, 2),
        }
# Demo: size two common setups and one training-time estimate.
planner = LoRACapacityPlanner()

# QLoRA 7B model — 4-bit quantized base, fits on consumer GPUs.
print("=== QLoRA Mistral 7B ===")
print(json.dumps(planner.estimate_vram("mistral-7b", lora_rank=16, batch_size=4,
                                       seq_length=2048, quantization="4bit"), indent=2))

# Full precision 8B model — unquantized bf16 base, needs more VRAM.
print("\n=== LoRA Llama 3 8B (bf16) ===")
print(json.dumps(planner.estimate_vram("llama-3-8b", lora_rank=32, batch_size=2,
                                       seq_length=2048, precision="bf16"), indent=2))

# Training time estimate — 10k examples, batch 4, 2048-token sequences.
print("\n=== Training Time ===")
print(json.dumps(planner.estimate_training_time(10000, 4, 2048), indent=2))
ติดตั้งและเตรียม Training Environment
เตรียมสภาพแวดล้อมสำหรับ LoRA training
# === Install Training Environment ===

# 1. System Requirements Check
# ===================================
# Check NVIDIA GPU
nvidia-smi
# You should see a GPU with enough VRAM for your target model.

# Check CUDA version
nvcc --version
# CUDA 11.8+ or 12.x is required.

# 2. Create Python Environment
# ===================================
python -m venv lora-env
source lora-env/bin/activate   # Linux/macOS
# lora-env\Scripts\activate    # Windows

# 3. Install Dependencies
# ===================================
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# FIX: version specifiers must be quoted — an unquoted ">=" is parsed by the
# shell as output redirection (it would run `pip install transformers` and
# write a file literally named "=4.40.0").
pip install "transformers>=4.40.0"
pip install "datasets>=2.18.0"
pip install "accelerate>=0.28.0"
pip install "peft>=0.10.0"          # Parameter-Efficient Fine-Tuning
pip install "trl>=0.8.0"            # Transformer Reinforcement Learning
pip install "bitsandbytes>=0.43.0"  # Quantization
pip install wandb                   # Experiment tracking
pip install flash-attn --no-build-isolation  # Flash Attention (optional)

# 4. Verify Installation
# ===================================
python -c "
import torch
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'GPU: {torch.cuda.get_device_name(0)}')
# FIX: the attribute is total_memory — 'total_mem' raises AttributeError
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
import transformers, peft, trl, bitsandbytes
print(f'Transformers: {transformers.__version__}')
print(f'PEFT: {peft.__version__}')
print(f'TRL: {trl.__version__}')
"

# 5. Download Base Model
# ===================================
# Using Hugging Face CLI
pip install huggingface-hub
huggingface-cli login   # a token is required for gated models

# Download Mistral 7B
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto')
print(f'Model loaded: {sum(p.numel() for p in model.parameters()):,} parameters')
"

# 6. Prepare Training Data
# ===================================
# Data format for instruction fine-tuning:
# {"instruction": "...", "input": "...", "output": "..."}
# or chat format:
# {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

# Create sample dataset
python -c "
import json
data = [
    {'instruction': 'What is LoRA?', 'output': 'LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning technique...'},
    {'instruction': 'Explain QLoRA', 'output': 'QLoRA combines 4-bit quantization with LoRA adapters...'},
]
with open('train_data.json', 'w') as f:
    json.dump(data, f, indent=2)
print(f'Created {len(data)} training examples')
"
echo "Training environment ready"
Fine-Tune LLM ด้วย LoRA
Training script สำหรับ LoRA fine-tuning
#!/usr/bin/env python3
# train_lora.py — LoRA Fine-Tuning Script
import torch
from transformers import (
AutoModelForCausalLM, AutoTokenizer,
TrainingArguments, BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import json
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("train")
def setup_model(model_id, use_qlora=True):
    """Load a causal LM and its tokenizer, then attach LoRA adapters.

    Args:
        model_id: Hugging Face model id or local path of the base model.
        use_qlora: when True, load the base model NF4-quantized (QLoRA);
            otherwise load it in bf16.

    Returns:
        (model, tokenizer) — the PEFT-wrapped model and its tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the pad token — many decoder-only models ship without one.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    if use_qlora:
        # QLoRA recipe: NF4 4-bit weights, bf16 compute, double quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        # Must run BEFORE get_peft_model so the quantized model is made
        # trainable. NOTE(review): per PEFT docs this upcasts norm layers and
        # enables input gradients — confirm against the pinned peft version.
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    # LoRA configuration
    lora_config = LoraConfig(
        r=16,              # rank of the low-rank update matrices
        lora_alpha=32,     # scaling factor (effective scale = alpha / r)
        target_modules=[   # apply LoRA to attention AND MLP projections
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # Log the trainable fraction — typically well under 1% with LoRA.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(f"Trainable: {trainable:,} / {total:,} ({trainable/total*100:.2f}%)")
    return model, tokenizer
def format_instruction(example):
    """Render one dataset record into a single Mistral-style prompt string.

    Supports two record shapes:
      * chat: {"messages": [{"role": ..., "content": ...}, ...]}
      * instruction: {"instruction": ..., "input": optional, "output": ...}

    Returns a {"text": prompt} dict suitable for ``dataset.map``.
    """
    if "messages" in example:
        # Chat format: wrap each user turn in [INST] tags, append
        # assistant turns verbatim.
        pieces = []
        for turn in example["messages"]:
            role = turn["role"]
            if role == "user":
                pieces.append(f"[INST] {turn['content']} [/INST] ")
            elif role == "assistant":
                pieces.append(turn["content"])
        return {"text": "".join(pieces)}
    # Instruction format: optional "input" context goes on its own line
    # inside the [INST] block; empty/missing input is skipped.
    segments = [f"[INST] {example['instruction']}"]
    if example.get("input"):
        segments.append(f"\n{example['input']}")
    segments.append(f" [/INST] {example['output']}")
    return {"text": "".join(segments)}
def train(model_id="mistralai/Mistral-7B-Instruct-v0.3",
          dataset_path="train_data.json",
          output_dir="./lora-output",
          epochs=3, batch_size=4, learning_rate=2e-4):
    """Run QLoRA supervised fine-tuning and save the LoRA adapter.

    Args:
        model_id: base model to fine-tune.
        dataset_path: JSON file of instruction/chat records (see
            ``format_instruction`` for the accepted shapes).
        output_dir: where the adapter weights and tokenizer are written.
        epochs, batch_size, learning_rate: standard training knobs.

    Side effects: writes checkpoints + final adapter under ``output_dir``
    and reports metrics to wandb (see ``report_to``).
    """
    model, tokenizer = setup_model(model_id, use_qlora=True)
    # Load and format dataset: each record becomes {"text": prompt}.
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    dataset = dataset.map(format_instruction)
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=4,   # effective batch = batch_size * 4
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        fp16=False,
        bf16=True,                       # matches bf16 compute dtype in setup_model
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=3,
        optim="paged_adamw_8bit",        # paged 8-bit AdamW cuts optimizer VRAM
        max_grad_norm=0.3,
        report_to="wandb",  # or "none"
    )
    # Trainer
    # NOTE(review): passing tokenizer/dataset_text_field/max_seq_length
    # directly matches trl < 0.9 SFTTrainer; newer trl moved these into
    # SFTConfig — confirm against the pinned trl version.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        dataset_text_field="text",
        max_seq_length=2048,
        packing=True,   # pack multiple short examples per sequence
    )
    # Train
    logger.info("Starting training...")
    trainer.train()
    # Save LoRA adapter (adapter weights only — not a merged full model).
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"LoRA adapter saved to {output_dir}")
# if __name__ == "__main__":
# train()
Monitoring และ Resource Optimization
Monitor training และ optimize resources
#!/usr/bin/env python3
# training_monitor.py — LoRA Training Monitor
import json
import time
import logging
from datetime import datetime
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
class TrainingMonitor:
    """Collects per-step training metrics and flags common failure modes."""

    def __init__(self):
        # Chronological list of per-step metric dicts (see log_step).
        self.metrics_history = []
        # Wall-clock start, set by start(); None until then.
        self.start_time = None

    def start(self):
        """Mark the beginning of the run for elapsed-time accounting."""
        self.start_time = time.time()
        logger.info("Training monitor started")

    def log_step(self, step, loss, learning_rate, gpu_memory_gb=None, tokens_per_sec=None):
        """Record one training step; emits an INFO line every 50 steps."""
        elapsed = time.time() - self.start_time if self.start_time else 0
        metric = {
            "step": step,
            "loss": round(loss, 4),
            "learning_rate": learning_rate,
            "elapsed_seconds": round(elapsed, 1),
            "gpu_memory_gb": gpu_memory_gb,
            "tokens_per_sec": tokens_per_sec,
            # NOTE: datetime.utcnow() is deprecated since Python 3.12;
            # kept for output compatibility (naive UTC, no offset suffix).
            "timestamp": datetime.utcnow().isoformat(),
        }
        self.metrics_history.append(metric)
        if step % 50 == 0:
            logger.info(
                f"Step {step}: loss={loss:.4f} lr={learning_rate:.2e} "
                f"gpu={gpu_memory_gb}GB tokens/s={tokens_per_sec}"
            )

    def get_summary(self):
        """Summarize the run: loss trajectory, duration, throughput.

        Returns {} when no steps have been logged.
        """
        if not self.metrics_history:
            return {}
        losses = [m["loss"] for m in self.metrics_history]
        initial = losses[0]
        # FIX: guard against ZeroDivisionError when the first logged loss is 0.
        reduction_pct = round((1 - min(losses) / initial) * 100, 1) if initial else 0.0
        return {
            "total_steps": len(self.metrics_history),
            "initial_loss": initial,
            "final_loss": losses[-1],
            "best_loss": min(losses),
            "loss_reduction_pct": reduction_pct,
            "total_time_hours": round(
                self.metrics_history[-1]["elapsed_seconds"] / 3600, 2
            ),
            "avg_tokens_per_sec": round(
                sum(m.get("tokens_per_sec", 0) for m in self.metrics_history
                    if m.get("tokens_per_sec"))
                / max(sum(1 for m in self.metrics_history if m.get("tokens_per_sec")), 1),
                0
            ),
        }

    def detect_issues(self, gpu_vram_gb=24.0):
        """Heuristic checks over the last 10 steps; returns warning strings.

        Args:
            gpu_vram_gb: total VRAM of the training GPU, used for the
                near-limit check (previously hard-coded to 24 GB; the default
                preserves the old behavior).
        """
        issues = []
        if len(self.metrics_history) < 10:
            return issues
        recent_losses = [m["loss"] for m in self.metrics_history[-10:]]
        # Loss monotonically non-decreasing over the window.
        if all(recent_losses[i] >= recent_losses[i - 1] for i in range(1, len(recent_losses))):
            issues.append("Loss not decreasing — consider reducing learning rate")
        # Sudden >50% jump between the last two steps.
        if len(recent_losses) >= 2 and recent_losses[-1] > recent_losses[-2] * 1.5:
            issues.append("Loss spike detected — possible gradient explosion")
        # NaN never equals itself — cheap stdlib-free NaN detection.
        if any(m["loss"] != m["loss"] for m in self.metrics_history):
            issues.append("NaN loss detected — reduce learning rate or check data")
        # Peak GPU memory within 5% of capacity.
        gpu_mems = [m.get("gpu_memory_gb") for m in self.metrics_history if m.get("gpu_memory_gb")]
        if gpu_mems and max(gpu_mems) > 0.95 * gpu_vram_gb:
            issues.append("GPU memory near limit — reduce batch size or seq length")
        return issues
class LoRAOptimizer:
    """Heuristic LoRA hyperparameter recommendation for a given VRAM budget."""

    @staticmethod
    def optimize_config(available_vram_gb, model_params_b, target_quality="balanced"):
        """Pick a LoRA config preset and downgrade it if VRAM is too tight.

        Args:
            available_vram_gb: VRAM budget of the target GPU, in GB.
            model_params_b: base model size in billions of parameters.
            target_quality: one of "minimal"/"balanced"/"quality"/"maximum";
                unknown values fall back to "balanced".

        Returns a dict with the (possibly downgraded) config, the VRAM
        estimate for that config, and whether it fits the budget.
        """
        configs = {
            "minimal": {"rank": 8, "alpha": 16, "batch": 1, "seq": 1024, "quant": "4bit"},
            "balanced": {"rank": 16, "alpha": 32, "batch": 4, "seq": 2048, "quant": "4bit"},
            "quality": {"rank": 32, "alpha": 64, "batch": 4, "seq": 4096, "quant": "4bit"},
            "maximum": {"rank": 64, "alpha": 128, "batch": 8, "seq": 4096, "quant": None},
        }
        config = configs.get(target_quality, configs["balanced"])

        def _estimate(cfg):
            # 4-bit weights ~0.5 bytes/param, bf16 ~2 bytes/param.
            model_gb = model_params_b * (0.5 if cfg["quant"] == "4bit" else 2)
            overhead_gb = cfg["batch"] * cfg["seq"] * 0.001  # rough activation estimate
            return model_gb + overhead_gb + 2  # +2 GB safety margin

        total_gb = _estimate(config)
        if total_gb > available_vram_gb:
            # Downgrade batch/seq, then re-estimate.
            # FIX: the estimate and "fits" flag were previously computed from
            # the ORIGINAL config, so the downgraded recommendation reported a
            # stale (too large) VRAM figure.
            config["batch"] = max(1, config["batch"] // 2)
            config["seq"] = max(512, config["seq"] // 2)
            total_gb = _estimate(config)
        return {
            "recommended_config": config,
            "estimated_vram_gb": round(total_gb, 1),
            "available_vram_gb": available_vram_gb,
            "fits": total_gb <= available_vram_gb,
        }
# Demo: step-by-step monitor usage (commented out — needs a real training loop).
# monitor = TrainingMonitor()
# monitor.start()
# for step in range(100):
# monitor.log_step(step, loss=2.5 - step*0.02, learning_rate=2e-4, gpu_memory_gb=15.3)
# print(json.dumps(monitor.get_summary(), indent=2))

# Recommend a balanced config for a 7B model on a 24 GB GPU.
optimizer = LoRAOptimizer()
print(json.dumps(optimizer.optimize_config(24, 7, "balanced"), indent=2))
Production Deployment และ Serving
Deploy LoRA model สำหรับ production
# === LoRA Model Deployment ===
# Reference walkthrough: merge adapter -> (optional) GGUF convert -> serve.
# Most commands are commented out; uncomment the path you need.

# 1. Merge LoRA Adapter with Base Model
# ===================================
# Merging bakes the adapter into the base weights so any server that loads a
# plain HF model can serve it (no PEFT required at inference time).
#!/usr/bin/env python3
# merge_and_export.py
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
# import torch
#
# base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# lora_path = "./lora-output"
# merged_path = "./merged-model"
#
# # Load base model
# model = AutoModelForCausalLM.from_pretrained(
# base_model_id, torch_dtype=torch.bfloat16, device_map="cpu"
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model_id)
#
# # Load and merge LoRA
# model = PeftModel.from_pretrained(model, lora_path)
# model = model.merge_and_unload()
#
# # Save merged model
# model.save_pretrained(merged_path)
# tokenizer.save_pretrained(merged_path)
# print(f"Merged model saved to {merged_path}")

# 2. Convert to GGUF for llama.cpp
# ===================================
# git clone https://github.com/ggerganov/llama.cpp
# cd llama.cpp
# pip install -r requirements.txt
#
# python convert_hf_to_gguf.py ../merged-model --outfile model.gguf --outtype f16
#
# # Quantize for smaller size
# ./llama-quantize model.gguf model-q4_k_m.gguf Q4_K_M

# 3. Serve with vLLM
# ===================================
pip install vllm

# Serve merged model
# python -m vllm.entrypoints.openai.api_server \
# --model ./merged-model \
# --dtype bfloat16 \
# --max-model-len 4096 \
# --port 8000

# Or serve base model + LoRA adapter (dynamic loading)
# python -m vllm.entrypoints.openai.api_server \
# --model mistralai/Mistral-7B-Instruct-v0.3 \
# --enable-lora \
# --lora-modules my-lora=./lora-output \
# --max-lora-rank 32

# Test API
# NOTE: requires one of the (commented) vLLM servers above to be running on
# port 8000; "model": "my-lora" matches the --lora-modules name.
curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "my-lora",
"messages": [{"role": "user", "content": "Hello"}],
"max_tokens": 100
}' | python -m json.tool

# 4. Docker Deployment
# ===================================
# Dockerfile
# FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
# RUN pip install vllm transformers peft
# COPY merged-model /app/model
# EXPOSE 8000
# CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
# "--model", "/app/model", "--port", "8000"]

# docker build -t lora-server .
# docker run --gpus all -p 8000:8000 lora-server

# 5. Kubernetes Deployment
# ===================================
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: lora-server
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: lora-server
# template:
# spec:
# containers:
# - name: vllm
# image: lora-server:latest
# ports:
# - containerPort: 8000
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: 32Gi
# volumeMounts:
# - name: model-storage
# mountPath: /app/model
# volumes:
# - name: model-storage
# persistentVolumeClaim:
# claimName: model-pvc
echo "Deployment configured"
FAQ คำถามที่พบบ่อย
Q: LoRA rank ตั้งเท่าไหร่ดี?
A: rank 8-16 เพียงพอสำหรับ simple tasks เช่น classification, sentiment ใช้ VRAM น้อย training เร็ว rank 32-64 สำหรับ complex tasks เช่น instruction following, code generation ใช้ VRAM มากขึ้น แต่ quality ดีกว่า rank 128+ สำหรับ domain-specific knowledge injection ต้อง GPU ใหญ่ เริ่มจาก rank 16 แล้วเพิ่มถ้า quality ไม่พอ อย่าลืมปรับ alpha ตาม (alpha = 2x rank เป็น starting point ที่ดี)
Q: ต้องการ training data เท่าไหร่?
A: ขั้นต่ำ 100-500 examples สำหรับ simple task adaptation, 1,000-5,000 examples สำหรับ instruction following ที่ดี, 10,000+ examples สำหรับ domain-specific fine-tuning ที่ quality สูง คุณภาพสำคัญกว่าปริมาณ 1,000 examples ที่ clean และ high-quality ดีกว่า 10,000 examples ที่มี noise ใช้ format ที่ consistent และ diverse examples
Q: QLoRA กับ LoRA ต่างกันอย่างไร?
A: LoRA train LoRA adapters บน base model ที่เป็น fp16/bf16 ใช้ VRAM ตาม model size (7B model ~14GB) QLoRA quantize base model เป็น 4-bit ก่อน แล้ว train LoRA adapters ใช้ VRAM น้อยกว่ามาก (7B model ~6GB) Quality ของ QLoRA ใกล้เคียง LoRA (ต่างกัน 1-2% ใน benchmarks) สำหรับ GPU 16GB ขึ้นไป ใช้ QLoRA กับ 7B models สำหรับ GPU 24GB+ ใช้ LoRA ตรงๆ ได้
Q: Training ใช้เวลาเท่าไหร่?
A: ขึ้นกับหลายปัจจัย dataset size, sequence length, batch size, GPU speed, number of epochs สำหรับ 7B model กับ QLoRA rank 16, batch 4, 1,000 examples, 3 epochs บน RTX 4090 ใช้เวลาประมาณ 15-30 นาที บน A100 80GB เร็วกว่า 2-3 เท่า สำหรับ 70B model ใช้เวลา 5-10 เท่าของ 7B Cloud GPU (A100/H100) คิดค่าใช้จ่ายประมาณ $2-5/ชั่วโมง สำหรับ training ง่ายๆ ไม่กี่ดอลลาร์
