SiamCafe.net Blog
Technology

LLM Fine-tuning LoRA Progressive Delivery

llm fine tuning lora progressive delivery
LLM Fine-tuning LoRA Progressive Delivery | SiamCafe Blog
2025-11-09· อ. บอม — SiamCafe.net· 8,873 คำ

LLM Fine-tuning LoRA

LLM Fine-tuning LoRA Low-Rank Adaptation QLoRA Memory Efficient Trainable Parameters Progressive Delivery Canary Feature Flags A/B Testing Model Deployment

| Method | Parameters | VRAM (7B) | Speed | Quality |
|---|---|---|---|---|
| Full Fine-tune | 100% | ~60GB | ช้า | สูงสุด |
| LoRA | ~1% | ~24GB | เร็ว | สูง |
| QLoRA | ~1% | ~12GB | เร็ว | สูง |
| Prompt Tuning | ~0.01% | ~16GB | เร็วมาก | ปานกลาง |
| Prefix Tuning | ~0.1% | ~18GB | เร็ว | ปานกลาง-สูง |

LoRA Fine-tuning Implementation

# === LoRA Fine-tuning with PEFT ===

# pip install transformers peft datasets accelerate bitsandbytes

# from transformers import (
#     AutoModelForCausalLM, AutoTokenizer,
#     TrainingArguments, Trainer, BitsAndBytesConfig,
# )
# from peft import LoraConfig, get_peft_model, TaskType
# from datasets import load_dataset
# import torch
#
# # QLoRA: 4-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
# )
#
# # Load Base Model
# model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-2-7b-hf",
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True,
# )
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# tokenizer.pad_token = tokenizer.eos_token
#
# # LoRA Config
# lora_config = LoraConfig(
#     r=16,              # Rank
#     lora_alpha=32,     # Alpha
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM,
# )
#
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# # trainable params: 4,194,304 || all params: 6,742,609,920 || 0.06%
#
# # Training
# training_args = TrainingArguments(
#     output_dir="./lora-output",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     learning_rate=2e-4,
#     warmup_steps=100,
#     logging_steps=10,
#     save_strategy="epoch",
#     fp16=True,
#     optim="paged_adamw_8bit",
# )
#
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
# )
# trainer.train()
# model.save_pretrained("./lora-adapter")

from dataclasses import dataclass
from typing import List, Dict

@dataclass
class LoRAExperiment:
    """One LoRA fine-tuning run: its configuration plus measured results."""
    # --- configuration ---
    name: str            # experiment label, e.g. "v1-baseline"
    base_model: str      # base checkpoint that was adapted, e.g. "Llama-2-7B"
    rank: int            # LoRA rank r (size of the low-rank update matrices)
    alpha: int           # LoRA alpha scaling factor (here always 2 * rank)
    dataset_size: int    # number of training examples
    epochs: int          # number of training epochs
    # --- measured results ---
    train_loss: float    # final training loss
    eval_loss: float     # final evaluation loss
    vram_gb: float       # peak GPU memory used, in GB
    time_hours: float    # wall-clock training time, in hours

# Raw results of the successive fine-tuning iterations, one tuple per run,
# in LoRAExperiment field order.
_RUNS = [
    ("v1-baseline", "Llama-2-7B", 8, 16, 5000, 3, 1.25, 1.32, 14.5, 2.1),
    ("v2-rank16", "Llama-2-7B", 16, 32, 5000, 3, 1.18, 1.24, 15.2, 2.5),
    ("v3-more-data", "Llama-2-7B", 16, 32, 15000, 3, 1.05, 1.12, 15.2, 6.8),
    ("v4-rank32", "Llama-2-7B", 32, 64, 15000, 5, 0.95, 1.08, 16.8, 11.2),
    ("v5-mistral", "Mistral-7B", 16, 32, 15000, 3, 0.98, 1.05, 15.5, 7.0),
]
experiments = [LoRAExperiment(*run) for run in _RUNS]

# Print a short report for every experiment.
print("=== LoRA Experiments ===")
for exp in experiments:
    print(f"\n  [{exp.name}] {exp.base_model}")
    print(f"    Rank: {exp.rank} | Alpha: {exp.alpha} | Data: {exp.dataset_size:,}")
    print(f"    Loss: {exp.train_loss:.2f}/{exp.eval_loss:.2f} | "
          f"VRAM: {exp.vram_gb}GB | Time: {exp.time_hours}h")

Progressive Delivery for ML Models

# === Progressive Delivery Pipeline ===

# Canary Deployment for LLM
# 1. Deploy new model as canary (5% traffic)
# 2. Compare metrics: latency, quality, error rate
# 3. If OK: increase to 25%, 50%, 100%
# 4. If bad: rollback to previous model

# Kubernetes Canary with Istio
# apiVersion: networking.istio.io/v1beta1
# kind: VirtualService
# metadata:
#   name: llm-service
# spec:
#   hosts:
#   - llm-service
#   http:
#   - route:
#     - destination:
#         host: llm-service
#         subset: stable
#       weight: 95
#     - destination:
#         host: llm-service
#         subset: canary
#       weight: 5
#
# ---
# apiVersion: networking.istio.io/v1beta1
# kind: DestinationRule
# metadata:
#   name: llm-service
# spec:
#   host: llm-service
#   subsets:
#   - name: stable
#     labels:
#       version: v3-lora
#   - name: canary
#     labels:
#       version: v4-lora

@dataclass
class DeliveryStage:
    """One stage of a progressive (canary) rollout for a model version."""
    stage: str                # human-readable stage name, e.g. "Canary 5%"
    traffic_pct: int          # share of traffic routed to the new model, in percent
    duration_min: int         # how long this stage is observed, in minutes
    metrics_check: List[str]  # gate conditions that must hold before advancing
    status: str               # current state, e.g. "Passed" / "In Progress"

# Gate conditions shared by all canary stages; copied per stage so each
# DeliveryStage owns an independent list, as in the original literals.
_STANDARD_CHECKS = [
    "Latency P99 < 500ms", "Error rate < 0.1%", "Quality score > 0.85",
]

canary_stages = [
    DeliveryStage("Canary 5%", 5, 30, list(_STANDARD_CHECKS), "Passed"),
    DeliveryStage("Canary 25%", 25, 60, list(_STANDARD_CHECKS), "Passed"),
    DeliveryStage("Canary 50%", 50, 120, list(_STANDARD_CHECKS), "Passed"),
    DeliveryStage("Full Rollout", 100, 0,
        ["All metrics stable for 24h"], "In Progress"),
]

# Report every rollout stage together with its gate conditions.
print("\n=== Progressive Delivery ===")
for stage in canary_stages:
    print(f"  [{stage.status}] {stage.stage} ({stage.traffic_pct}%)")
    print(f"    Duration: {stage.duration_min}min")
    for condition in stage.metrics_check:
        print(f"    - {condition}")

# A/B Testing for Models
# A/B test comparison between the stable model (A) and the canary (B).
# Both variants report the same metrics, so the keys are shared.
_AB_METRIC_KEYS = ("latency_p99", "quality_score", "user_satisfaction", "cost_per_1k")

ab_results = {
    "Model A (v3-lora)": dict(zip(_AB_METRIC_KEYS,
                                  ("380ms", 0.87, "4.2/5", "$0.12"))),
    "Model B (v4-lora)": dict(zip(_AB_METRIC_KEYS,
                                  ("420ms", 0.91, "4.5/5", "$0.14"))),
}

# Print each variant with its metrics, one per line.
print("\n\nA/B Test Results:")
for variant, stats in ab_results.items():
    print(f"\n  [{variant}]")
    for key, value in stats.items():
        print(f"    {key}: {value}")

Evaluation และ Monitoring

# === Model Evaluation & Monitoring ===

# Evaluation Script
# from transformers import pipeline
# import json
#
# # Load fine-tuned model
# pipe = pipeline("text-generation",
#     model="./merged-model",
#     tokenizer="./merged-model",
#     max_new_tokens=256,
#     temperature=0.7,
# )
#
# # Evaluate on test set
# with open("test_data.jsonl") as f:
#     test_data = [json.loads(line) for line in f]
#
# correct = 0
# for item in test_data:
#     output = pipe(item["prompt"])[0]["generated_text"]
#     if evaluate_quality(output, item["expected"]) > 0.8:
#         correct += 1
#
# accuracy = correct / len(test_data)
# print(f"Accuracy: {accuracy:.2%}")

# v3 (stable) vs v4 (canary) evaluation, one row tuple per metric:
# (metric name, v3 value, v4 value, relative change).
# NOTE(review): for "Latency P99" and "Cost/1K req" the positive delta is a
# regression, not an improvement — the key name is kept for compatibility.
_EVAL_ROWS = [
    ("Perplexity", 8.5, 7.2, "-15.3%"),
    ("BLEU Score", 0.42, 0.48, "+14.3%"),
    ("ROUGE-L", 0.55, 0.61, "+10.9%"),
    ("Human Eval", "4.2/5", "4.5/5", "+7.1%"),
    ("Latency P99", "380ms", "420ms", "+10.5%"),
    ("Cost/1K req", "$0.12", "$0.14", "+16.7%"),
]
eval_metrics = {
    metric: {"v3": old, "v4": new, "improvement": change}
    for metric, old, new, change in _EVAL_ROWS
}

# Print a fixed-width comparison table.
print("Model Evaluation:")
print(f"  {'Metric':<16} {'v3 (stable)':>12} {'v4 (canary)':>12} {'Change':>10}")
for metric, row in eval_metrics.items():
    print(f"  {metric:<16} {str(row['v3']):>12} {str(row['v4']):>12} "
          f"{row['improvement']:>10}")

# Production alert rules: condition → action taken when it fires.
alerts = [
    "Latency P99 > 500ms for 5 min → Rollback",
    "Error rate > 1% for 3 min → Rollback",
    "Quality score < 0.80 for 10 min → Pause & Investigate",
    "VRAM usage > 90% → Scale up or Quantize",
    "Throughput drop > 30% → Check batch size / scaling",
]

# Print the rules as a numbered list (1-based).
print("\n\nProduction Alerts:")
for rule_no, rule in enumerate(alerts, start=1):
    print(f"  {rule_no}. {rule}")

เคล็ดลับ

LoRA คืออะไร

LoRA (Low-Rank Adaptation) คือเทคนิคการ Fine-tune LLM ที่ประหยัด Memory โดยฝึกเฉพาะ Low-rank Matrices ขนาดเล็กที่แทรกเข้าไปใน Attention Layers ทำให้มี Trainable Parameters น้อยมาก สามารถฝึกได้บน GPU เดียว และ Merge Adapter กลับเข้า Base Model ได้ภายหลัง

Progressive Delivery คืออะไร

Progressive Delivery คือการค่อยๆ ปล่อย Version ใหม่สู่ Production โดยเริ่มจาก Canary Release ที่รับ Traffic เพียงเล็กน้อย ตรวจสอบ Metrics ก่อนขยายสัดส่วน ใช้ร่วมกับ Feature Flags และ A/B Testing เพื่อลดความเสี่ยงจาก Model Regression

Fine-tune LLM ต้องใช้ GPU อะไร

LoRA กับ Model 7B ใช้ RTX 4090 (24GB) ได้ ส่วน QLoRA ใช้เพียง ~12GB; Model 13B ควรใช้ A100 40GB และ Model 70B ต้องใช้ A100 80GB หลายใบ หรือเช่า Cloud GPU ราคาประมาณ $1-3/ชั่วโมง

Dataset สำหรับ Fine-tuning ต้องมีเท่าไหร่

เริ่มต้นที่ 1,000-5,000 ตัวอย่างคุณภาพสูงก็เพียงพอ งาน Production มักใช้ 10,000-50,000 ตัวอย่าง โดยคุณภาพของข้อมูล (Data Quality) สำคัญกว่าปริมาณ และควรจัดข้อมูลให้อยู่ใน Instruction Format ที่สม่ำเสมอ

สรุป

LLM Fine-tuning LoRA QLoRA Low-Rank Adaptation PEFT Progressive Delivery Canary Feature Flags A/B Testing Evaluation Perplexity BLEU ROUGE Monitoring Rollback GPU VRAM Dataset Quality

📖 บทความที่เกี่ยวข้อง

LLM Fine-tuning LoRA API Integration เชื่อมต่อระบบอ่านบทความ → LLM Fine-tuning LoRA Real-time Processingอ่านบทความ → LLM Fine-tuning LoRA Domain Driven Design DDDอ่านบทความ → LLM Fine-tuning LoRA GitOps Workflowอ่านบทความ → LLM Fine-tuning LoRA Security Hardening ป้องกันแฮกอ่านบทความ →

📚 ดูบทความทั้งหมด →