LLM Quantization GGUF กับ Container

LLM Quantization GGUF

LLM Quantization คือการลดความละเอียดของ Weights ของ Model จาก FP32/FP16 เป็น INT8/INT4 ทำให้ขนาด Model ลดลงประมาณ 2-4 เท่า ใช้ RAM น้อยลง Inference เร็วขึ้น และรันบน Consumer Hardware ได้

เนื้อหาเกี่ยวข้อง — machine learning algorithm คือ

GGUF (GPT-Generated Unified Format) เป็นรูปแบบไฟล์ของ llama.cpp ที่รวม Model Weights, Tokenizer และ Metadata ไว้ในไฟล์เดียว รองรับ Quantization ตั้งแต่ Q2_K ถึง Q8_0 และรันบน CPU ได้โดยไม่ต้องใช้ GPU

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Vector Database Pinecone Message Queue Design —

Quant Level	Bits	Size (7B)	RAM	Quality
Q8_0	8-bit	7.2 GB	9.7 GB	ดีมาก ใกล้ FP16
Q6_K	6-bit	5.5 GB	8.0 GB	ดีมาก
Q5_K_M	5-bit	4.8 GB	7.3 GB	ดี (แนะนำ)
Q4_K_M	4-bit	4.1 GB	6.6 GB	ดี
Q3_K_M	3-bit	3.3 GB	5.8 GB	พอใช้
Q2_K	2-bit	2.7 GB	5.2 GB	ลดลงมาก

Quantization Process

# === LLM Quantization with llama.cpp ===

# 1. Clone llama.cpp
# git clone https://github.com/ggerganov/llama.cpp
# cd llama.cpp
# make -j$(nproc)

# 2. Download Model (Hugging Face)
# pip install huggingface-hub
# huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir ./models/llama-2-7b

# 3. Convert to GGUF
# python convert_hf_to_gguf.py ./models/llama-2-7b --outfile llama-2-7b-f16.gguf --outtype f16

# 4. Quantize
# ./llama-quantize llama-2-7b-f16.gguf llama-2-7b-Q5_K_M.gguf Q5_K_M
# ./llama-quantize llama-2-7b-f16.gguf llama-2-7b-Q4_K_M.gguf Q4_K_M
# ./llama-quantize llama-2-7b-f16.gguf llama-2-7b-Q8_0.gguf Q8_0

# 5. Test Inference
# ./llama-cli -m llama-2-7b-Q5_K_M.gguf -p "Hello, how are you?" -n 128

# 6. Start API Server
# ./llama-server -m llama-2-7b-Q5_K_M.gguf --host 0.0.0.0 --port 8080 -c 4096 -ngl 35

# 7. Test API
# curl http://localhost:8080/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"model": "llama-2-7b", "messages": [{"role": "user", "content": "Hello"}]}'

from dataclasses import dataclass
from typing import List

@dataclass
class QuantConfig:
    """One quantization variant of a model and its resource/quality figures.

    Instances are built positionally in the field order declared below.
    """

    # Name of the model this variant belongs to.
    model_name: str
    # Size of the original (unquantized) model, in GB.
    original_size_gb: float
    # Quantization level label, e.g. "Q5_K_M".
    quant_level: str
    # Size of the quantized model, in GB.
    quant_size_gb: float
    # RAM required to run this variant, in GB.
    ram_required_gb: float
    # Quality score on a 0-1 scale.
    quality_score: float
    # Throughput in tokens per second.
    tokens_per_sec: float

class QuantizationPipeline:
    """Collects quantization variants and picks one that fits a RAM budget."""

    def __init__(self):
        # Variants in the order they were added; show_comparison() prints
        # them in this order.
        self.configs: List["QuantConfig"] = []

    def add(self, config: "QuantConfig"):
        """Register one quantization variant."""
        self.configs.append(config)

    def recommend(self, available_ram_gb: float) -> "QuantConfig":
        """Recommend a quantization level for the given amount of RAM.

        Returns the highest-quality variant whose ``ram_required_gb`` fits in
        ``available_ram_gb``. If nothing fits, returns the variant with the
        lowest RAM requirement as a best-effort fallback.

        Raises:
            IndexError: if no variants have been added.
        """
        if not self.configs:
            # Same exception type the bare list indexing used to raise,
            # but with a message that explains the cause.
            raise IndexError("no quantization configs have been added")

        candidates = [c for c in self.configs if c.ram_required_gb <= available_ram_gb]
        if candidates:
            return max(candidates, key=lambda c: c.quality_score)

        # Nothing fits: choose the smallest by RAM requirement explicitly
        # instead of assuming the last-added variant is the smallest.
        return min(self.configs, key=lambda c: c.ram_required_gb)

    def show_comparison(self):
        """Print a one-line summary per variant with a 20-character quality bar."""
        if not self.configs:
            # The header needs the first variant's model name; with no
            # variants there is nothing to compare.
            print("\nNo quantization configs to compare.")
            return

        print(f"\n{'='*60}")
        print(f"Quantization Comparison: {self.configs[0].model_name}")
        print(f"{'='*60}")
        for c in self.configs:
            # quality_score is 0-1, so this yields at most 20 '#' characters.
            bar = "#" * int(c.quality_score * 20)
            print(f"  {c.quant_level:<8} Size: {c.quant_size_gb:.1f}GB "
                  f"RAM: {c.ram_required_gb:.1f}GB "
                  f"Quality: [{bar:<20}] {c.quality_score:.0%} "
                  f"Speed: {c.tokens_per_sec:.0f} t/s")

# Demo: register the Llama-2-7B quantization variants and print a comparison.
pipeline = QuantizationPipeline()

# Row columns: quant level, quantized size (GB), RAM required (GB),
# quality score (0-1), tokens per second. Model name and original size
# are the same for every row.
configs = [
    QuantConfig("Llama-2-7B", 13.5, *row)
    for row in (
        ("Q8_0", 7.2, 9.7, 0.95, 25),
        ("Q6_K", 5.5, 8.0, 0.92, 30),
        ("Q5_K_M", 4.8, 7.3, 0.88, 35),
        ("Q4_K_M", 4.1, 6.6, 0.83, 40),
        ("Q3_K_M", 3.3, 5.8, 0.75, 45),
        ("Q2_K", 2.7, 5.2, 0.60, 50),
    )
]
for c in configs:
    pipeline.add(c)

pipeline.show_comparison()

# Ask the pipeline which variant to use with 8 GB of RAM and report it.
rec = pipeline.recommend(8.0)
print(f"\n  Recommended for 8GB RAM: {rec.quant_level} ({rec.quant_size_gb}GB)")

Docker Container

# Dockerfile — llama.cpp Server Container
# FROM ubuntu:22.04 AS builder
# RUN apt-get update && apt-get install -y build-essential git cmake
# RUN git clone https://github.com/ggerganov/llama.cpp /llama.cpp
# WORKDIR /llama.cpp
# RUN cmake -B build -DLLAMA_SERVER=ON && cmake --build build -j$(nproc)
#
# FROM ubuntu:22.04
# COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/
# RUN mkdir /models
# EXPOSE 8080
# ENTRYPOINT ["llama-server"]
# CMD ["-m", "/models/model.gguf", "--host", "0.0.0.0", "--port", "8080", "-c", "4096"]

# docker-compose.yml
# version: "3.8"
# services:
#   llm-server:
#     build: .
#     ports:
#       - "8080:8080"
#     volumes:
#       - ./models:/models
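#     # The image's ENTRYPOINT/CMD does not read MODEL_PATH-style variables,
#     # so pass the settings as arguments (this replaces the image's CMD).
#     command:
#       - "-m"
#       - "/models/llama-2-7b-Q5_K_M.gguf"
#       - "--host"
#       - "0.0.0.0"
#       - "--port"
#       - "8080"
#       - "-c"
#       - "4096"
#       - "-ngl"
#       - "35"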
#     deploy:
#       resources:
#         limits:
#           memory: 8G
#           cpus: "4"
#         reservations:
#           memory: 6G

# Kubernetes Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: llm-server
# spec:
#   replicas: 3
#   selector:
#     matchLabels:
#       app: llm-server
#   template:
#     metadata:
#       labels:
#         app: llm-server
#     spec:
#       containers:
#       - name: llm
#         image: llama-cpp-server:latest
#         args:
#         - "-m"
#         - "/models/llama-2-7b-Q5_K_M.gguf"
#         - "--host"
#         - "0.0.0.0"
#         - "--port"
#         - "8080"
#         - "-c"
#         - "4096"
#         ports:
#         - containerPort: 8080
#         resources:
#           requests:
#             memory: 6Gi
#             cpu: 2
#           limits:
#             memory: 8Gi
#             cpu: 4
#         volumeMounts:
#         - name: models
#           mountPath: /models
#         readinessProbe:
#           httpGet:
#             path: /health
#             port: 8080
#           initialDelaySeconds: 30
#       volumes:
#       - name: models
#         persistentVolumeClaim:
#           claimName: model-pvc

# Suggested resources per deployment profile. Values are display strings,
# not parsed quantities.
container_configs = {
    "Small (7B Q4)": dict(ram="6-8 GB", cpu="2-4 cores", replicas="3-5"),
    "Medium (13B Q4)": dict(ram="10-12 GB", cpu="4-8 cores", replicas="2-3"),
    "Large (70B Q4)": dict(ram="40-48 GB", cpu="8-16 cores", replicas="1-2"),
    "GPU (7B FP16)": dict(ram="16 GB VRAM", cpu="4 cores", replicas="3-5"),
}

# Print each profile as a bracketed heading followed by its indented specs.
print("Container Configurations:")
for profile, resources in container_configs.items():
    print(f"\n  [{profile}]")
    print("\n".join(f"    {field}: {setting}" for field, setting in resources.items()))

Production Architecture

# production.py — Production LLM Architecture
# Maps each component of the serving stack to a one-line description.
architecture = {
    "Load Balancer": "Nginx/Traefik กระจาย Requests ไป LLM Pods",
    "LLM Pods": "llama.cpp Server 3-5 Replicas + HPA",
    "Model Storage": "PVC (NFS/EBS) แชร์ GGUF Files ข้าม Pods",
    "Cache": "Redis Cache ผลลัพธ์ที่เคยถามแล้ว (Semantic Cache)",
    "Queue": "Redis/Kafka Queue Requests ป้องกัน Overload",
    "Monitoring": "Prometheus + Grafana: Tokens/s, Latency, Memory",
    "Rate Limiting": "จำกัด Requests/min ต่อ User",
}

# Each component prints as a bracketed name with its description indented
# on the following line.
print("Production LLM Architecture:")
for layer, role in architecture.items():
    print(f"  [{layer}]\n    {role}")

# Popular GGUF Models
# Per model: Q5 size, context window and license, all as display strings.
models = {
    "Llama-3-8B": dict(size_q5="5.5 GB", context="8K", license="Meta"),
    "Mistral-7B": dict(size_q5="4.8 GB", context="32K", license="Apache 2.0"),
    "Phi-3-mini": dict(size_q5="2.4 GB", context="128K", license="MIT"),
    "Gemma-2-9B": dict(size_q5="6.1 GB", context="8K", license="Google"),
    "Qwen2-7B": dict(size_q5="4.8 GB", context="128K", license="Apache 2.0"),
    "CodeLlama-7B": dict(size_q5="4.8 GB", context="16K", license="Meta"),
}

# Two leading newlines separate this listing from whatever was printed before.
print("\n\nPopular GGUF Models (Q5_K_M):")
for name, details in models.items():
    size, ctx, lic = details["size_q5"], details["context"], details["license"]
    print(f"  {name}: {size} | Context: {ctx} | License: {lic}")