SiamCafe.net Blog
Technology

LLM Quantization GGUF Automation Script

llm quantization gguf automation script
LLM Quantization GGUF Automation Script | SiamCafe Blog
2026-05-11· อ. บอม — SiamCafe.net· 1,717 คำ

LLM Quantization GGUF Automation Script คืออะไร

LLM Quantization คือการลดขนาดของ Large Language Model โดยแปลง weights จาก FP32/FP16 เป็น INT8/INT4 ทำให้ model เล็กลง 2-4 เท่า ใช้ RAM/VRAM น้อยลง และ inference เร็วขึ้น โดยคุณภาพลดลงเพียงเล็กน้อย GGUF (GPT-Generated Unified Format) เป็น format มาตรฐานของ llama.cpp สำหรับรัน quantized LLMs บน CPU และ GPU รองรับ models เช่น Llama, Mistral, Phi, Gemma และอื่นๆ บทความนี้อธิบายวิธี quantize LLMs เป็น GGUF format พร้อม Python automation scripts

Quantization Fundamentals

# quant_basics.py — Quantization fundamentals
import json

class QuantizationBasics:
    TYPES = {
        "fp32": {"name": "FP32 (Full Precision)", "bits": 32, "size_7b": "28 GB", "quality": "100%", "speed": "1x (baseline)"},
        "fp16": {"name": "FP16 (Half Precision)", "bits": 16, "size_7b": "14 GB", "quality": "~99.9%", "speed": "1.5-2x"},
        "q8_0": {"name": "Q8_0 (8-bit)", "bits": 8, "size_7b": "7.2 GB", "quality": "~99.5%", "speed": "2-3x"},
        "q6_k": {"name": "Q6_K (6-bit)", "bits": 6, "size_7b": "5.5 GB", "quality": "~99%", "speed": "2.5-3.5x"},
        "q5_k_m": {"name": "Q5_K_M (5-bit medium)", "bits": 5, "size_7b": "4.8 GB", "quality": "~98%", "speed": "3-4x"},
        "q4_k_m": {"name": "Q4_K_M (4-bit medium)", "bits": 4, "size_7b": "4.1 GB", "quality": "~96%", "speed": "3.5-4.5x"},
        "q3_k_m": {"name": "Q3_K_M (3-bit medium)", "bits": 3, "size_7b": "3.3 GB", "quality": "~92%", "speed": "4-5x"},
        "q2_k": {"name": "Q2_K (2-bit)", "bits": 2, "size_7b": "2.7 GB", "quality": "~85%", "speed": "5-6x"},
    }

    RECOMMENDED = {
        "best_quality": "Q8_0 — ใกล้เคียง FP16 มาก เหมาะถ้ามี RAM พอ",
        "balanced": "Q5_K_M หรือ Q4_K_M — balance ระหว่าง quality และ size",
        "small_ram": "Q4_K_M — ใช้ RAM น้อย คุณภาพยังดี (แนะนำที่สุด)",
        "minimal": "Q3_K_M — สำหรับ RAM จำกัดมาก (< 4GB)",
        "avoid": "Q2_K — คุณภาพลดลงมาก ไม่แนะนำสำหรับงานจริง",
    }

    def show_types(self):
        print("=== Quantization Types ===\n")
        print(f"  {'Type':<20} {'Bits':<6} {'Size (7B)':<10} {'Quality':<10} {'Speed'}")
        for key, q in self.TYPES.items():
            print(f"  {q['name']:<20} {q['bits']:<6} {q['size_7b']:<10} {q['quality']:<10} {q['speed']}")

    def show_recommended(self):
        print(f"\n=== Recommendations ===")
        for key, rec in self.RECOMMENDED.items():
            print(f"  [{key}] {rec}")

# Demo run: print the comparison table, then the per-use-case advice.
basics = QuantizationBasics()
for section in (basics.show_types, basics.show_recommended):
    section()

GGUF Format

# gguf_format.py — GGUF format overview
import json

class GGUFFormat:
    """Overview of the GGUF model file format and the tools that consume it.

    INFO holds descriptive metadata (runtime display strings, some in Thai);
    METADATA is a commented sketch of the binary file layout.
    """

    INFO = {
        "name": "GGUF (GPT-Generated Unified Format)",
        "creator": "ggerganov (llama.cpp)",
        "predecessor": "GGML → GGUF (v3)",
        "features": [
            "Single file format — model + tokenizer + metadata ในไฟล์เดียว",
            "Extensible metadata — เก็บข้อมูล model config, quantization info",
            "Multiple quantization types ในไฟล์เดียวกัน (mixed quantization)",
            "Memory-mapped loading — โหลดเร็ว ใช้ RAM น้อย",
            "Cross-platform — Windows, macOS, Linux, Android, iOS",
        ],
        "tools": {
            "llama.cpp": "Inference engine หลักสำหรับ GGUF",
            "ollama": "ใช้ GGUF internally — ง่ายที่สุด",
            "lm-studio": "GUI สำหรับรัน GGUF models",
            "koboldcpp": "GGUF inference + web UI",
            "text-generation-webui": "Oobabooga — รองรับ GGUF",
        },
    }

    METADATA = """
# GGUF file structure
# Header:
#   - Magic number: GGUF
#   - Version: 3
#   - Tensor count
#   - Metadata KV count
#
# Metadata (key-value pairs):
#   - general.architecture: "llama"
#   - general.name: "Llama-3-8B-Q4_K_M"
#   - llama.context_length: 8192
#   - llama.embedding_length: 4096
#   - tokenizer.ggml.model: "gpt2"
#   - general.quantization_version: 2
#
# Tensors:
#   - Weight data in quantized format
"""

    def show_info(self):
        """Print the format name and one bullet per feature."""
        info = self.INFO
        print("=== GGUF Format ===\n")
        print(f"  Name: {info['name']}")
        for feature in info['features']:
            print(f"  • {feature}")

    def show_tools(self):
        """Print each compatible tool with its description."""
        print("\n=== Compatible Tools ===")
        for name, description in self.INFO['tools'].items():
            print(f"  [{name}] {description}")

# Demo run: format overview followed by the compatible-tool list.
gguf = GGUFFormat()
for section in (gguf.show_info, gguf.show_tools):
    section()

Python Quantization Script

# quant_script.py — Automation script for GGUF quantization
import json

class QuantScript:
    """Wraps an example GGUF quantization automation script and prints a preview.

    CODE is a runtime string literal shown to the reader — it is never
    executed by this class, and its contents must stay verbatim.
    """

    CODE = """
# quantize_gguf.py — Automate LLM quantization to GGUF
import subprocess
import os
import json
import shutil
from pathlib import Path
from datetime import datetime

class GGUFQuantizer:
    def __init__(self, llama_cpp_dir="./llama.cpp"):
        self.llama_cpp = Path(llama_cpp_dir)
        self.convert_script = self.llama_cpp / "convert_hf_to_gguf.py"
        self.quantize_bin = self.llama_cpp / "build" / "bin" / "llama-quantize"
    
    def convert_hf_to_gguf(self, hf_model_dir, output_path=None):
        '''Convert HuggingFace model to GGUF FP16'''
        if output_path is None:
            model_name = Path(hf_model_dir).name
            output_path = f"./{model_name}-fp16.gguf"
        
        cmd = [
            "python3", str(self.convert_script),
            hf_model_dir,
            "--outfile", output_path,
            "--outtype", "f16",
        ]
        
        print(f"Converting {hf_model_dir} to GGUF FP16...")
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        if result.returncode == 0:
            size_gb = os.path.getsize(output_path) / (1024**3)
            return {"status": "success", "output": output_path, "size_gb": round(size_gb, 2)}
        return {"status": "error", "error": result.stderr}
    
    def quantize(self, input_gguf, quant_type="Q4_K_M", output_path=None):
        '''Quantize GGUF model'''
        if output_path is None:
            base = Path(input_gguf).stem.replace("-fp16", "")
            output_path = f"./{base}-{quant_type}.gguf"
        
        cmd = [
            str(self.quantize_bin),
            input_gguf,
            output_path,
            quant_type,
        ]
        
        print(f"Quantizing to {quant_type}...")
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        if result.returncode == 0:
            size_gb = os.path.getsize(output_path) / (1024**3)
            return {"status": "success", "output": output_path, "size_gb": round(size_gb, 2), "type": quant_type}
        return {"status": "error", "error": result.stderr}
    
    def batch_quantize(self, input_gguf, quant_types=None):
        '''Quantize to multiple types'''
        if quant_types is None:
            quant_types = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
        
        results = []
        for qt in quant_types:
            result = self.quantize(input_gguf, qt)
            results.append(result)
            if result['status'] == 'success':
                print(f"  {qt}: {result['size_gb']} GB")
        
        return results
    
    def full_pipeline(self, hf_model_dir, quant_types=None):
        '''Full pipeline: HF → GGUF FP16 → Quantize'''
        print(f"=== Full Quantization Pipeline ===")
        print(f"Model: {hf_model_dir}")
        print(f"Time: {datetime.now().isoformat()}")
        
        # Step 1: Convert to GGUF
        fp16_result = self.convert_hf_to_gguf(hf_model_dir)
        if fp16_result['status'] != 'success':
            return fp16_result
        
        print(f"FP16: {fp16_result['size_gb']} GB")
        
        # Step 2: Quantize
        results = self.batch_quantize(fp16_result['output'], quant_types)
        
        # Step 3: Summary
        summary = {
            'model': hf_model_dir,
            'fp16_size_gb': fp16_result['size_gb'],
            'quantizations': [r for r in results if r['status'] == 'success'],
            'timestamp': datetime.now().isoformat(),
        }
        
        return summary

# quantizer = GGUFQuantizer("./llama.cpp")
# result = quantizer.full_pipeline(
#     "./models/Llama-3-8B",
#     quant_types=["Q8_0", "Q5_K_M", "Q4_K_M"]
# )
"""

    def show_code(self):
        """Print the first 600 characters of the embedded script as a preview."""
        print("=== Quantization Script ===")
        print(self.CODE[:600])

# Demo run: preview the embedded automation script.
script = QuantScript()
script.show_code()

Quality Benchmark

# benchmark.py — Benchmark quantized models
import json
import random

class QualityBenchmark:
    """Holds an example GGUF benchmark script (as text) plus sample result data.

    CODE is a runtime string literal shown to the reader — never executed
    here, contents kept verbatim.
    """

    CODE = """
# benchmark_gguf.py — Benchmark GGUF quantized models
import subprocess
import json
import time

class GGUFBenchmark:
    def __init__(self, llama_cli="./llama.cpp/build/bin/llama-cli"):
        self.llama_cli = llama_cli
    
    def benchmark_model(self, model_path, prompt="Explain quantum computing in simple terms:",
                        n_predict=256, threads=4):
        '''Benchmark a single model'''
        cmd = [
            self.llama_cli,
            "-m", model_path,
            "-p", prompt,
            "-n", str(n_predict),
            "-t", str(threads),
            "--log-disable",
        ]
        
        start = time.time()
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        elapsed = time.time() - start
        
        output = result.stdout
        tokens = len(output.split())
        tokens_per_sec = n_predict / elapsed if elapsed > 0 else 0
        
        return {
            'model': model_path,
            'tokens_generated': n_predict,
            'time_seconds': round(elapsed, 2),
            'tokens_per_second': round(tokens_per_sec, 1),
            'output_preview': output[:200],
        }
    
    def compare_quantizations(self, models, prompt=None):
        '''Compare multiple quantized models'''
        results = []
        for model in models:
            result = self.benchmark_model(model, prompt or "Explain AI:")
            results.append(result)
        
        return sorted(results, key=lambda x: x['tokens_per_second'], reverse=True)

# bench = GGUFBenchmark()
# results = bench.compare_quantizations([
#     "model-Q8_0.gguf", "model-Q5_K_M.gguf", "model-Q4_K_M.gguf"
# ])
"""

    # Illustrative numbers for Llama 3 8B on CPU — presumably hand-entered
    # example data, not measured by this module (nothing here computes them).
    SAMPLE_RESULTS = [
        {"type": "FP16", "size": "14.0 GB", "perplexity": "5.12", "tokens_sec": "8.5"},
        {"type": "Q8_0", "size": "7.2 GB", "perplexity": "5.14", "tokens_sec": "15.2"},
        {"type": "Q6_K", "size": "5.5 GB", "perplexity": "5.18", "tokens_sec": "18.7"},
        {"type": "Q5_K_M", "size": "4.8 GB", "perplexity": "5.25", "tokens_sec": "21.3"},
        {"type": "Q4_K_M", "size": "4.1 GB", "perplexity": "5.38", "tokens_sec": "24.8"},
        {"type": "Q3_K_M", "size": "3.3 GB", "perplexity": "5.68", "tokens_sec": "28.1"},
        {"type": "Q2_K", "size": "2.7 GB", "perplexity": "6.45", "tokens_sec": "31.5"},
    ]

    def show_code(self):
        """Print the first 500 characters of the embedded benchmark script."""
        print("=== Benchmark Code ===")
        print(self.CODE[:500])

    def show_results(self):
        """Print the sample benchmark table to stdout."""
        print(f"\n=== Sample Benchmark (Llama 3 8B, CPU) ===")
        print(f"  {'Type':<10} {'Size':<10} {'Perplexity':<12} {'Tokens/sec'}")
        for r in self.SAMPLE_RESULTS:
            print(f"  {r['type']:<10} {r['size']:<10} {r['perplexity']:<12} {r['tokens_sec']}")

# Demo run: code preview followed by the sample results table.
bench = QualityBenchmark()
for section in (bench.show_code, bench.show_results):
    section()

Deployment

# deploy.py — Deploy GGUF models
import json

class GGUFDeployment:
    """Deployment snippets for GGUF models: Docker, Ollama, and serving options.

    DOCKER and OLLAMA_MODELFILE are runtime string literals shown to the
    reader (kept verbatim, including in-string line continuations).
    """

    DOCKER = """
# Dockerfile — llama.cpp server with GGUF model
FROM ghcr.io/ggerganov/llama.cpp:server

COPY ./models/model-Q4_K_M.gguf /models/model.gguf

EXPOSE 8080

CMD ["--model", "/models/model.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8080", \
     "--ctx-size", "4096", \
     "--threads", "4", \
     "--n-gpu-layers", "35"]
"""

    OLLAMA_MODELFILE = """
# Modelfile — Create Ollama model from GGUF
FROM ./model-Q4_K_M.gguf

PARAMETER temperature 0.7
PARAMETER num_ctx 4096
PARAMETER stop "<|eot_id|>"

SYSTEM "You are a helpful AI assistant."

# Build: ollama create mymodel -f Modelfile
# Run: ollama run mymodel
"""

    # NOTE(review): the "localai" entry has no "command" key; this is safe
    # because show_options only reads "name" and "description".
    SERVING_OPTIONS = {
        "llama_cpp_server": {
            "name": "llama.cpp Server",
            "description": "OpenAI-compatible API — lightweight, fast",
            "command": "./llama-server -m model.gguf --host 0.0.0.0 --port 8080",
        },
        "ollama": {
            "name": "Ollama",
            "description": "ง่ายที่สุด — ollama run model",
            "command": "ollama create mymodel -f Modelfile && ollama serve",
        },
        "localai": {
            "name": "LocalAI",
            "description": "OpenAI drop-in replacement — multi-model support",
        },
    }

    def show_docker(self):
        """Print the first 400 characters of the Dockerfile example."""
        print("=== Docker Deployment ===")
        print(self.DOCKER[:400])

    def show_ollama(self):
        """Print the first 300 characters of the Ollama Modelfile example."""
        print("\n=== Ollama Modelfile ===")
        print(self.OLLAMA_MODELFILE[:300])

    def show_options(self):
        """Print each serving option's display name and description."""
        print(f"\n=== Serving Options ===")
        for key, opt in self.SERVING_OPTIONS.items():
            print(f"  [{opt['name']}] {opt['description']}")

# Demo run: Docker snippet, Ollama Modelfile, then the serving options.
deploy = GGUFDeployment()
for section in (deploy.show_docker, deploy.show_ollama, deploy.show_options):
    section()

FAQ - คำถามที่พบบ่อย

Q: Q4_K_M กับ Q5_K_M อันไหนดีกว่า?

A: Q5_K_M: คุณภาพดีกว่าเล็กน้อย (perplexity ต่ำกว่า) แต่ใช้ RAM มากกว่า ~700MB · Q4_K_M: คุณภาพดีพอ ใช้ RAM น้อยกว่า เร็วกว่าเล็กน้อย · แนะนำ: Q4_K_M สำหรับ RAM จำกัด, Q5_K_M ถ้ามี RAM เหลือ · ความแตกต่างจริงๆ เล็กมากสำหรับ conversational use — เลือกตาม RAM ที่มี

Q: GGUF กับ GPTQ กับ AWQ ต่างกันอย่างไร?

A: GGUF: สำหรับ llama.cpp — CPU + GPU, single file, cross-platform GPTQ: สำหรับ GPU (CUDA) — AutoGPTQ, ExLlama — เร็วบน GPU AWQ: สำหรับ GPU — activation-aware quantization, คุณภาพดีกว่า GPTQ เลือก GGUF: ถ้าจะรัน CPU หรือ mixed CPU+GPU เลือก GPTQ/AWQ: ถ้าจะรัน GPU เท่านั้น (เร็วกว่า GGUF บน GPU)

Q: Model 7B ต้องใช้ RAM เท่าไหร่?

A: ขึ้นกับ quantization: Q4_K_M: ~4.1 GB model + ~1-2 GB context = 6-8 GB RAM Q5_K_M: ~4.8 GB + context = 7-9 GB Q8_0: ~7.2 GB + context = 9-11 GB กฎ: RAM ที่ต้องใช้ = model size + (context_length × 0.5-1 MB) 16 GB RAM: รัน 7B Q4_K_M สบายๆ 8 GB RAM: รัน 7B Q4_K_M ได้แต่ tight 32 GB RAM: รัน 13B Q4_K_M หรือ 7B Q8_0

Q: Quantize model เองดีกว่าโหลด pre-quantized ไหม?

A: โหลด pre-quantized: ง่าย เร็ว ไม่ต้อง setup — แนะนำสำหรับ models ยอดนิยม (HuggingFace มีเยอะ) Quantize เอง: เมื่อต้องการ quantization type เฉพาะ หรือ model ใหม่ที่ยังไม่มี pre-quantized ที่มา: TheBloke, bartowski, mradermacher บน HuggingFace — มี GGUF quantizations ครบทุก type

📖 บทความที่เกี่ยวข้อง

LLM Quantization GGUF Agile Scrum Kanban — อ่านบทความ → · LLM Quantization GGUF Cloud Native Design — อ่านบทความ → · LLM Quantization GGUF Site Reliability SRE — อ่านบทความ → · LLM Quantization GGUF CI CD Automation Pipeline — อ่านบทความ → · LLM Quantization GGUF High Availability HA Setup — อ่านบทความ →

📚 ดูบทความทั้งหมด →