Ollama LLM SRE
Running Ollama as a production local-LLM service: GPU monitoring, auto-scaling, incident response, model management, and token-throughput SLOs.
| Metric | SLI | SLO Target | Alert Threshold | Tool |
|---|---|---|---|---|
| Inference Latency p99 | Response time | < 5s (simple) < 30s (complex) | > 10s / > 45s | Prometheus histogram |
| Token Throughput | Tokens/second | > 30 tok/s per user | < 15 tok/s | Custom metric |
| Error Rate | 5xx / total | < 0.5% | > 1% | Prometheus counter |
| GPU Memory | VRAM usage % | < 85% | > 90% | nvidia-smi exporter |
| GPU Temperature | Celsius | < 75°C | > 80°C | nvidia-smi exporter |
| Availability | Uptime % | 99.9% | Any downtime | Blackbox exporter |
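The Token Throughput SLI in the table can be computed directly from Ollama's non-streaming `/api/generate` response, which reports `eval_count` (generated tokens) and `eval_duration` (nanoseconds spent generating). A minimal sketch:

```python
# Token-throughput SLI from an Ollama /api/generate response.
# eval_count and eval_duration are real fields in Ollama's response;
# eval_duration is in nanoseconds, so tokens/second is a simple ratio.

def tokens_per_second(response: dict) -> float:
    """Compute the token-throughput SLI; 0.0 if nothing was generated."""
    eval_count = response.get("eval_count", 0)
    eval_duration_ns = response.get("eval_duration", 0)
    if eval_duration_ns <= 0:
        return 0.0
    return eval_count / (eval_duration_ns / 1e9)

# Example payload shaped like a non-streaming Ollama response:
sample = {"eval_count": 120, "eval_duration": 4_000_000_000}  # 4 seconds
print(f"{tokens_per_second(sample):.1f} tok/s")  # 30.0 tok/s, right on the SLO
```

Export this value as a gauge (or feed it into a histogram) per model to drive the `> 30 tok/s` SLO and the `< 15 tok/s` alert.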
Installation and Setup
# === Ollama Production Setup ===
# Install Ollama
# curl -fsSL https://ollama.com/install.sh | sh
# Pull models
# ollama pull llama3
# ollama pull mistral
# ollama pull codellama
# ollama pull gemma:7b
# Systemd service (production)
# /etc/systemd/system/ollama.service
# [Unit]
# Description=Ollama LLM Server
# After=network.target
#
# [Service]
# Type=simple
# User=ollama
# Environment="OLLAMA_HOST=0.0.0.0:11434"
# Environment="OLLAMA_NUM_PARALLEL=4"
# Environment="OLLAMA_MAX_LOADED_MODELS=2"
# Environment="NVIDIA_VISIBLE_DEVICES=all"
# ExecStart=/usr/local/bin/ollama serve
# Restart=always
# RestartSec=5
# LimitNOFILE=65535
# # WatchdogSec only works if the binary sends sd_notify watchdog pings;
# # remove it if systemd kills the service every interval.
# WatchdogSec=120
#
# [Install]
# WantedBy=multi-user.target
# Nginx reverse proxy
# # limit_req below requires a zone defined in the http {} block:
# limit_req_zone $binary_remote_addr zone=llm_limit:10m rate=10r/s;
# upstream ollama {
# server 127.0.0.1:11434;
# keepalive 32;
# }
# server {
# listen 443 ssl;
# server_name llm.internal.com;
# location / {
# proxy_pass http://ollama;
# proxy_http_version 1.1;
# proxy_set_header Connection "";  # required for upstream keepalive
# proxy_read_timeout 300s;
# proxy_send_timeout 300s;
# proxy_set_header Host $host;
# limit_req zone=llm_limit burst=10 nodelay;
# }
# }
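The table above lists a blackbox exporter for availability. A probe against Ollama's `GET /api/tags` endpoint (a real Ollama route that lists local models) can be classified with logic like this sketch; the latency threshold is an illustrative assumption, not an Ollama default:

```python
# Health classification a blackbox-style probe could apply to Ollama.
# classify_health and the 2.0s degraded threshold are illustrative
# assumptions for this sketch.

def classify_health(status_code: int, latency_s: float,
                    degraded_after_s: float = 2.0) -> str:
    """Map a probe result (HTTP status + latency) to a coarse health state."""
    if status_code != 200:
        return "down"
    if latency_s > degraded_after_s:
        return "degraded"
    return "healthy"

print(classify_health(200, 0.1))  # healthy
print(classify_health(200, 3.5))  # degraded
print(classify_health(503, 0.1))  # down
```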
from dataclasses import dataclass
@dataclass
class ModelConfig:
model: str
size: str
vram: str
speed: str
use_case: str
models = [
ModelConfig("llama3:8b", "4.7GB", "6GB VRAM", "~40 tok/s (RTX 3090)", "General purpose"),
ModelConfig("llama3:70b", "40GB", "48GB VRAM", "~10 tok/s (A100)", "Complex reasoning"),
ModelConfig("mistral:7b", "4.1GB", "5GB VRAM", "~45 tok/s (RTX 3090)", "Fast general"),
ModelConfig("codellama:13b", "7.4GB", "10GB VRAM", "~25 tok/s (RTX 3090)", "Code generation"),
ModelConfig("gemma:7b", "5.0GB", "6GB VRAM", "~35 tok/s (RTX 3090)", "Google model"),
ModelConfig("phi3:mini", "2.3GB", "3GB VRAM", "~60 tok/s (RTX 3090)", "Small fast model"),
]
print("=== Model Catalog ===")
for m in models:
print(f" [{m.model}] Size: {m.size} | VRAM: {m.vram}")
print(f" Speed: {m.speed} | Use: {m.use_case}")
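A practical use of the catalog is picking the largest model that still fits in free VRAM. The VRAM figures below mirror the catalog above; the selection helper itself is an illustrative sketch, not an Ollama feature:

```python
# Pick the most capable (largest-VRAM) model that fits in free VRAM.
# CATALOG_VRAM_GB mirrors the model catalog; largest_fitting_model
# is an illustrative helper.
from typing import Optional

CATALOG_VRAM_GB = {
    "llama3:70b": 48, "codellama:13b": 10, "llama3:8b": 6,
    "gemma:7b": 6, "mistral:7b": 5, "phi3:mini": 3,
}

def largest_fitting_model(free_vram_gb: float) -> Optional[str]:
    """Return the most VRAM-hungry model that fits, or None if nothing does."""
    fitting = [(v, m) for m, v in CATALOG_VRAM_GB.items() if v <= free_vram_gb]
    return max(fitting)[1] if fitting else None

print(largest_fitting_model(48))  # llama3:70b
print(largest_fitting_model(8))   # a 6 GB model (llama3:8b / gemma:7b)
print(largest_fitting_model(2))   # None -- nothing fits
```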
Monitoring Stack
# === GPU and LLM Monitoring ===
# Prometheus nvidia-smi exporter
# docker run -d --gpus all -p 9835:9835 \
# utkuozdemir/nvidia_gpu_exporter
# Custom Ollama metrics exporter (Python)
# import prometheus_client as prom
# import requests, time
#
# INFERENCE_LATENCY = prom.Histogram(
# 'ollama_inference_seconds',
# 'Inference latency', ['model'],
# buckets=[0.5, 1, 2, 5, 10, 20, 30, 60])
#
# TOKENS_GENERATED = prom.Counter(
# 'ollama_tokens_total',
# 'Total tokens generated', ['model', 'type'])
#
# ACTIVE_REQUESTS = prom.Gauge(
# 'ollama_active_requests',
# 'Currently processing requests')
#
# def track_inference(model, prompt):
#     ACTIVE_REQUESTS.inc()
#     start = time.time()
#     try:
#         response = requests.post('http://localhost:11434/api/generate',
#             json={"model": model, "prompt": prompt, "stream": False},
#             timeout=300)
#         response.raise_for_status()
#         data = response.json()
#     finally:
#         ACTIVE_REQUESTS.dec()  # decrement even on errors/timeouts
#     INFERENCE_LATENCY.labels(model=model).observe(time.time() - start)
#     TOKENS_GENERATED.labels(model=model, type='prompt').inc(
#         data.get('prompt_eval_count', 0))
#     TOKENS_GENERATED.labels(model=model, type='response').inc(
#         data.get('eval_count', 0))
#     return data
@dataclass
class AlertRule:
alert: str
expr: str
duration: str
severity: str
action: str
alerts = [
AlertRule("GPU Memory High",
"nvidia_gpu_memory_used_bytes / nvidia_gpu_memory_total_bytes > 0.9",
"5m", "warning",
"Check loaded models, consider smaller model or restart"),
AlertRule("GPU Temperature Critical",
"nvidia_gpu_temperature_celsius > 80",
"2m", "critical",
"Reduce workload immediately, check cooling"),
AlertRule("High Inference Latency",
"histogram_quantile(0.99, sum(rate(ollama_inference_seconds_bucket[5m])) by (le)) > 30",
"5m", "warning",
"Check GPU util, queue length, consider scaling"),
AlertRule("Ollama Down",
"up{job='ollama'} == 0",
"1m", "critical",
"Restart service, check logs, page on-call"),
AlertRule("High Error Rate",
"rate(ollama_errors_total[5m]) / rate(ollama_requests_total[5m]) > 0.01",
"5m", "warning",
"Check OOM, timeout, model loading issues"),
]
print("\n=== Alert Rules ===")
for a in alerts:
print(f" [{a.severity.upper()}] {a.alert}")
print(f" Expr: {a.expr}")
print(f" For: {a.duration} | Action: {a.action}")
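These AlertRule entries can be rendered into Prometheus alerting-rules YAML. The template below follows the standard rule-file schema (groups → rules → alert/expr/for/labels/annotations); the helper and group name are my own sketch, not part of any library:

```python
# Render an alert definition into a Prometheus alerting-rule entry.
# to_prometheus_rule is an illustrative helper.

def to_prometheus_rule(alert: str, expr: str, duration: str,
                       severity: str, action: str) -> str:
    # Prometheus alert names must be valid identifiers: strip spaces/dashes.
    name = alert.title().replace(" ", "").replace("\u2014", "")
    return (
        f"  - alert: {name}\n"
        f"    expr: {expr}\n"
        f"    for: {duration}\n"
        f"    labels:\n"
        f"      severity: {severity}\n"
        f"    annotations:\n"
        f"      runbook: \"{action}\"\n"
    )

rule = to_prometheus_rule(
    "Ollama Down", "up{job='ollama'} == 0", "1m", "critical",
    "Restart service, check logs, page on-call")
print("groups:\n- name: ollama\n  rules:\n" + rule)
```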
Incident Runbooks
# === SRE Runbooks ===
@dataclass
class Runbook:
incident: str
symptoms: str
diagnosis: str
fix: str
prevention: str
runbooks = [
Runbook("OOM — Out of Memory",
"GPU memory > 95%, requests failing with OOM error",
"nvidia-smi to check VRAM usage, ollama ps to list loaded models",
"1. ollama stop unused_model 2. Restart ollama 3. Use smaller model 4. Reduce OLLAMA_NUM_PARALLEL",
"Set OLLAMA_MAX_LOADED_MODELS=2, monitor VRAM usage"),
Runbook("High Latency",
"p99 latency > 30s, users complaining of slow response",
"Check GPU util (nvidia-smi), queue length, concurrent requests",
"1. Rate limit requests 2. Scale horizontally 3. Use faster model 4. Reduce max_tokens",
"Set rate limiting in Nginx, auto-scale based on queue length"),
Runbook("Service Crash",
"Ollama process not running, systemd restart loop",
"journalctl -u ollama, check GPU driver, disk space",
"1. Check logs 2. nvidia-smi (driver OK?) 3. df -h (disk) 4. systemctl restart ollama",
"Watchdog in systemd, health check endpoint, auto-restart"),
Runbook("Model Load Failure",
"Model not responding, timeout on first request after restart",
"ollama list, check disk space, check model integrity",
"1. ollama rm model 2. ollama pull model 3. Check ~/.ollama/models disk",
"Pre-pull models, verify checksums, monitor disk usage"),
Runbook("GPU Temperature",
"Temperature > 80°C, thermal throttling",
"nvidia-smi -q -d TEMPERATURE, check fan speed",
"1. Reduce concurrent requests 2. Increase fan speed 3. Check airflow 4. Clean dust",
"Temperature alerts, regular cleaning, proper rack cooling"),
]
print("=== Incident Runbooks ===")
for r in runbooks:
print(f" [{r.incident}]")
print(f" Symptoms: {r.symptoms}")
print(f" Diagnosis: {r.diagnosis}")
print(f" Fix: {r.fix}")
print(f" Prevention: {r.prevention}")
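A small triage helper can index these runbooks by symptom keyword, so an on-call script or chat-ops bot surfaces the right one. The keyword map below is an illustrative assumption built from the runbook symptoms above:

```python
# Map symptom keywords to runbook titles; find_runbook and the
# keyword index are illustrative sketches.

RUNBOOK_INDEX = {
    "oom": "OOM - Out of Memory",
    "memory": "OOM - Out of Memory",
    "latency": "High Latency",
    "slow": "High Latency",
    "crash": "Service Crash",
    "restart": "Service Crash",
    "load": "Model Load Failure",
    "temperature": "GPU Temperature",
    "thermal": "GPU Temperature",
}

def find_runbook(symptom: str) -> str:
    """Return the first runbook whose keyword appears in the symptom text."""
    text = symptom.lower()
    for keyword, runbook in RUNBOOK_INDEX.items():
        if keyword in text:
            return runbook
    return "No matching runbook - escalate to on-call"

print(find_runbook("requests failing with OOM error"))  # OOM - Out of Memory
print(find_runbook("p99 is slow since this morning"))   # High Latency
```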
Tips
- VRAM: check free VRAM before loading a model; use ollama ps to see which models are loaded
- Parallel: size OLLAMA_NUM_PARALLEL to the available VRAM so concurrency doesn't trigger OOM
- Watchdog: systemd WatchdogSec can auto-restart a hung service (the binary must send watchdog pings)
- Rate Limit: configure rate limiting in Nginx to protect against overload
- Backup: keep Modelfiles and configs in Git for disaster recovery
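The Parallel tip can be made concrete: model weights load once, and each parallel slot adds roughly one KV cache. The per-slot KV-cache size below (2 GB) and the 85% headroom are rough assumptions for this sketch; measure your own model/context combination before relying on them:

```python
# Rough sizing for OLLAMA_NUM_PARALLEL. The 2 GB-per-slot KV-cache
# estimate and 85% VRAM headroom are illustrative assumptions.

def safe_num_parallel(total_vram_gb: float, model_vram_gb: float,
                      kv_cache_gb_per_slot: float = 2.0,
                      headroom: float = 0.85) -> int:
    """How many parallel slots fit while staying under the VRAM headroom."""
    budget = total_vram_gb * headroom - model_vram_gb
    if budget <= 0:
        return 0  # model alone exceeds the budget
    return int(budget // kv_cache_gb_per_slot)

# 24 GB card (e.g. RTX 3090) running llama3:8b (~6 GB VRAM):
print(safe_num_parallel(24, 6))  # 7
```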
What is Ollama?
Ollama is an open-source runtime for local LLMs such as Llama, Mistral, Gemma, Phi, and CodeLlama. It offers ollama run on the CLI and a REST API, accelerates inference on GPUs via CUDA or Metal, and supports custom models through a Modelfile, from development through production.
How does SRE for LLMs differ from regular SRE?
On top of the usual service metrics you also manage GPU memory (VRAM), model loading, inference latency, token throughput, cost per token, model versions, GPU temperature, and power stability.
What should you monitor?
GPU utilization (aim for 60-80%), VRAM usage (alert above 90%), GPU temperature (alert above 80°C), p99 inference latency, token throughput, request queue length, model load time, and error rate.
How do you respond to incidents?
OOM: reduce batch size or switch to a smaller model. High latency: scale out or rate-limit. Model load failure: check disk space. GPU overheating: fix cooling. Service down: let systemd restart it.
Summary
Running Ollama in production is SRE for local LLMs: GPU monitoring (VRAM, temperature), token-throughput SLOs, Prometheus and Grafana dashboards, alert rules, and runbooks for incident response.
