ai
Ollama Local LLM Site Reliability SRE — จัดการ
Ollama LLM SRE

Ollama Local LLM Site Reliability SRE GPU Monitoring Auto-scaling Incident Response Model Management Token Throughput Production Operations
เนื้อหาเกี่ยวข้อง — แนะนำให้อ่าน REST API Design Testing Strategy QA
| Metric | SLI | SLO Target | Alert Threshold | Tool |
|---|---|---|---|---|
| Inference Latency p99 | Response time | < 5s (simple) / < 30s (complex) | > 10s / > 45s | Prometheus histogram |
| Token Throughput | Tokens/second | > 30 tok/s per user | < 15 tok/s | Custom metric |
| Error Rate | 5xx / total | < 0.5% | > 1% | Prometheus counter |
| GPU Memory | VRAM usage % | < 85% | > 90% | nvidia-smi exporter |
| GPU Temperature | Celsius | < 75°C | > 80°C | nvidia-smi exporter |
| Availability | Uptime % | 99.9% | Any downtime | Blackbox exporter |
Installation and Setup
# === Ollama Production Setup ===
# Install Ollama
# curl -fsSL https://ollama.com/install.sh | sh
# Pull models
# ollama pull llama3
# ollama pull mistral
# ollama pull codellama
# ollama pull gemma:7b
# Systemd service (production)
# /etc/systemd/system/ollama.service
# [Unit]
# Description=Ollama LLM Server
# After=network.target
#
# [Service]
# Type=simple
# User=ollama
# Environment="OLLAMA_HOST=0.0.0.0:11434"
# Environment="OLLAMA_NUM_PARALLEL=4"
# Environment="OLLAMA_MAX_LOADED_MODELS=2"
# Environment="NVIDIA_VISIBLE_DEVICES=all"
# ExecStart=/usr/local/bin/ollama serve
# Restart=always
# RestartSec=5
# LimitNOFILE=65535
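# # NOTE: WatchdogSec only works if the service itself sends sd_notify "WATCHDOG=1"
# # keep-alives; without them systemd treats the service as hung after 120s and
# # restarts it. Confirm the Ollama build supports this before enabling.
# WatchdogSec=120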
#
# [Install]
# WantedBy=multi-user.target
# Nginx reverse proxy
# upstream ollama {
# server 127.0.0.1:11434;
# keepalive 32;
# }
# server {
# listen 443 ssl;
# server_name llm.internal.com;
# location / {
# proxy_pass http://ollama;
# proxy_read_timeout 300s;
# proxy_send_timeout 300s;
# proxy_set_header Host $host;
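# # Requires a matching "limit_req_zone <key> zone=llm_limit:<size> rate=<rate>;" in the http block.
# limit_req zone=llm_limit burst=10 nodelay;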
# }
# }
from dataclasses import dataclass
@dataclass
class ModelConfig:
    """One entry of the model catalog printed below.

    Every field is a free-form display string; nothing is parsed
    numerically.
    """

    model: str     # model tag, e.g. "llama3:8b"
    size: str      # model size, e.g. "4.7GB"
    vram: str      # VRAM figure, e.g. "6GB VRAM"
    speed: str     # approximate tokens/second with the GPU it was quoted for
    use_case: str  # short label for the intended workload


# Catalog of models shown in the report; order is the print order.
models = [
    ModelConfig("llama3:8b", "4.7GB", "6GB VRAM", "~40 tok/s (RTX 3090)", "General purpose"),
    ModelConfig("llama3:70b", "40GB", "48GB VRAM", "~10 tok/s (A100)", "Complex reasoning"),
    ModelConfig("mistral:7b", "4.1GB", "5GB VRAM", "~45 tok/s (RTX 3090)", "Fast general"),
    ModelConfig("codellama:13b", "7.4GB", "10GB VRAM", "~25 tok/s (RTX 3090)", "Code generation"),
    ModelConfig("gemma:7b", "5.0GB", "6GB VRAM", "~35 tok/s (RTX 3090)", "Google model"),
    ModelConfig("phi3:mini", "2.3GB", "3GB VRAM", "~60 tok/s (RTX 3090)", "Small fast model"),
]

# Print two lines per model: identity/footprint, then speed/use case.
print("=== Model Catalog ===")
for m in models:
    print(f" [{m.model}] Size: {m.size} | VRAM: {m.vram}")
    print(f" Speed: {m.speed} | Use: {m.use_case}")
Monitoring Stack

# === GPU and LLM Monitoring ===
# Prometheus nvidia-smi exporter
# docker run -d --gpus all -p 9835:9835 \
# utkuozdemir/nvidia_gpu_exporter
# Custom Ollama metrics exporter (Python)
# import prometheus_client as prom
# import requests, time
#
# INFERENCE_LATENCY = prom.Histogram(
# 'ollama_inference_seconds',
# 'Inference latency', ['model'],
# buckets=[0.5, 1, 2, 5, 10, 20, 30, 60])
#
# TOKENS_GENERATED = prom.Counter(
# 'ollama_tokens_total',
# 'Total tokens generated', ['model', 'type'])
#
# ACTIVE_REQUESTS = prom.Gauge(
# 'ollama_active_requests',
# 'Currently processing requests')
#
# def track_inference(model, prompt):
# ACTIVE_REQUESTS.inc()
# start = time.time()
# response = requests.post('http://localhost:11434/api/generate',
# json={"model": model, "prompt": prompt, "stream": False})
# duration = time.time() - start
# data = response.json()
# INFERENCE_LATENCY.labels(model=model).observe(duration)
# TOKENS_GENERATED.labels(model=model, type='prompt').inc(
# data.get('prompt_eval_count', 0))
# TOKENS_GENERATED.labels(model=model, type='response').inc(
# data.get('eval_count', 0))
# ACTIVE_REQUESTS.dec()
# return data
@dataclass
class AlertRule:
    """One alerting rule shown in the alert report below.

    Every field is a plain string; the expression is only printed here,
    never evaluated.
    """

    alert: str     # human-readable alert name
    expr: str      # PromQL expression that fires the alert
    duration: str  # how long the condition must hold, e.g. "5m" (printed as "For:")
    severity: str  # "warning" or "critical" in the rules below
    action: str    # first-response guidance for whoever receives the alert


# NOTE(review): GPU metric names depend on the GPU exporter in use —
# confirm them against that exporter's /metrics output.
alerts = [
    AlertRule("GPU Memory High",
              "nvidia_gpu_memory_used_bytes / nvidia_gpu_memory_total_bytes > 0.9",
              "5m", "warning",
              "Check loaded models, consider smaller model or restart"),
    AlertRule("GPU Temperature Critical",
              "nvidia_gpu_temperature_celsius > 80",
              "2m", "critical",
              "Reduce workload immediately, check cooling"),
    # histogram_quantile must be fed per-second bucket rates over a window.
    # Raw cumulative *_bucket counters would yield the quantile over the
    # whole process lifetime, so the alert would barely react to a recent
    # latency regression.
    AlertRule("High Inference Latency",
              "histogram_quantile(0.99, rate(ollama_inference_seconds_bucket[5m])) > 30",
              "5m", "warning",
              "Check GPU util, queue length, consider scaling"),
    AlertRule("Ollama Down",
              "up{job='ollama'} == 0",
              "1m", "critical",
              "Restart service, check logs, page on-call"),
    AlertRule("High Error Rate",
              "rate(ollama_errors_total[5m]) / rate(ollama_requests_total[5m]) > 0.01",
              "5m", "warning",
              "Check OOM, timeout, model loading issues"),
]

# Print three lines per rule: severity/name, expression, hold time/action.
print("\n=== Alert Rules ===")
for a in alerts:
    print(f" [{a.severity.upper()}] {a.alert}")
    print(f" Expr: {a.expr}")
    print(f" For: {a.duration} | Action: {a.action}")
Incident Runbooks
# === SRE Runbooks ===
@dataclass
class Runbook:
    """One incident runbook shown in the runbook report below.

    Every field is free-form text that is printed verbatim.
    """

    incident: str    # incident title
    symptoms: str    # what is observed when the incident occurs
    diagnosis: str   # commands or checks used to confirm the cause
    fix: str         # numbered remediation steps, in one string
    prevention: str  # measures that reduce the chance of recurrence


# Runbooks in print order. The strings are runtime output and stay as
# authored, including the mixed Thai/English diagnosis text.
runbooks = [
    Runbook("OOM — Out of Memory",
            "GPU memory > 95%, requests failing with OOM error",
            "nvidia-smi ดู VRAM usage, ollama ps ดู loaded models",
            "1. ollama stop unused_model 2. Restart ollama 3. Use smaller model 4. Reduce OLLAMA_NUM_PARALLEL",
            "Set OLLAMA_MAX_LOADED_MODELS=2, monitor VRAM usage"),
    Runbook("High Latency",
            "p99 latency > 30s, users complaining of slow response",
            "Check GPU util (nvidia-smi), queue length, concurrent requests",
            "1. Rate limit requests 2. Scale horizontally 3. Use faster model 4. Reduce max_tokens",
            "Set rate limiting in Nginx, auto-scale based on queue length"),
    Runbook("Service Crash",
            "Ollama process not running, systemd restart loop",
            "journalctl -u ollama, check GPU driver, disk space",
            "1. Check logs 2. nvidia-smi (driver OK?) 3. df -h (disk) 4. systemctl restart ollama",
            "Watchdog in systemd, health check endpoint, auto-restart"),
    Runbook("Model Load Failure",
            "Model not responding, timeout on first request after restart",
            "ollama list, check disk space, check model integrity",
            "1. ollama rm model 2. ollama pull model 3. Check ~/.ollama/models disk",
            "Pre-pull models, verify checksums, monitor disk usage"),
    Runbook("GPU Temperature",
            "Temperature > 80°C, thermal throttling",
            "nvidia-smi -q -d TEMPERATURE, check fan speed",
            "1. Reduce concurrent requests 2. Increase fan speed 3. Check airflow 4. Clean dust",
            "Temperature alerts, regular cleaning, proper rack cooling"),
]

# Print five lines per runbook: title, then one line per remaining field.
print("=== Incident Runbooks ===")
for r in runbooks:
    print(f" [{r.incident}]")
    print(f" Symptoms: {r.symptoms}")
    print(f" Diagnosis: {r.diagnosis}")
    print(f" Fix: {r.fix}")
    print(f" Prevention: {r.prevention}")
เคล็ดลับ
- VRAM: ดู VRAM ก่อนโหลด Model ใช้ ollama ps ดู Model ที่โหลดอยู่
- Parallel: ตั้ง OLLAMA_NUM_PARALLEL ตาม VRAM ที่มี ไม่ให้ OOM
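- Watchdog: ใช้ systemd WatchdogSec ตรวจ Health อัตโนมัติ (ใช้ได้เฉพาะเมื่อ Service ส่ง sd_notify WATCHDOG=1 เป็นระยะ มิฉะนั้น systemd จะถือว่า Service ค้างและ Restart ให้)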
- Rate Limit: ตั้ง Rate Limit ใน Nginx ป้องกัน Overload
- Backup: เก็บ Modelfile และ Config ใน Git สำหรับ Disaster Recovery
Ollama คืออะไร
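Ollama เป็นเครื่องมือ Open Source สำหรับรัน Local LLM เช่น Llama, Mistral, Gemma, Phi และ CodeLlama ใช้งานผ่านคำสั่ง ollama run หรือ REST API รองรับ GPU ทั้ง CUDA และ Metal ปรับแต่ง Model แบบ Custom ด้วย Modelfile ได้ เหมาะทั้งงาน Development และ Production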
แนะนำเพิ่มเติม — อ่านเพิ่มเติมที่ SiamCafeBook
เนื้อหาเกี่ยวข้อง — อ่านต่อ: Skaffold Dev Code Review Best Practice
เนื้อหาเกี่ยวข้อง — Sto คืออะไร — คู่มือฉบับสมบูรณ์ 2026





