Ollama Local LLM Troubleshooting แก้ปัญหา

Ollama Local LLM

Ollama รัน LLM บนเครื่องตัวเอง ง่ายเหมือน Docker สำหรับ AI Llama 3 Mistral Phi-3 Gemma CodeLlama Qwen REST API macOS Linux Windows

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Python SQLAlchemy Hexagonal Architecture

Troubleshooting แก้ปัญหา Out of Memory GPU ไม่ทำงาน ช้า Model Download API Integration

เนื้อหาเกี่ยวข้อง — อ่านต่อ: ตำแหน่ง ux ui คือ — ทุกสิ่งที่ต้องรู้ในปี 2026

Ollama Setup และปัญหาที่พบบ่อย

# === Ollama Setup & Troubleshooting ===

# 1. ติดตั้ง
# macOS: brew install ollama
# Linux: curl -fsSL https://ollama.com/install.sh | sh
# Windows: Download from ollama.com

# 2. รัน Model
# ollama run llama3
# ollama run mistral
# ollama run phi3
# ollama run codellama
# ollama run gemma2:9b

# 3. ดู Models ที่ติดตั้ง
# ollama list
# ollama show llama3

# 4. ลบ Model
# ollama rm llama3

# 5. Pull Model
# ollama pull llama3:8b-instruct-q5_K_M

# === Common Issues ===

# Issue 1: Out of Memory (OOM)
# Error: "not enough memory"
# Solution:
# export OLLAMA_NUM_PARALLEL=1
# ollama run llama3
# >>> /set parameter num_ctx 2048  # ลดจาก 4096 (ollama run ไม่มี flag --num-ctx)

# Issue 2: GPU Not Detected
# Check: nvidia-smi
# Check: ollama --version
# Solution:
# ตั้ง num_gpu=35 ผ่าน API options หรือ Modelfile (PARAMETER num_gpu 35)  # จำนวน Layers ที่ Offload ไป GPU
# export CUDA_VISIBLE_DEVICES=0

# Issue 3: Slow Inference
# Check tokens/s in output
# Solution:
# export OLLAMA_KEEP_ALIVE=60m  # Keep model in memory
# ใช้ quantized model: ollama run llama3:8b-instruct-q4_K_M

# Issue 4: Download Failed
# Error: "failed to pull model"
# Solution:
# export HTTPS_PROXY=https://proxy.example.com  # ถ้าอยู่หลัง Proxy (ตั้งให้ ollama serve)
# export OLLAMA_MODELS=/path/to/models  # เปลี่ยนที่เก็บ
# ollama pull llama3 --insecure  # ถ้า proxy มีปัญหา

# Issue 5: API Connection Refused
# Error: "connection refused"
# Solution:
# ollama serve  # Start server ก่อน
# curl http://localhost:11434/api/tags  # ทดสอบ

import subprocess
import json
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class OllamaIssue:
    """A single entry of the Ollama troubleshooting guide.

    Instances are plain records; the generated ``__init__`` takes the
    fields positionally in the order they are declared below.
    """

    # Short title of the problem, shown as the entry heading.
    name: str
    # Error text or symptom associated with the problem.
    error_msg: str
    # Explanation of why the problem occurs.
    cause: str
    # Remediation steps, one string per step.
    solution: List[str]
    # Severity label; the guide uses critical, high, medium, low.
    severity: str

# Catalogue of known Ollama problems, in the order they are reported.
# Each record is spelled out with keyword arguments so the meaning of every
# string is visible at the construction site.
issues = [
    OllamaIssue(
        name="Out of Memory",
        error_msg="not enough memory / killed",
        cause="Model ใหญ่เกินไปสำหรับ RAM ที่มี",
        solution=[
            "ใช้ Model เล็กลง (7B แทน 70B)",
            "ใช้ Quantized Q4_K_M",
            "ตั้ง num_ctx ต่ำลง",
            "ปิดแอปอื่นที่ใช้ RAM",
            "เพิ่ม Swap Space",
        ],
        severity="critical",
    ),
    OllamaIssue(
        name="GPU Not Detected",
        error_msg="running on CPU / no GPU found",
        cause="NVIDIA Driver ไม่ถูกต้อง หรือ CUDA ไม่รองรับ",
        solution=[
            "ตรวจ nvidia-smi",
            "อัปเดต NVIDIA Driver",
            "ตั้ง CUDA_VISIBLE_DEVICES=0",
            "อัปเดต Ollama เวอร์ชันล่าสุด",
        ],
        severity="high",
    ),
    OllamaIssue(
        name="Slow Inference",
        error_msg="< 5 tokens/second",
        cause="ใช้ CPU, Model ใหญ่, Context ยาว",
        solution=[
            "ใช้ GPU",
            "ใช้ Quantized Model",
            "ลด num_ctx",
            "ตั้ง OLLAMA_KEEP_ALIVE",
        ],
        severity="medium",
    ),
    OllamaIssue(
        name="Download Failed",
        error_msg="failed to pull model",
        cause="Network, Proxy, Disk Full",
        solution=[
            "ตรวจ Internet Connection",
            "ตั้ง Proxy ถ้าอยู่หลัง Firewall",
            "ตรวจ Disk Space",
            "เปลี่ยน OLLAMA_MODELS path",
        ],
        severity="high",
    ),
]

# Print the guide: a heading line, then one indented section per issue with
# its severity (upper-cased), error text, cause and bulleted solutions.
print("Ollama Troubleshooting Guide:")
for entry in issues:
    severity_tag = entry.severity.upper()
    print(f"\n  [{severity_tag}] {entry.name}")
    print(f"    Error: {entry.error_msg}")
    print(f"    Cause: {entry.cause}")
    print("    Solutions:")
    for step in entry.solution:
        print(f"      - {step}")

API Integration

# ollama_api.py — Ollama API Integration
import json

# Ollama REST API
# Base URL: http://localhost:11434

# 1. Generate (Completion)
# curl http://localhost:11434/api/generate -d '{
#   "model": "llama3",
#   "prompt": "Explain Docker in Thai",
#   "stream": false,
#   "options": {
#     "temperature": 0.7,
#     "num_ctx": 4096,
#     "num_predict": 512
#   }
# }'

# 2. Chat
# curl http://localhost:11434/api/chat -d '{
#   "model": "llama3",
#   "messages": [
#     {"role": "system", "content": "You are a helpful assistant"},
#     {"role": "user", "content": "Hello"}
#   ],
#   "stream": false
# }'

# 3. Embeddings
# curl http://localhost:11434/api/embeddings -d '{
#   "model": "nomic-embed-text",
#   "prompt": "Hello world"
# }'

# 4. List Models
# curl http://localhost:11434/api/tags

# Python Client
# pip install ollama

# import ollama
#
# # Chat
# response = ollama.chat(model='llama3', messages=[
#     {'role': 'user', 'content': 'สวัสดี อธิบาย Docker ให้หน่อย'}
# ])
# print(response['message']['content'])
#
# # Streaming
# for chunk in ollama.chat(model='llama3', messages=[
#     {'role': 'user', 'content': 'Hello'}
# ], stream=True):
#     print(chunk['message']['content'], end='', flush=True)
#
# # Embeddings
# embeddings = ollama.embeddings(model='nomic-embed-text',
#                                 prompt='Hello world')

# Model Recommendations
# Suggested model tags, grouped by use case and then by a label for the
# machine tier (or, for embeddings, a quality label). Sections are added one
# at a time; insertion order is the order in which they are printed.
models = {}

models["General Chat"] = {
    "4GB RAM": "phi3:3.8b-mini-4k-instruct-q4_K_M",
    "8GB RAM": "llama3:8b-instruct-q5_K_M",
    "16GB RAM": "llama3:8b-instruct-fp16",
    # NOTE(review): a 70B model at Q4 is usually larger than 32 GB on disk —
    # confirm this tier before relying on it.
    "32GB RAM": "llama3:70b-instruct-q4_K_M",
}

models["Coding"] = {
    "4GB RAM": "codegemma:2b",
    "8GB RAM": "codellama:7b-instruct-q5_K_M",
    "16GB RAM": "deepseek-coder:6.7b",
    "32GB RAM": "codellama:34b-instruct-q4_K_M",
}

models["Embeddings"] = {
    "Any": "nomic-embed-text (137MB)",
    "Better": "mxbai-embed-large (670MB)",
}

# Report: one bracketed heading per use case, then "<tier>: <model>" lines.
print("\nModel Recommendations by RAM:")
for category in models:
    print(f"\n  [{category}]")
    for tier, tag in models[category].items():
        print(f"    {tier}: {tag}")

Performance Tuning

# performance.py — Ollama Performance Tuning
# Tuning knobs for an Ollama setup, keyed by the tip's title. Every entry has
# the same three string fields:
#   "env"    - the setting to apply (printed under the "Config:" label)
#   "desc"   - what the setting does
#   "impact" - the effect this guide claims for it (not measured here)
performance_tips = {
    "GPU Offloading": {
        # The number of layers sent to the GPU is the model option `num_gpu`
        # (set via API "options" or a Modelfile PARAMETER), written here in
        # the same key=value form as `num_ctx` below. The previous value,
        # OLLAMA_GPU_LAYERS, is not a setting Ollama reads.
        "env": "num_gpu=35",
        "desc": "Offload Layers ไป GPU มากขึ้นเร็วขึ้น",
        "impact": "5-10x เร็วกว่า CPU",
    },
    "Keep Alive": {
        "env": "OLLAMA_KEEP_ALIVE=60m",
        "desc": "Keep Model ใน Memory ไม่ต้อง Load ใหม่ทุกครั้ง",
        "impact": "ลดเวลา First Token จาก 10s เหลือ 0.5s",
    },
    "Context Length": {
        "env": "num_ctx=2048",
        "desc": "ลด Context Length ใช้ RAM น้อยลง",
        "impact": "ลด RAM 30-50%",
    },
    "Parallel Requests": {
        "env": "OLLAMA_NUM_PARALLEL=2",
        "desc": "จำกัด Concurrent Requests",
        "impact": "ป้องกัน OOM เมื่อมีหลาย Requests",
    },
    "Quantization": {
        "env": "ใช้ Q4_K_M หรือ Q5_K_M",
        "desc": "ลดขนาด Model และเพิ่มความเร็ว",
        "impact": "ลดขนาด 60% เร็วขึ้น 30%",
    },
    "SSD Storage": {
        "env": "OLLAMA_MODELS=/ssd/ollama",
        "desc": "เก็บ Model บน SSD แทน HDD",
        "impact": "Load เร็วขึ้น 5-10x",
    },
}

# Report: a heading, then for each tip its bracketed title followed by the
# config line, the description and the claimed impact.
print("Ollama Performance Tuning:")
for tip, info in performance_tips.items():
    print(f"\n  [{tip}]")
    print(f"    Config: {info['env']}")
    print(f"    {info['desc']}")
    print(f"    Impact: {info['impact']}")

# Benchmark
# Throughput figures quoted by this guide, as "<n> t/s" strings. The table is
# laid out as rows of (hardware label, speeds) where the speeds line up with
# the model columns in _BENCH_MODELS; it is then folded into the nested
# mapping {hardware: {model: speed}}.
_BENCH_MODELS = ("llama3-8b-q4", "phi3-3.8b-q4")
_BENCH_ROWS = (
    ("CPU (i7-12700)", ("8 t/s", "15 t/s")),
    ("GPU (RTX 3060 12GB)", ("45 t/s", "80 t/s")),
    ("GPU (RTX 4090 24GB)", ("120 t/s", "200 t/s")),
    ("Apple M2 Pro", ("25 t/s", "45 t/s")),
    ("Apple M3 Max", ("50 t/s", "90 t/s")),
)
benchmark = {
    hardware: dict(zip(_BENCH_MODELS, speeds))
    for hardware, speeds in _BENCH_ROWS
}

# Report: a heading preceded by two blank lines, then each hardware label
# with its per-model speeds indented beneath it.
print("\n\nBenchmark (tokens/second):")
for machine in benchmark:
    print(f"  {machine}:")
    for model_name, rate in benchmark[machine].items():
        print(f"    {model_name}: {rate}")

เคล็ดลับ

GPU: ใช้ GPU เร็วกว่า CPU 5-10 เท่า ตรวจ nvidia-smi ก่อน
Q4_K_M: แนะนำ Quantization Level สมดุลขนาดและคุณภาพ
Keep Alive: ตั้ง OLLAMA_KEEP_ALIVE=60m ลดเวลา First Token
Monitor: ดู RAM GPU Usage ด้วย htop nvidia-smi ระหว่างรัน
SSD: เก็บ Models บน SSD ไม่ใช่ HDD
Update: อัปเดต Ollama เป็นเวอร์ชันล่าสุดเสมอ

Ollama คืออะไร

รัน LLM บนเครื่องตัวเอง ง่ายเหมือน Docker Llama 3 Mistral Phi-3 Gemma CodeLlama REST API macOS Linux Windows ติดตั้งง่าย

แนะนำเพิ่มเติม — เรียนเทรดกับ iCafeForex

เนื้อหาเกี่ยวข้อง — อ่านต่อ: Linux Namespaces Incident Management