SiamCafe · Blog
ONNX Runtime Home Lab Setup — รัน AI Model
บทความ

ONNX Runtime Home Lab Setup — รัน AI Model

เผยแพร่ 28 พฤษภาคม 2569

ONNX Runtime Home Lab

ONNX Runtime Home Lab Setup — รัน AI Model

ONNX Runtime Inference Engine Microsoft AI Model PyTorch TensorFlow GPU CPU Optimization Home Lab Production

Execution Provider | Hardware | Speed | Install
CPU (Default) | ทุก CPU | 1x (Baseline) | pip install onnxruntime
CUDA | NVIDIA GPU | 5-20x | pip install onnxruntime-gpu
TensorRT | NVIDIA GPU | 10-30x | onnxruntime-gpu + TensorRT
DirectML | AMD/Intel/NVIDIA GPU | 3-10x | pip install onnxruntime-directml
CoreML | Apple Silicon | 5-15x | pip install onnxruntime (macOS)

Model Conversion

# === Convert PyTorch to ONNX ===

# pip install onnx onnxruntime torch

import torch
import onnx
import onnxruntime as ort
import numpy as np

# Example: Convert a simple model
# class MyModel(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear1 = torch.nn.Linear(768, 256)
#         self.relu = torch.nn.ReLU()
#         self.linear2 = torch.nn.Linear(256, 10)
#     
#     def forward(self, x):
#         return self.linear2(self.relu(self.linear1(x)))
#
# model = MyModel()
# model.eval()
#
# # Create dummy input
# dummy_input = torch.randn(1, 768)
#
# # Export to ONNX
# torch.onnx.export(
#     model,
#     dummy_input,
#     "model.onnx",
#     input_names=["input"],
#     output_names=["output"],
#     dynamic_axes={
#         "input": {0: "batch_size"},
#         "output": {0: "batch_size"},
#     },
#     opset_version=17,
# )
#
# # Verify
# onnx_model = onnx.load("model.onnx")
# onnx.checker.check_model(onnx_model)
# print("Model is valid!")

# Hugging Face Optimum export
# pip install optimum[onnxruntime]
# optimum-cli export onnx --model bert-base-uncased bert-onnx/
# optimum-cli export onnx --model gpt2 gpt2-onnx/

from dataclasses import dataclass

@dataclass
class ConversionMethod:
    """One documented route for converting a model to the ONNX format.

    Instances are plain records of four strings; they are only constructed
    and printed, never validated or executed.
    """

    # Name of the source framework the model comes from (e.g. "PyTorch").
    framework: str
    # Name of the converter tool or function used for that framework.
    tool: str
    # Example invocation of the tool, shown verbatim to the reader.
    command: str
    # Free-form hint, such as an install command or a usage tip.
    notes: str

# Reference list of model-to-ONNX conversion routes, one entry per source
# framework. Each entry names the tool, an example invocation and a short note.
methods = [
    ConversionMethod(
        framework="PyTorch",
        tool="torch.onnx.export()",
        command="torch.onnx.export(model, dummy, 'model.onnx', opset_version=17)",
        notes="ตั้ง dynamic_axes สำหรับ Variable Batch",
    ),
    ConversionMethod(
        framework="TensorFlow",
        tool="tf2onnx",
        command="python -m tf2onnx.convert --saved-model ./model --output model.onnx",
        notes="pip install tf2onnx",
    ),
    ConversionMethod(
        framework="Hugging Face",
        tool="optimum",
        command="optimum-cli export onnx --model bert-base-uncased ./bert-onnx/",
        notes="รองรับ Transformers ทุก Model",
    ),
    ConversionMethod(
        framework="Scikit-learn",
        tool="skl2onnx",
        command="convert_sklearn(model, 'model', initial_types)",
        notes="pip install skl2onnx",
    ),
]

# Emit the report: a banner, then three indented lines per conversion route.
print("=== Conversion Methods ===")
for m in methods:
    print(
        f"  [{m.framework}] Tool: {m.tool}",
        f"    Command: {m.command}",
        f"    Notes: {m.notes}",
        sep="\n",
    )

Inference & Optimization

ONNX Runtime Home Lab Setup — รัน AI Model
# === ONNX Runtime Inference ===

# Basic Inference
# session = ort.InferenceSession("model.onnx",
#     providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# input_name = session.get_inputs()[0].name
# output_name = session.get_outputs()[0].name
#
# result = session.run([output_name], {input_name: input_data})

# Session Options for Optimization
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
#
# # Enable memory optimization
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# session = ort.InferenceSession("model.onnx",
#     sess_options=sess_options,
#     providers=["CUDAExecutionProvider"])

# Quantization (INT8)
# from onnxruntime.quantization import quantize_dynamic, QuantType
# quantize_dynamic("model.onnx", "model_int8.onnx",
#     weight_type=QuantType.QInt8)

@dataclass
class OptimizationTechnique:
    """One inference optimization option and its advertised trade-offs.

    All fields are display strings; nothing here is parsed or computed.
    """

    # Human-readable name of the technique.
    technique: str
    # Advertised speed-up, as free text (e.g. "1.5-2x").
    speedup: str
    # Advertised effect on model size, as free text.
    size_reduction: str
    # Advertised effect on model quality/accuracy, as free text.
    quality_impact: str
    # API call or setting that enables the technique, shown verbatim.
    command: str

# Reference list of ONNX Runtime optimization options with their advertised
# speed, size and quality trade-offs, plus the call or setting that enables
# each one. The command strings are shown to the reader, so they must name
# real APIs with valid argument usage.
techniques = [
    OptimizationTechnique(
        technique="Graph Optimization",
        speedup="1.5-2x",
        size_reduction="ไม่ลด",
        quality_impact="ไม่กระทบ (Lossless)",
        command="GraphOptimizationLevel.ORT_ENABLE_ALL",
    ),
    OptimizationTechnique(
        technique="Dynamic Quantization INT8",
        speedup="2-3x (CPU)",
        size_reduction="4x เล็กลง",
        quality_impact="น้อยมาก (-0.5% accuracy)",
        # The third positional parameter of quantize_dynamic is
        # op_types_to_quantize, so the weight type must be passed by keyword.
        command="quantize_dynamic(model, output, weight_type=QuantType.QInt8)",
    ),
    OptimizationTechnique(
        technique="Static Quantization INT8",
        speedup="2-4x (CPU)",
        size_reduction="4x เล็กลง",
        quality_impact="น้อย (-1% accuracy)",
        command="quantize_static(model, output, calibration_data)",
    ),
    OptimizationTechnique(
        technique="FP16 Conversion",
        speedup="1.5-2x (GPU)",
        size_reduction="2x เล็กลง",
        quality_impact="แทบไม่กระทบ",
        # Documented converter from the onnxconverter_common package
        # (from onnxconverter_common import float16).
        command="float16.convert_float_to_float16(model)",
    ),
    OptimizationTechnique(
        technique="TensorRT Provider",
        speedup="2-5x (vs CUDA)",
        size_reduction="ไม่ลด",
        quality_impact="ไม่กระทบ",
        command="providers=['TensorrtExecutionProvider']",
    ),
]

# Emit the report: a banner, then three indented lines per technique.
print("=== Optimization Techniques ===")
for o in techniques:
    print(
        f"  [{o.technique}] Speed: {o.speedup} | Size: {o.size_reduction}",
        f"    Quality: {o.quality_impact}",
        f"    Command: {o.command}",
        sep="\n",
    )

API Serving

# === FastAPI + ONNX Runtime Serving ===

# from fastapi import FastAPI
# import onnxruntime as ort
# import numpy as np
#
# app = FastAPI()
# session = ort.InferenceSession("model.onnx",
#     providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# @app.post("/predict")
# async def predict(data: dict):
#     input_data = np.array(data["input"], dtype=np.float32)
#     input_name = session.get_inputs()[0].name
#     result = session.run(None, {input_name: input_data})
#     return {"prediction": result[0].tolist()}
#
# # Run: uvicorn app:app --host 0.0.0.0 --port 8000

# Docker Deployment
# FROM python:3.11-slim
# RUN pip install onnxruntime fastapi uvicorn numpy
# COPY model.onnx /app/
# COPY app.py /app/
# WORKDIR /app
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

@dataclass
class ServingConfig:
    """One recommended setting for serving a model behind an API.

    All fields are display strings; nothing here is applied to a server.
    """

    # Name of the setting (e.g. "Workers").
    setting: str
    # Recommended value, as free text.
    value: str
    # Short explanation of why the setting matters.
    purpose: str

# Reference list of recommended serving settings, each with a suggested value
# and the reason for it.
configs = [
    ServingConfig(
        setting="Workers",
        value="4 (CPU cores)",
        purpose="จำนวน Worker Process สำหรับ Parallel Requests",
    ),
    ServingConfig(
        setting="Batch Size",
        value="1-32",
        purpose="รวม Request เป็น Batch เพิ่ม Throughput",
    ),
    ServingConfig(
        setting="Model Warm-up",
        value="เรียก Inference 1 ครั้งตอน Start",
        purpose="ลด Cold Start Latency ครั้งแรก",
    ),
    ServingConfig(
        setting="Health Check",
        value="/health endpoint",
        purpose="ตรวจว่า Model โหลดสำเร็จ พร้อม Serve",
    ),
    ServingConfig(
        setting="Monitoring",
        value="Prometheus metrics",
        purpose="วัด Latency Throughput Error Rate",
    ),
]

# Emit the report: a banner, then two indented lines per setting.
print("=== Serving Config ===")
for c in configs:
    print(
        f"  [{c.setting}] {c.value}",
        f"    Purpose: {c.purpose}",
        sep="\n",
    )

เคล็ดลับ

  • ONNX: แปลงทุก Model เป็น ONNX ก่อน Deploy เร็วกว่า Native
  • Quantize: ใช้ INT8 Quantization สำหรับ CPU ลดขนาด 4x เร็วขึ้น 2-3x
  • TensorRT: ใช้ TensorRT Provider สำหรับ NVIDIA GPU เร็วที่สุด
  • Dynamic: ตั้ง Dynamic Axes สำหรับ Variable Input Size
  • Warm-up: เรียก Inference 1 ครั้งตอน Start ลด Cold Start

ONNX Runtime คืออะไร

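ONNX Runtime คือ Inference Engine จาก Microsoft สำหรับรัน Model ในรูปแบบ ONNX Format ที่แปลงมาจาก PyTorch หรือ TensorFlow รองรับทั้ง CPU และ GPU ผ่าน CUDA, TensorRT, DirectML และ CoreML เร็วกว่าเดิมประมาณ 2-5x ใช้งานได้ทั้งจาก Python และ C++ เหมาะสำหรับ Production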

อ่านเพิ่ม: WebAssembly (Wasm) คืออะไร? เปิดโลก High-Performance Web App · อ่านเพิ่ม: Redis คืออะไร? สอน Caching ตั้งแต่ In-Memory Store Session Q · อ่านเพิ่ม: Edge Computing คืออะไร? สอนสร้าง Edge Application ด้วย Cloud