
ONNX Runtime Home Lab Setup

2026-04-24 · Ajarn Bom · SiamCafe.net · 9,023 words


Tags: ONNX Runtime, Inference Engine, Microsoft, AI Model, PyTorch, TensorFlow, GPU, CPU, Optimization, Home Lab, Production

Execution Provider | Hardware | Speed | Install
CPU (Default) | Any CPU | 1x (baseline) | pip install onnxruntime
CUDA | NVIDIA GPU | 5-20x | pip install onnxruntime-gpu
TensorRT | NVIDIA GPU | 10-30x | onnxruntime-gpu + TensorRT
DirectML | AMD/Intel/NVIDIA GPU | 3-10x | pip install onnxruntime-directml
CoreML | Apple Silicon | 5-15x | pip install onnxruntime (macOS)
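
Before installing anything else, it helps to confirm which execution providers your onnxruntime build actually exposes. A minimal check (a sketch; "model.onnx" is a placeholder path):

# === Check available execution providers ===
import onnxruntime as ort

# Lists what this build can use, e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']
print(ort.get_available_providers())

# Providers are tried in the order given; CPUExecutionProvider is the safe fallback
# session = ort.InferenceSession("model.onnx",
#     providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
# print(session.get_providers())  # providers the session actually selected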

Model Conversion

# === Convert PyTorch to ONNX ===

# pip install onnx onnxruntime torch

import torch
import onnx
import onnxruntime as ort
import numpy as np

# Example: Convert a simple model
# class MyModel(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear1 = torch.nn.Linear(768, 256)
#         self.relu = torch.nn.ReLU()
#         self.linear2 = torch.nn.Linear(256, 10)
#     
#     def forward(self, x):
#         return self.linear2(self.relu(self.linear1(x)))
#
# model = MyModel()
# model.eval()
#
# # Create dummy input
# dummy_input = torch.randn(1, 768)
#
# # Export to ONNX
# torch.onnx.export(
#     model,
#     dummy_input,
#     "model.onnx",
#     input_names=["input"],
#     output_names=["output"],
#     dynamic_axes={
#         "input": {0: "batch_size"},
#         "output": {0: "batch_size"},
#     },
#     opset_version=17,
# )
#
# # Verify
# onnx_model = onnx.load("model.onnx")
# onnx.checker.check_model(onnx_model)
# print("Model is valid!")

# Hugging Face Optimum export
# pip install optimum[onnxruntime]
# optimum-cli export onnx --model bert-base-uncased bert-onnx/
# optimum-cli export onnx --model gpt2 gpt2-onnx/
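
# Loading the exported model back through Optimum (a sketch; assumes the
# bert-onnx/ export above succeeded and that feature-extraction is the exported task)
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForFeatureExtraction
#
# tokenizer = AutoTokenizer.from_pretrained("bert-onnx/")
# ort_model = ORTModelForFeatureExtraction.from_pretrained("bert-onnx/")
# inputs = tokenizer("onnx runtime home lab", return_tensors="pt")
# outputs = ort_model(**inputs)   # runs on ONNX Runtime, not PyTorch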

from dataclasses import dataclass

@dataclass
class ConversionMethod:
    framework: str
    tool: str
    command: str
    notes: str

methods = [
    ConversionMethod("PyTorch",
        "torch.onnx.export()",
        "torch.onnx.export(model, dummy, 'model.onnx', opset_version=17)",
        "ตั้ง dynamic_axes สำหรับ Variable Batch"),
    ConversionMethod("TensorFlow",
        "tf2onnx",
        "python -m tf2onnx.convert --saved-model ./model --output model.onnx",
        "pip install tf2onnx"),
    ConversionMethod("Hugging Face",
        "optimum",
        "optimum-cli export onnx --model bert-base-uncased ./bert-onnx/",
        "รองรับ Transformers ทุก Model"),
    ConversionMethod("Scikit-learn",
        "skl2onnx",
        "convert_sklearn(model, 'model', initial_types)",
        "pip install skl2onnx"),
]

print("=== Conversion Methods ===")
for m in methods:
    print(f"  [{m.framework}] Tool: {m.tool}")
    print(f"    Command: {m.command}")
    print(f"    Notes: {m.notes}")

Inference & Optimization

# === ONNX Runtime Inference ===

# Basic Inference
# session = ort.InferenceSession("model.onnx",
#     providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# input_name = session.get_inputs()[0].name
# output_name = session.get_outputs()[0].name
#
# result = session.run([output_name], {input_name: input_data})

# Session Options for Optimization
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
#
# # Enable memory optimization
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# session = ort.InferenceSession("model.onnx",
#     sess_options=sess_options,
#     providers=["CUDAExecutionProvider"])

# Quantization (INT8)
# from onnxruntime.quantization import quantize_dynamic, QuantType
# quantize_dynamic("model.onnx", "model_int8.onnx",
#     weight_type=QuantType.QInt8)
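
# Static quantization additionally needs representative inputs supplied through a
# CalibrationDataReader. A minimal sketch (calibration_samples and the "input"
# name are assumptions, not part of the example above):
# from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType
#
# class SimpleReader(CalibrationDataReader):
#     def __init__(self, samples, input_name="input"):
#         self._iter = iter([{input_name: s} for s in samples])
#     def get_next(self):
#         return next(self._iter, None)
#
# reader = SimpleReader(calibration_samples)  # e.g. a few hundred np.float32 arrays
# quantize_static("model.onnx", "model_int8_static.onnx", reader,
#     weight_type=QuantType.QInt8)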

@dataclass
class OptimizationTechnique:
    technique: str
    speedup: str
    size_reduction: str
    quality_impact: str
    command: str

techniques = [
    OptimizationTechnique("Graph Optimization",
        "1.5-2x", "ไม่ลด", "ไม่กระทบ (Lossless)",
        "GraphOptimizationLevel.ORT_ENABLE_ALL"),
    OptimizationTechnique("Dynamic Quantization INT8",
        "2-3x (CPU)", "4x เล็กลง", "น้อยมาก (-0.5% accuracy)",
        "quantize_dynamic(model, output, QInt8)"),
    OptimizationTechnique("Static Quantization INT8",
        "2-4x (CPU)", "4x เล็กลง", "น้อย (-1% accuracy)",
        "quantize_static(model, output, calibration_data)"),
    OptimizationTechnique("FP16 Conversion",
        "1.5-2x (GPU)", "2x เล็กลง", "แทบไม่กระทบ",
        "ConvertFloat16().process(model)"),
    OptimizationTechnique("TensorRT Provider",
        "2-5x (vs CUDA)", "ไม่ลด", "ไม่กระทบ",
        "providers=['TensorrtExecutionProvider']"),
]

print("=== Optimization Techniques ===")
for o in techniques:
    print(f"  [{o.technique}] Speed: {o.speedup} | Size: {o.size_reduction}")
    print(f"    Quality: {o.quality_impact}")
    print(f"    Command: {o.command}")

API Serving

# === FastAPI + ONNX Runtime Serving ===

# from fastapi import FastAPI
# import onnxruntime as ort
# import numpy as np
#
# app = FastAPI()
# session = ort.InferenceSession("model.onnx",
#     providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# @app.post("/predict")
# async def predict(data: dict):
#     input_data = np.array(data["input"], dtype=np.float32)
#     input_name = session.get_inputs()[0].name
#     result = session.run(None, {input_name: input_data})
#     return {"prediction": result[0].tolist()}
#
# # Run: uvicorn app:app --host 0.0.0.0 --port 8000

# Docker Deployment
# FROM python:3.11-slim
# RUN pip install onnxruntime fastapi uvicorn numpy
# COPY model.onnx /app/
# COPY app.py /app/
# WORKDIR /app
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

@dataclass
class ServingConfig:
    setting: str
    value: str
    purpose: str

configs = [
    ServingConfig("Workers", "4 (CPU cores)",
        "จำนวน Worker Process สำหรับ Parallel Requests"),
    ServingConfig("Batch Size", "1-32",
        "รวม Request เป็น Batch เพิ่ม Throughput"),
    ServingConfig("Model Warm-up", "เรียก Inference 1 ครั้งตอน Start",
        "ลด Cold Start Latency ครั้งแรก"),
    ServingConfig("Health Check", "/health endpoint",
        "ตรวจว่า Model โหลดสำเร็จ พร้อม Serve"),
    ServingConfig("Monitoring", "Prometheus metrics",
        "วัด Latency Throughput Error Rate"),
]

print("=== Serving Config ===")
for c in configs:
    print(f"  [{c.setting}] {c.value}")
    print(f"    Purpose: {c.purpose}")

Tips

What is ONNX Runtime?

Microsoft's cross-platform inference engine for the ONNX format. It runs models exported from PyTorch or TensorFlow on CPU and GPU through execution providers such as CUDA, TensorRT, DirectML, and CoreML, is typically 2-5x faster than running the original framework, and ships Python and C++ APIs suited to production use.

What does the home lab need?

A CPU with 4+ cores, 16 GB+ RAM, an NVIDIA GPU with 6 GB+ VRAM if you want CUDA, and an SSD. Run Ubuntu or Windows with Python 3.10+, install CUDA and cuDNN for the GPU build, add Docker for deployment, and install onnxruntime-gpu via pip.

How do you convert models?

Use torch.onnx.export() for PyTorch, tf2onnx for TensorFlow, skl2onnx for scikit-learn, and Hugging Face Optimum for Transformers models. Set dynamic_axes and an opset_version, validate the result with the ONNX checker, then optimize further with INT8 quantization or FP16 conversion.

Is it really faster than PyTorch?

On the same hardware, typically 2-5x faster thanks to graph optimization, kernel fusion, and constant folding; the TensorRT provider can push that to 10-30x. INT8 quantization, multi-threading, and batch inference add further gains.
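
A quick way to sanity-check that claim on your own hardware is a small timing loop against the exported model (a rough sketch; it reuses MyModel and model.onnx from the conversion section and does only a single warm-up call):

# === Rough latency comparison (sketch) ===
# import time
#
# def bench_ms(fn, runs=200):
#     fn()  # warm-up
#     start = time.perf_counter()
#     for _ in range(runs):
#         fn()
#     return (time.perf_counter() - start) / runs * 1000  # ms per run
#
# x = torch.randn(1, 768)
# sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
# with torch.no_grad():
#     pt_ms = bench_ms(lambda: model(x))
# ort_ms = bench_ms(lambda: sess.run(None, {"input": x.numpy()}))
# print(f"PyTorch: {pt_ms:.2f} ms | ONNX Runtime: {ort_ms:.2f} ms")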

Summary

ONNX Runtime turns a home lab into a capable inference server: convert models to ONNX, accelerate them on GPU with CUDA or TensorRT, shrink them with INT8 quantization, and serve them with FastAPI in Docker for production-style optimization.
