ONNX Runtime Home Lab
ONNX Runtime is Microsoft's inference engine for AI models exported from PyTorch and TensorFlow, with CPU and GPU optimization that scales from a home lab to production.
| Execution Provider | Hardware | Speed | Install |
|---|---|---|---|
| CPU (Default) | Any CPU | 1x (Baseline) | pip install onnxruntime |
| CUDA | NVIDIA GPU | 5-20x | pip install onnxruntime-gpu |
| TensorRT | NVIDIA GPU | 10-30x | onnxruntime-gpu + TensorRT |
| DirectML | AMD/Intel/NVIDIA GPU | 3-10x | pip install onnxruntime-directml |
| CoreML | Apple Silicon | 5-15x | pip install onnxruntime (macOS) |
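A quick way to see which providers your build actually supports (a minimal sketch; onnxruntime must already be installed, and model.onnx is a placeholder):
# === Check Available Providers ===
# import onnxruntime as ort
#
# print(ort.get_available_providers())
# # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']
#
# # Providers are tried in order; unavailable ones fall back to the next entry
# session = ort.InferenceSession(
#     "model.onnx",
#     providers=["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"],
# )
# print(session.get_providers())  # the providers actually in use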
Model Conversion
# === Convert PyTorch to ONNX ===
# pip install onnx onnxruntime torch
import torch
import onnx
import onnxruntime as ort
import numpy as np
# Example: Convert a simple model
# class MyModel(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear1 = torch.nn.Linear(768, 256)
#         self.relu = torch.nn.ReLU()
#         self.linear2 = torch.nn.Linear(256, 10)
#
#     def forward(self, x):
#         return self.linear2(self.relu(self.linear1(x)))
#
# model = MyModel()
# model.eval()
#
# # Create dummy input
# dummy_input = torch.randn(1, 768)
#
# # Export to ONNX
# torch.onnx.export(
#     model,
#     dummy_input,
#     "model.onnx",
#     input_names=["input"],
#     output_names=["output"],
#     dynamic_axes={
#         "input": {0: "batch_size"},
#         "output": {0: "batch_size"},
#     },
#     opset_version=17,
# )
#
# # Verify
# onnx_model = onnx.load("model.onnx")
# onnx.checker.check_model(onnx_model)
# print("Model is valid!")
# Hugging Face Optimum export
# pip install optimum[onnxruntime]
# optimum-cli export onnx --model bert-base-uncased bert-onnx/
# optimum-cli export onnx --model gpt2 gpt2-onnx/
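# Loading the export back through Optimum's ORT wrapper (a sketch; assumes the
# bert-onnx/ directory from the command above and a default feature-extraction export):
# from optimum.onnxruntime import ORTModelForFeatureExtraction
# from transformers import AutoTokenizer
#
# ort_model = ORTModelForFeatureExtraction.from_pretrained("bert-onnx/")
# tokenizer = AutoTokenizer.from_pretrained("bert-onnx/")
# inputs = tokenizer("hello world", return_tensors="pt")
# outputs = ort_model(**inputs)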
from dataclasses import dataclass

@dataclass
class ConversionMethod:
    framework: str
    tool: str
    command: str
    notes: str

methods = [
    ConversionMethod("PyTorch",
                     "torch.onnx.export()",
                     "torch.onnx.export(model, dummy, 'model.onnx', opset_version=17)",
                     "Set dynamic_axes for variable batch sizes"),
    ConversionMethod("TensorFlow",
                     "tf2onnx",
                     "python -m tf2onnx.convert --saved-model ./model --output model.onnx",
                     "pip install tf2onnx"),
    ConversionMethod("Hugging Face",
                     "optimum",
                     "optimum-cli export onnx --model bert-base-uncased ./bert-onnx/",
                     "Supports most Transformers architectures"),
    ConversionMethod("Scikit-learn",
                     "skl2onnx",
                     "convert_sklearn(model, 'model', initial_types)",
                     "pip install skl2onnx"),
]

print("=== Conversion Methods ===")
for m in methods:
    print(f"  [{m.framework}] Tool: {m.tool}")
    print(f"    Command: {m.command}")
    print(f"    Notes: {m.notes}")
Inference & Optimization
# === ONNX Runtime Inference ===
# Basic Inference
# session = ort.InferenceSession("model.onnx",
#                                providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# input_name = session.get_inputs()[0].name
# output_name = session.get_outputs()[0].name
#
# result = session.run([output_name], {input_name: input_data})
# Session Options for Optimization
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
#
# # Enable memory optimization
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# session = ort.InferenceSession("model.onnx",
#                                sess_options=sess_options,
#                                providers=["CUDAExecutionProvider"])
# Quantization (INT8)
# from onnxruntime.quantization import quantize_dynamic, QuantType
# quantize_dynamic("model.onnx", "model_int8.onnx",
#                  weight_type=QuantType.QInt8)
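# Static quantization needs a calibration data reader; a minimal sketch
# (Reader and calibration_samples are illustrative names, not a fixed API):
# from onnxruntime.quantization import quantize_static, CalibrationDataReader
#
# class Reader(CalibrationDataReader):
#     def __init__(self, samples, input_name):
#         self._it = iter([{input_name: s} for s in samples])
#     def get_next(self):
#         return next(self._it, None)  # None signals end of calibration data
#
# reader = Reader(calibration_samples, "input")  # list of np.float32 arrays
# quantize_static("model.onnx", "model_int8_static.onnx", reader)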
@dataclass
class OptimizationTechnique:
    technique: str
    speedup: str
    size_reduction: str
    quality_impact: str
    command: str

techniques = [
    OptimizationTechnique("Graph Optimization",
                          "1.5-2x", "None", "None (lossless)",
                          "GraphOptimizationLevel.ORT_ENABLE_ALL"),
    OptimizationTechnique("Dynamic Quantization INT8",
                          "2-3x (CPU)", "4x smaller", "Minimal (~0.5% accuracy loss)",
                          "quantize_dynamic(model, output, QInt8)"),
    OptimizationTechnique("Static Quantization INT8",
                          "2-4x (CPU)", "4x smaller", "Small (~1% accuracy loss)",
                          "quantize_static(model, output, calibration_data)"),
    OptimizationTechnique("FP16 Conversion",
                          "1.5-2x (GPU)", "2x smaller", "Negligible",
                          "float16.convert_float_to_float16(model)"),
    OptimizationTechnique("TensorRT Provider",
                          "2-5x (vs CUDA)", "None", "None",
                          "providers=['TensorrtExecutionProvider']"),
]

print("=== Optimization Techniques ===")
for o in techniques:
    print(f"  [{o.technique}] Speed: {o.speedup} | Size: {o.size_reduction}")
    print(f"    Quality: {o.quality_impact}")
    print(f"    Command: {o.command}")
API Serving
# === FastAPI + ONNX Runtime Serving ===
# from fastapi import FastAPI
# import onnxruntime as ort
# import numpy as np
#
# app = FastAPI()
# session = ort.InferenceSession("model.onnx",
#                                providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
#
# @app.post("/predict")
# async def predict(data: dict):
#     input_data = np.array(data["input"], dtype=np.float32)
#     input_name = session.get_inputs()[0].name
#     result = session.run(None, {input_name: input_data})
#     return {"prediction": result[0].tolist()}
#
# # Run: uvicorn app:app --host 0.0.0.0 --port 8000
# Docker Deployment
# FROM python:3.11-slim
# RUN pip install onnxruntime fastapi uvicorn numpy
# COPY model.onnx /app/
# COPY app.py /app/
# WORKDIR /app
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
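# Build and run (image tag onnx-api is illustrative):
# docker build -t onnx-api .
# docker run --rm -p 8000:8000 onnx-api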
@dataclass
class ServingConfig:
    setting: str
    value: str
    purpose: str

configs = [
    ServingConfig("Workers", "4 (CPU cores)",
                  "Number of worker processes for handling parallel requests"),
    ServingConfig("Batch Size", "1-32",
                  "Group requests into batches to increase throughput"),
    ServingConfig("Model Warm-up", "Run one inference at startup",
                  "Reduces cold-start latency on the first request"),
    ServingConfig("Health Check", "/health endpoint",
                  "Verifies the model loaded successfully and is ready to serve"),
    ServingConfig("Monitoring", "Prometheus metrics",
                  "Track latency, throughput, and error rate"),
]

print("=== Serving Config ===")
for c in configs:
    print(f"  [{c.setting}] {c.value}")
    print(f"    Purpose: {c.purpose}")
Tips
- ONNX: Convert models to ONNX before deploying; inference is typically faster than the native framework
- Quantize: Use INT8 quantization on CPU for ~4x smaller models and a 2-3x speedup
- TensorRT: Use the TensorRT provider on NVIDIA GPUs for the fastest inference
- Dynamic: Set dynamic axes to support variable input sizes
- Warm-up: Run one inference at startup to reduce cold-start latency
What is ONNX Runtime?
ONNX Runtime is Microsoft's inference engine for the ONNX format. It runs models exported from PyTorch and TensorFlow on CPU or GPU via the CUDA, TensorRT, DirectML, and CoreML execution providers, and its C++ core typically delivers 2-5x faster inference than native Python in production.
What does a Home Lab need?
A CPU with 4+ cores, 16GB+ RAM, optionally an NVIDIA GPU with 6GB+ VRAM, an SSD, Ubuntu or Windows, Python 3.10+, CUDA and cuDNN for GPU inference, Docker, and onnxruntime-gpu installed via pip.
How do I convert a model?
Use torch.onnx.export() for PyTorch, tf2onnx for TensorFlow, skl2onnx for Scikit-learn, or Hugging Face Optimum for Transformers. Set dynamic_axes and opset_version, validate with the ONNX checker, then optimize with INT8 quantization or FP16 conversion.
Is it really faster than PyTorch?
Typically 2-5x faster, thanks to graph optimization, kernel fusion, and constant folding; the TensorRT provider can reach 10-30x. INT8 quantization, multi-threading, and batch inference add further gains.
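A quick way to check the claim on your own hardware (a rough sketch; assumes the MyModel export from the conversion section):
# === Latency Comparison ===
# import time
#
# x = torch.randn(32, 768)
# with torch.no_grad():
#     t0 = time.perf_counter()
#     for _ in range(100):
#         model(x)
#     torch_ms = (time.perf_counter() - t0) * 10  # avg ms per run
#
# sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
# feed = {sess.get_inputs()[0].name: x.numpy()}
# t0 = time.perf_counter()
# for _ in range(100):
#     sess.run(None, feed)
# ort_ms = (time.perf_counter() - t0) * 10  # avg ms per run
# print(f"PyTorch: {torch_ms:.2f} ms | ONNX Runtime: {ort_ms:.2f} ms")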
Summary
ONNX Runtime turns a home lab into a capable inference engine: convert models to ONNX, accelerate with CUDA or TensorRT, shrink with INT8 quantization, and serve through FastAPI and Docker for production-grade optimization.
