ONNX Runtime Cost Optimization
ONNX Runtime Cost Optimization ลดค่าใช้จ่าย Quantization INT8 FP16 Graph Optimization TensorRT CPU GPU Edge Inference Production
| Optimization | Cost Savings | Speed | Accuracy Impact | Effort |
|---|---|---|---|---|
| FP16 (GPU) | 30-50% | 1.5-2x faster | < 0.1% loss | ต่ำ |
| INT8 Dynamic (CPU) | 60-80% | 1.5-2x faster | < 1% loss | ต่ำ |
| INT8 Static (CPU) | 60-80% | 2-3x faster | < 0.5% loss | ปานกลาง |
| TensorRT (GPU) | 40-60% | 2-5x faster | < 0.1% loss | ปานกลาง |
| Graph Optimization | 10-30% | 1.2-1.5x faster | 0% loss | ต่ำ |
| Model Distillation | 70-90% | 3-10x faster | 1-5% loss | สูง |
Quantization
# === ONNX Runtime Quantization ===
# pip install onnxruntime onnx onnxruntime-tools
# from onnxruntime.quantization import quantize_dynamic, quantize_static
# from onnxruntime.quantization import QuantType, QuantFormat, CalibrationDataReader
# import onnxruntime as ort
# import numpy as np
#
# # Method 1: Dynamic Quantization (Easiest)
# quantize_dynamic(
# model_input="model.onnx",
# model_output="model_int8_dynamic.onnx",
# weight_type=QuantType.QInt8,
# )
#
# # Method 2: Static Quantization (Better accuracy)
# class CalibrationData(CalibrationDataReader):
# def __init__(self, dataset):
# self.data = iter(dataset)
# def get_next(self):
# try:
# return {"input": next(self.data)}
# except StopIteration:
# return None
#
# # NOTE: random tensors are only a placeholder here — for usable activation
# # ranges, calibrate with ~100-500 real samples from your inference data.
# calibration_dataset = [np.random.randn(1, 3, 224, 224).astype(np.float32)
# for _ in range(100)]
# quantize_static(
# model_input="model.onnx",
# model_output="model_int8_static.onnx",
# calibration_data_reader=CalibrationData(calibration_dataset),
# quant_format=QuantFormat.QDQ,
# weight_type=QuantType.QInt8,
# )
#
# # Method 3: FP16 Conversion
# from onnxconverter_common import float16
# import onnx
# model = onnx.load("model.onnx")
# model_fp16 = float16.convert_float_to_float16(model)
# onnx.save(model_fp16, "model_fp16.onnx")
from dataclasses import dataclass


@dataclass
class QuantMethod:
    """One row of the quantization-method comparison table."""

    method: str          # human-readable method name
    precision: str       # numeric formats used for weights/activations
    size_reduction: str  # model size shrink factor
    speed_gain: str      # expected inference speedup
    accuracy_loss: str   # expected accuracy degradation
    calibration: str     # calibration-data requirement


# Raw table rows: (method, precision, size, speed, accuracy loss, calibration).
_QUANT_ROWS = [
    ("Dynamic INT8", "INT8 Weights, FP32 Activations", "2-4x smaller",
     "1.5-2x faster (CPU)", "< 1%", "ไม่ต้อง"),
    ("Static INT8", "INT8 Weights + Activations", "2-4x smaller",
     "2-3x faster (CPU)", "< 0.5%", "100-500 samples"),
    ("QAT INT8", "INT8 (Trained)", "2-4x smaller",
     "2-3x faster", "< 0.3%", "Full Training"),
    ("FP16", "FP16 Weights + Activations", "2x smaller",
     "1.5-2x faster (GPU)", "< 0.1%", "ไม่ต้อง"),
    ("INT4 (GPTQ/AWQ)", "INT4 Weights", "4-8x smaller",
     "2-4x faster", "1-3% (LLM)", "Calibration Dataset"),
]
methods = [QuantMethod(*row) for row in _QUANT_ROWS]

print("=== Quantization Methods ===")
for entry in methods:
    print(f" [{entry.method}] {entry.precision}")
    print(f" Size: {entry.size_reduction} | Speed: {entry.speed_gain}")
    print(f" Accuracy Loss: {entry.accuracy_loss} | Calibration: {entry.calibration}")
Graph Optimization & Execution Providers
# === Graph Optimization & EP Selection ===
# import onnxruntime as ort
#
# # Session Options
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# # CPU Inference (Optimized)
# session = ort.InferenceSession(
# "model_int8.onnx",
# sess_options,
# providers=['CPUExecutionProvider']
# )
#
# # GPU Inference (CUDA)
# session = ort.InferenceSession(
# "model_fp16.onnx",
# sess_options,
# providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
# )
#
# # GPU Inference (TensorRT - Fastest)
# session = ort.InferenceSession(
# "model.onnx",
# sess_options,
# providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']
# )
@dataclass
class ExecutionProvider:
    """One row of the execution-provider comparison table."""

    name: str      # ONNX Runtime EP identifier
    hardware: str  # target hardware
    speed: str     # relative throughput vs CPU baseline
    cost: str      # rough hourly cost tier
    use_case: str  # recommended deployment scenario


# Raw table rows: (name, hardware, speed, cost, use case).
_EP_ROWS = [
    ("CPUExecutionProvider", "Intel/AMD CPU", "Baseline",
     "ต่ำสุด ($0.05-0.5/hr)", "Low-traffic API INT8 Quantized"),
    ("CUDAExecutionProvider", "NVIDIA GPU", "5-10x CPU",
     "ปานกลาง ($0.5-3/hr)", "High-throughput FP16 Batch"),
    ("TensorrtExecutionProvider", "NVIDIA GPU (TensorRT)", "10-20x CPU",
     "ปานกลาง ($0.5-3/hr)", "Max Performance Latency-critical"),
    ("OpenVINOExecutionProvider", "Intel CPU/GPU/VPU", "2-5x CPU",
     "ต่ำ (Intel HW)", "Intel Hardware Edge Device"),
    ("DirectMLExecutionProvider", "Windows GPU (Any)", "5-10x CPU",
     "ปานกลาง", "Windows Desktop AMD/Intel/NVIDIA"),
    ("CoreMLExecutionProvider", "Apple Silicon (M1/M2/M3)", "5-15x CPU",
     "ต่ำ (Apple HW)", "macOS iOS Inference"),
]
providers = [ExecutionProvider(*row) for row in _EP_ROWS]

print("=== Execution Providers ===")
for ep in providers:
    print(f" [{ep.name}] {ep.hardware}")
    print(f" Speed: {ep.speed} | Cost: {ep.cost}")
    print(f" Use: {ep.use_case}")
Cost Analysis
# === Cost Optimization Analysis ===
@dataclass
class CostScenario:
    """One before/after cost-saving scenario for inference hosting."""

    scenario: str        # short label for the migration
    before: str          # baseline setup and monthly cost
    after: str           # optimized setup and monthly cost
    monthly_saving: str  # dollar savings and percentage
    technique: str       # optimization technique applied


# Raw table rows: (scenario, before, after, monthly saving, technique).
_COST_ROWS = [
    ("GPU → CPU (INT8)",
     "p3.2xlarge GPU $3.06/hr = $2,203/mo",
     "c5.2xlarge CPU $0.34/hr = $245/mo",
     "$1,958/mo (89% savings)",
     "INT8 Static Quantization + Graph Optimization"),
    ("Large GPU → Small GPU (FP16)",
     "p3.2xlarge $3.06/hr = $2,203/mo",
     "g4dn.xlarge $0.526/hr = $379/mo",
     "$1,824/mo (83% savings)",
     "FP16 Conversion + TensorRT"),
    ("Always-on → Auto-scale",
     "3x g4dn.xlarge 24/7 = $1,137/mo",
     "0-3x g4dn.xlarge (avg 30%) = $341/mo",
     "$796/mo (70% savings)",
     "Kubernetes HPA + Scale to Zero"),
    ("On-demand → Spot",
     "g4dn.xlarge on-demand $0.526/hr",
     "g4dn.xlarge spot ~$0.16/hr",
     "$264/mo (70% savings)",
     "Spot Instance + Fallback On-demand"),
    ("Cloud → Edge",
     "Cloud Inference $500/mo",
     "Edge Device (one-time $200)",
     "$500/mo ongoing (after payback)",
     "ONNX Runtime + Edge Device (Jetson/RPi)"),
]
scenarios = [CostScenario(*row) for row in _COST_ROWS]

print("=== Cost Scenarios ===")
for sc in scenarios:
    print(f"\n [{sc.scenario}]")
    print(f" Before: {sc.before}")
    print(f" After: {sc.after}")
    print(f" Saving: {sc.monthly_saving}")
    print(f" Technique: {sc.technique}")
เคล็ดลับ
- INT8: เริ่ม Dynamic Quantization ง่ายสุด ลดต้นทุนทันที
- FP16: ใช้ FP16 บน GPU แทบไม่สูญเสีย Accuracy
- TensorRT: ใช้ TensorRT EP เร็วสุดบน NVIDIA GPU
- Scale Zero: ตั้ง Auto-scale to Zero เมื่อไม่มี Traffic
- Benchmark: วัด Latency Throughput Accuracy ก่อนและหลัง Optimize
ONNX Runtime คืออะไร
High-performance Inference Engine ONNX Format CPU GPU NPU Quantization INT8 FP16 Graph Optimization Execution Provider Microsoft Open Source
Quantization ทำอย่างไร
Dynamic INT8 ง่าย Static INT8 Calibration QAT Training FP16 GPU INT4 LLM quantize_dynamic quantize_static onnxconverter float16
Graph Optimization ทำอะไร
Constant Folding Dead Code Operator Fusion Attention Fusion GELU Layout ORT_ENABLE_ALL ลด Inference 20-50% ลด Memory 10-30% 0% Accuracy Loss
ลดค่าใช้จ่ายอย่างไร
GPU→CPU INT8 89% FP16 TensorRT 83% Auto-scale 70% Spot Instance 70% Edge Inference Batching Caching Distillation Scale Zero
สรุป
ONNX Runtime Cost Optimization Quantization INT8 FP16 Graph Optimization TensorRT Auto-scale Spot Edge Inference Production
