it
ONNX Runtime Cost Optimization ลดค่าใช้จ่าย —
ONNX Runtime Cost Optimization

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย Quantization INT8 FP16 Graph Optimization TensorRT CPU GPU Edge Inference Production
เนื้อหาเกี่ยวข้อง — JavaScript Deno Deploy API Gateway Pattern — คู่มือฉบับสมบูรณ์ 2026
| Optimization | Cost Savings | Speed | Accuracy Impact | Effort |
|---|---|---|---|---|
| FP16 (GPU) | 30-50% | 1.5-2x faster | < 0.1% loss | ต่ำ |
| INT8 Dynamic (CPU) | 60-80% | 1.5-2x faster | < 1% loss | ต่ำ |
| INT8 Static (CPU) | 60-80% | 2-3x faster | < 0.5% loss | ปานกลาง |
| TensorRT (GPU) | 40-60% | 2-5x faster | < 0.1% loss | ปานกลาง |
| Graph Optimization | 10-30% | 1.2-1.5x faster | 0% loss | ต่ำ |
| Model Distillation | 70-90% | 3-10x faster | 1-5% loss | สูง |
Quantization
# === ONNX Runtime Quantization ===
# pip install onnxruntime onnx onnxruntime-tools onnxconverter-common
# from onnxruntime.quantization import quantize_dynamic, quantize_static
# from onnxruntime.quantization import QuantFormat, QuantType, CalibrationDataReader
# import onnxruntime as ort
# import numpy as np
#
# # Method 1: Dynamic Quantization (Easiest)
# quantize_dynamic(
# model_input="model.onnx",
# model_output="model_int8_dynamic.onnx",
# weight_type=QuantType.QInt8,
# )
#
# # Method 2: Static Quantization (Better accuracy)
# class CalibrationData(CalibrationDataReader):
#     def __init__(self, dataset):
#         self.data = iter(dataset)
#     def get_next(self):
#         try:
#             return {"input": next(self.data)}
#         except StopIteration:
#             return None
#
# calibration_dataset = [np.random.randn(1, 3, 224, 224).astype(np.float32)
# for _ in range(100)]
# quantize_static(
# model_input="model.onnx",
# model_output="model_int8_static.onnx",
# calibration_data_reader=CalibrationData(calibration_dataset),
# quant_format=QuantFormat.QDQ,
# weight_type=QuantType.QInt8,
# )
#
# # Method 3: FP16 Conversion
# from onnxconverter_common import float16
# import onnx
# model = onnx.load("model.onnx")
# model_fp16 = float16.convert_float_to_float16(model)
# onnx.save(model_fp16, "model_fp16.onnx")
from dataclasses import dataclass
@dataclass
class QuantMethod:
    """One quantization approach, as shown in the comparison report.

    All fields are free-form display strings; nothing in this file parses
    or computes with them, they are only interpolated into printed lines.
    """

    method: str  # display name, e.g. "Dynamic INT8"
    precision: str  # what is stored at which precision, e.g. "INT8 Weights + Activations"
    size_reduction: str  # model-size effect, e.g. "2-4x smaller"
    speed_gain: str  # inference speed-up, e.g. "2-3x faster (CPU)"
    accuracy_loss: str  # accuracy loss figure, e.g. "< 0.5%"
    calibration: str  # calibration requirement, e.g. "100-500 samples"
# Comparison data for the quantization approaches; every value is display
# text that the report loop below prints verbatim.
methods = [
    QuantMethod(
        method="Dynamic INT8",
        precision="INT8 Weights, FP32 Activations",
        size_reduction="2-4x smaller",
        speed_gain="1.5-2x faster (CPU)",
        accuracy_loss="< 1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="Static INT8",
        precision="INT8 Weights + Activations",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster (CPU)",
        accuracy_loss="< 0.5%",
        calibration="100-500 samples",
    ),
    QuantMethod(
        method="QAT INT8",
        precision="INT8 (Trained)",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster",
        accuracy_loss="< 0.3%",
        calibration="Full Training",
    ),
    QuantMethod(
        method="FP16",
        precision="FP16 Weights + Activations",
        size_reduction="2x smaller",
        speed_gain="1.5-2x faster (GPU)",
        accuracy_loss="< 0.1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="INT4 (GPTQ/AWQ)",
        precision="INT4 Weights",
        size_reduction="4-8x smaller",
        speed_gain="2-4x faster",
        accuracy_loss="1-3% (LLM)",
        calibration="Calibration Dataset",
    ),
]

print("=== Quantization Methods ===")
for m in methods:
    # Three lines per method: name and precision, size and speed,
    # then accuracy loss and calibration requirement.
    print(f" [{m.method}] {m.precision}")
    print(f" Size: {m.size_reduction} | Speed: {m.speed_gain}")
    print(f" Accuracy Loss: {m.accuracy_loss} | Calibration: {m.calibration}")
Graph Optimization & Execution Providers

# === Graph Optimization & EP Selection ===
# import onnxruntime as ort
#
# # Session Options
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# # CPU Inference (Optimized)
# session = ort.InferenceSession(
#     "model_int8_static.onnx",  # output of the static quantization example above
#     sess_options,
#     providers=['CPUExecutionProvider']
# )
#
# # GPU Inference (CUDA)
# session = ort.InferenceSession(
# "model_fp16.onnx",
# sess_options,
# providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
# )
#
# # GPU Inference (TensorRT - Fastest)
# session = ort.InferenceSession(
# "model.onnx",
# sess_options,
# providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']
# )
@dataclass
class ExecutionProvider:
    """One ONNX Runtime execution provider, as shown in the comparison report.

    All fields are free-form display strings; nothing in this file parses
    or computes with them, they are only interpolated into printed lines.
    """

    name: str  # provider identifier, e.g. "CUDAExecutionProvider"
    hardware: str  # target hardware, e.g. "NVIDIA GPU"
    speed: str  # speed relative to CPU, e.g. "5-10x CPU" ("Baseline" for CPU itself)
    cost: str  # cost description, e.g. "ปานกลาง ($0.5-3/hr)"
    use_case: str  # typical usage, e.g. "High-throughput FP16 Batch"
# Comparison data for the execution providers; every value is display text
# that the report loop below prints verbatim.
providers = [
    ExecutionProvider(
        name="CPUExecutionProvider",
        hardware="Intel/AMD CPU",
        speed="Baseline",
        cost="ต่ำสุด ($0.05-0.5/hr)",
        use_case="Low-traffic API INT8 Quantized",
    ),
    ExecutionProvider(
        name="CUDAExecutionProvider",
        hardware="NVIDIA GPU",
        speed="5-10x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="High-throughput FP16 Batch",
    ),
    ExecutionProvider(
        name="TensorrtExecutionProvider",
        hardware="NVIDIA GPU (TensorRT)",
        speed="10-20x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="Max Performance Latency-critical",
    ),
    ExecutionProvider(
        name="OpenVINOExecutionProvider",
        hardware="Intel CPU/GPU/VPU",
        speed="2-5x CPU",
        cost="ต่ำ (Intel HW)",
        use_case="Intel Hardware Edge Device",
    ),
    ExecutionProvider(
        name="DirectMLExecutionProvider",
        hardware="Windows GPU (Any)",
        speed="5-10x CPU",
        cost="ปานกลาง",
        use_case="Windows Desktop AMD/Intel/NVIDIA",
    ),
    ExecutionProvider(
        name="CoreMLExecutionProvider",
        hardware="Apple Silicon (M1/M2/M3)",
        speed="5-15x CPU",
        cost="ต่ำ (Apple HW)",
        use_case="macOS iOS Inference",
    ),
]

print("=== Execution Providers ===")
for p in providers:
    # Three lines per provider: name and hardware, speed and cost, use case.
    print(f" [{p.name}] {p.hardware}")
    print(f" Speed: {p.speed} | Cost: {p.cost}")
    print(f" Use: {p.use_case}")
Cost Analysis
# === Cost Optimization Analysis ===
@dataclass
class CostScenario:
    """One before/after cost comparison, as shown in the cost report.

    All fields are free-form display strings; the dollar figures inside
    them are pre-written text, not values computed by this file.
    """

    scenario: str  # short label for the change, e.g. "On-demand → Spot"
    before: str  # setup and cost before optimizing
    after: str  # setup and cost after optimizing
    monthly_saving: str  # saving description, e.g. "$796/mo (70% savings)"
    technique: str  # technique(s) applied, e.g. "FP16 Conversion + TensorRT"
# Before/after cost comparisons; every value is display text that the
# report loop below prints verbatim.
scenarios = [
    CostScenario(
        scenario="GPU → CPU (INT8)",
        before="p3.2xlarge GPU $3.06/hr = $2,203/mo",
        after="c5.2xlarge CPU $0.34/hr = $245/mo",
        monthly_saving="$1,958/mo (89% savings)",
        technique="INT8 Static Quantization + Graph Optimization",
    ),
    CostScenario(
        scenario="Large GPU → Small GPU (FP16)",
        before="p3.2xlarge $3.06/hr = $2,203/mo",
        after="g4dn.xlarge $0.526/hr = $379/mo",
        monthly_saving="$1,824/mo (83% savings)",
        technique="FP16 Conversion + TensorRT",
    ),
    CostScenario(
        scenario="Always-on → Auto-scale",
        before="3x g4dn.xlarge 24/7 = $1,137/mo",
        after="0-3x g4dn.xlarge (avg 30%) = $341/mo",
        monthly_saving="$796/mo (70% savings)",
        technique="Kubernetes HPA + Scale to Zero",
    ),
    CostScenario(
        scenario="On-demand → Spot",
        before="g4dn.xlarge on-demand $0.526/hr",
        after="g4dn.xlarge spot ~$0.16/hr",
        monthly_saving="$264/mo (70% savings)",
        technique="Spot Instance + Fallback On-demand",
    ),
    CostScenario(
        scenario="Cloud → Edge",
        before="Cloud Inference $500/mo",
        after="Edge Device (one-time $200)",
        monthly_saving="$500/mo ongoing (after payback)",
        technique="ONNX Runtime + Edge Device (Jetson/RPi)",
    ),
]

print("=== Cost Scenarios ===")
for c in scenarios:
    # A blank line precedes each scenario header to separate the entries.
    print(f"\n [{c.scenario}]")
    print(f" Before: {c.before}")
    print(f" After: {c.after}")
    print(f" Saving: {c.monthly_saving}")
    print(f" Technique: {c.technique}")
เคล็ดลับ
- INT8: เริ่ม Dynamic Quantization ง่ายสุด ลดต้นทุนทันที
- FP16: ใช้ FP16 บน GPU แทบไม่สูญเสีย Accuracy
- TensorRT: ใช้ TensorRT EP เร็วสุดบน NVIDIA GPU
- Scale Zero: ตั้ง Auto-scale to Zero เมื่อไม่มี Traffic
- Benchmark: วัด Latency Throughput Accuracy ก่อนและหลัง Optimize
ONNX Runtime คืออะไร
ONNX Runtime คือ High-performance Inference Engine แบบ Open Source จาก Microsoft สำหรับรันโมเดล ONNX Format บน CPU GPU NPU ผ่าน Execution Provider รองรับ Quantization INT8 FP16 และ Graph Optimization
แนะนำเพิ่มเติม — อีบุ๊กการลงทุน SiamCafeBook
เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: GCP Cloud Run Troubleshooting แก้ปัญหา
เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: GCP Cloud Run Tech Conference 2026





