ONNX Runtime Cost Optimization ลดค่าใช้จ่าย

ONNX Runtime Cost Optimization

ONNX Runtime Cost Optimization: ลดค่าใช้จ่ายด้วย Quantization (INT8, FP16), Graph Optimization และ TensorRT สำหรับ Inference บน CPU, GPU และ Edge ใน Production

เนื้อหาเกี่ยวข้อง — JavaScript Deno Deploy API Gateway Pattern — คู่มือฉบับสมบูรณ์ 2026

Optimization	Cost Savings	Speed	Accuracy Impact	Effort
FP16 (GPU)	30-50%	1.5-2x faster	< 0.1% loss	ต่ำ
INT8 Dynamic (CPU)	60-80%	1.5-2x faster	< 1% loss	ต่ำ
INT8 Static (CPU)	60-80%	2-3x faster	< 0.5% loss	ปานกลาง
TensorRT (GPU)	40-60%	2-5x faster	< 0.1% loss	ปานกลาง
Graph Optimization	10-30%	1.2-1.5x faster	0% loss	ต่ำ
Model Distillation	70-90%	3-10x faster	1-5% loss	สูง

Graph Optimization & Execution Providers

# === Graph Optimization & EP Selection ===

# import onnxruntime as ort
#
# # Session Options
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# # CPU Inference (Optimized)
# session = ort.InferenceSession(
#     "model_int8.onnx",
#     sess_options,
#     providers=['CPUExecutionProvider']
# )
#
# # GPU Inference (CUDA)
# session = ort.InferenceSession(
#     "model_fp16.onnx",
#     sess_options,
#     providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
# )
#
# # GPU Inference (TensorRT - Fastest)
# session = ort.InferenceSession(
#     "model.onnx",
#     sess_options,
#     providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']
# )

# The decorator below needs this import; without it the snippet fails with
# NameError as soon as the class statement is executed.
from dataclasses import dataclass


@dataclass
class ExecutionProvider:
    """Descriptive record for one ONNX Runtime execution provider.

    Every field is a free-form display string; nothing here is parsed or
    validated. Field order is part of the interface because instances may be
    constructed positionally.
    """

    name: str  # provider identifier, e.g. "CPUExecutionProvider"
    hardware: str  # hardware the provider runs on
    speed: str  # speed description relative to the CPU baseline
    cost: str  # rough cost description
    use_case: str  # typical deployment scenario

# Execution providers compared in this guide: target hardware, speed relative
# to the CPU baseline, rough cost, and typical use case.
providers = [
    ExecutionProvider(
        name="CPUExecutionProvider",
        hardware="Intel/AMD CPU",
        speed="Baseline",
        cost="ต่ำสุด ($0.05-0.5/hr)",
        use_case="Low-traffic API INT8 Quantized",
    ),
    ExecutionProvider(
        name="CUDAExecutionProvider",
        hardware="NVIDIA GPU",
        speed="5-10x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="High-throughput FP16 Batch",
    ),
    ExecutionProvider(
        name="TensorrtExecutionProvider",
        hardware="NVIDIA GPU (TensorRT)",
        speed="10-20x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="Max Performance Latency-critical",
    ),
    ExecutionProvider(
        name="OpenVINOExecutionProvider",
        hardware="Intel CPU/GPU/VPU",
        speed="2-5x CPU",
        cost="ต่ำ (Intel HW)",
        use_case="Intel Hardware Edge Device",
    ),
    ExecutionProvider(
        name="DirectMLExecutionProvider",
        hardware="Windows GPU (Any)",
        speed="5-10x CPU",
        cost="ปานกลาง",
        use_case="Windows Desktop AMD/Intel/NVIDIA",
    ),
    ExecutionProvider(
        name="CoreMLExecutionProvider",
        hardware="Apple Silicon (M1/M2/M3)",
        speed="5-15x CPU",
        cost="ต่ำ (Apple HW)",
        use_case="macOS iOS Inference",
    ),
]

# Report: a banner line, then three lines per provider
# (identity, speed/cost, use case).
print("=== Execution Providers ===")
for p in providers:
    print(
        f"  [{p.name}] {p.hardware}",
        f"    Speed: {p.speed} | Cost: {p.cost}",
        f"    Use: {p.use_case}",
        sep="\n",
    )