SiamCafe.net Blog
Technology

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย

onnx runtime cost optimization ลดค่าใช้จ่าย
ONNX Runtime Cost Optimization ลดค่าใช้จ่าย | SiamCafe Blog
2025-08-25· อ. บอม — SiamCafe.net· 10,987 คำ

ONNX Runtime Cost Optimization

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย Quantization INT8 FP16 Graph Optimization TensorRT CPU GPU Edge Inference Production

Optimization | Cost Savings | Speed | Accuracy Impact | Effort
FP16 (GPU) | 30-50% | 1.5-2x faster | < 0.1% loss | ต่ำ
INT8 Dynamic (CPU) | 60-80% | 1.5-2x faster | < 1% loss | ต่ำ
INT8 Static (CPU) | 60-80% | 2-3x faster | < 0.5% loss | ปานกลาง
TensorRT (GPU) | 40-60% | 2-5x faster | < 0.1% loss | ปานกลาง
Graph Optimization | 10-30% | 1.2-1.5x faster | 0% loss | ต่ำ
Model Distillation | 70-90% | 3-10x faster | 1-5% loss | สูง

Quantization

# === ONNX Runtime Quantization ===

# pip install onnxruntime onnx onnxruntime-tools

# from onnxruntime.quantization import quantize_dynamic, quantize_static
# from onnxruntime.quantization import QuantType, QuantFormat, CalibrationDataReader
# import onnxruntime as ort
# import numpy as np
#
# # Method 1: Dynamic Quantization (Easiest)
# quantize_dynamic(
#     model_input="model.onnx",
#     model_output="model_int8_dynamic.onnx",
#     weight_type=QuantType.QInt8,
# )
#
# # Method 2: Static Quantization (Better accuracy)
# class CalibrationData(CalibrationDataReader):
#     def __init__(self, dataset):
#         self.data = iter(dataset)
#     def get_next(self):
#         try:
#             return {"input": next(self.data)}
#         except StopIteration:
#             return None
#
# calibration_dataset = [np.random.randn(1, 3, 224, 224).astype(np.float32)
#                        for _ in range(100)]
# quantize_static(
#     model_input="model.onnx",
#     model_output="model_int8_static.onnx",
#     calibration_data_reader=CalibrationData(calibration_dataset),
#     quant_format=QuantFormat.QDQ,
#     weight_type=QuantType.QInt8,
# )
#
# # Method 3: FP16 Conversion
# from onnxconverter_common import float16
# import onnx
# model = onnx.load("model.onnx")
# model_fp16 = float16.convert_float_to_float16(model)
# onnx.save(model_fp16, "model_fp16.onnx")

from dataclasses import dataclass

@dataclass
class QuantMethod:
    """A single quantization strategy and its size/speed/accuracy trade-offs."""

    method: str          # strategy name, e.g. "Dynamic INT8"
    precision: str       # numeric precision of weights/activations
    size_reduction: str  # model-size shrink factor
    speed_gain: str      # inference speedup and target hardware
    accuracy_loss: str   # expected accuracy degradation
    calibration: str     # calibration/training requirement


# Reference table of common ONNX quantization strategies.
methods = [
    QuantMethod(
        method="Dynamic INT8",
        precision="INT8 Weights, FP32 Activations",
        size_reduction="2-4x smaller",
        speed_gain="1.5-2x faster (CPU)",
        accuracy_loss="< 1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="Static INT8",
        precision="INT8 Weights + Activations",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster (CPU)",
        accuracy_loss="< 0.5%",
        calibration="100-500 samples",
    ),
    QuantMethod(
        method="QAT INT8",
        precision="INT8 (Trained)",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster",
        accuracy_loss="< 0.3%",
        calibration="Full Training",
    ),
    QuantMethod(
        method="FP16",
        precision="FP16 Weights + Activations",
        size_reduction="2x smaller",
        speed_gain="1.5-2x faster (GPU)",
        accuracy_loss="< 0.1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="INT4 (GPTQ/AWQ)",
        precision="INT4 Weights",
        size_reduction="4-8x smaller",
        speed_gain="2-4x faster",
        accuracy_loss="1-3% (LLM)",
        calibration="Calibration Dataset",
    ),
]

# Print a three-line summary per quantization method.
print("=== Quantization Methods ===")
for entry in methods:
    report = [
        f"  [{entry.method}] {entry.precision}",
        f"    Size: {entry.size_reduction} | Speed: {entry.speed_gain}",
        f"    Accuracy Loss: {entry.accuracy_loss} | Calibration: {entry.calibration}",
    ]
    for line in report:
        print(line)

Graph Optimization & Execution Providers

# === Graph Optimization & EP Selection ===

# import onnxruntime as ort
#
# # Session Options
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# # CPU Inference (Optimized)
# session = ort.InferenceSession(
#     "model_int8.onnx",
#     sess_options,
#     providers=['CPUExecutionProvider']
# )
#
# # GPU Inference (CUDA)
# session = ort.InferenceSession(
#     "model_fp16.onnx",
#     sess_options,
#     providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
# )
#
# # GPU Inference (TensorRT - Fastest)
# session = ort.InferenceSession(
#     "model.onnx",
#     sess_options,
#     providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']
# )

@dataclass
class ExecutionProvider:
    """An ONNX Runtime execution provider and its hardware/cost profile."""

    name: str      # provider identifier passed to InferenceSession
    hardware: str  # target hardware
    speed: str     # rough speedup relative to CPU baseline
    cost: str      # indicative hourly cost tier
    use_case: str  # when to pick this provider


# Reference table of execution providers, from cheapest to most specialized.
providers = [
    ExecutionProvider(
        name="CPUExecutionProvider",
        hardware="Intel/AMD CPU",
        speed="Baseline",
        cost="ต่ำสุด ($0.05-0.5/hr)",
        use_case="Low-traffic API INT8 Quantized",
    ),
    ExecutionProvider(
        name="CUDAExecutionProvider",
        hardware="NVIDIA GPU",
        speed="5-10x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="High-throughput FP16 Batch",
    ),
    ExecutionProvider(
        name="TensorrtExecutionProvider",
        hardware="NVIDIA GPU (TensorRT)",
        speed="10-20x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="Max Performance Latency-critical",
    ),
    ExecutionProvider(
        name="OpenVINOExecutionProvider",
        hardware="Intel CPU/GPU/VPU",
        speed="2-5x CPU",
        cost="ต่ำ (Intel HW)",
        use_case="Intel Hardware Edge Device",
    ),
    ExecutionProvider(
        name="DirectMLExecutionProvider",
        hardware="Windows GPU (Any)",
        speed="5-10x CPU",
        cost="ปานกลาง",
        use_case="Windows Desktop AMD/Intel/NVIDIA",
    ),
    ExecutionProvider(
        name="CoreMLExecutionProvider",
        hardware="Apple Silicon (M1/M2/M3)",
        speed="5-15x CPU",
        cost="ต่ำ (Apple HW)",
        use_case="macOS iOS Inference",
    ),
]

# Print a three-line summary per execution provider.
print("=== Execution Providers ===")
for ep in providers:
    report = [
        f"  [{ep.name}] {ep.hardware}",
        f"    Speed: {ep.speed} | Cost: {ep.cost}",
        f"    Use: {ep.use_case}",
    ]
    for line in report:
        print(line)

Cost Analysis

# === Cost Optimization Analysis ===

@dataclass
class CostScenario:
    """A before/after cost-optimization scenario with its enabling technique."""

    scenario: str        # short name of the migration
    before: str          # baseline deployment and monthly cost
    after: str           # optimized deployment and monthly cost
    monthly_saving: str  # absolute and percentage savings
    technique: str       # optimization technique that enables the move


# Reference table of illustrative cost-reduction scenarios.
scenarios = [
    CostScenario(
        scenario="GPU → CPU (INT8)",
        before="p3.2xlarge GPU $3.06/hr = $2,203/mo",
        after="c5.2xlarge CPU $0.34/hr = $245/mo",
        monthly_saving="$1,958/mo (89% savings)",
        technique="INT8 Static Quantization + Graph Optimization",
    ),
    CostScenario(
        scenario="Large GPU → Small GPU (FP16)",
        before="p3.2xlarge $3.06/hr = $2,203/mo",
        after="g4dn.xlarge $0.526/hr = $379/mo",
        monthly_saving="$1,824/mo (83% savings)",
        technique="FP16 Conversion + TensorRT",
    ),
    CostScenario(
        scenario="Always-on → Auto-scale",
        before="3x g4dn.xlarge 24/7 = $1,137/mo",
        after="0-3x g4dn.xlarge (avg 30%) = $341/mo",
        monthly_saving="$796/mo (70% savings)",
        technique="Kubernetes HPA + Scale to Zero",
    ),
    CostScenario(
        scenario="On-demand → Spot",
        before="g4dn.xlarge on-demand $0.526/hr",
        after="g4dn.xlarge spot ~$0.16/hr",
        monthly_saving="$264/mo (70% savings)",
        technique="Spot Instance + Fallback On-demand",
    ),
    CostScenario(
        scenario="Cloud → Edge",
        before="Cloud Inference $500/mo",
        after="Edge Device (one-time $200)",
        monthly_saving="$500/mo ongoing (after payback)",
        technique="ONNX Runtime + Edge Device (Jetson/RPi)",
    ),
]

# Print a five-line summary per scenario, separated by a blank line.
print("=== Cost Scenarios ===")
for case in scenarios:
    report = [
        f"\n  [{case.scenario}]",
        f"    Before: {case.before}",
        f"    After: {case.after}",
        f"    Saving: {case.monthly_saving}",
        f"    Technique: {case.technique}",
    ]
    for line in report:
        print(line)

เคล็ดลับ

ONNX Runtime คืออะไร

High-performance Inference Engine ONNX Format CPU GPU NPU Quantization INT8 FP16 Graph Optimization Execution Provider Microsoft Open Source

Quantization ทำอย่างไร

Dynamic INT8 ง่าย Static INT8 Calibration QAT Training FP16 GPU INT4 LLM quantize_dynamic quantize_static onnxconverter float16

Graph Optimization ทำอะไร

Constant Folding Dead Code Operator Fusion Attention Fusion GELU Layout ORT_ENABLE_ALL ลด Inference 20-50% ลด Memory 10-30% 0% Accuracy Loss

ลดค่าใช้จ่ายอย่างไร

GPU→CPU INT8 89% FP16 TensorRT 83% Auto-scale 70% Spot Instance 70% Edge Inference Batching Caching Distillation Scale Zero

สรุป

ONNX Runtime Cost Optimization Quantization INT8 FP16 Graph Optimization TensorRT Auto-scale Spot Edge Inference Production

📖 บทความที่เกี่ยวข้อง

Data Lakehouse Cost Optimization ลดค่าใช้จ่ายอ่านบทความ → AWS Step Functions Cost Optimization ลดค่าใช้จ่ายอ่านบทความ → RAG Architecture Cost Optimization ลดค่าใช้จ่ายอ่านบทความ → ONNX Runtime DNS Managementอ่านบทความ → Flatcar Container Linux Cost Optimization ลดค่าใช้จ่ายอ่านบทความ →

📚 ดูบทความทั้งหมด →