it

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย —

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย —

ONNX Runtime Cost Optimization

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย —

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย Quantization INT8 FP16 Graph Optimization TensorRT CPU GPU Edge Inference Production

เนื้อหาเกี่ยวข้อง — JavaScript Deno Deploy API Gateway Pattern — คู่มือฉบับสมบูรณ์ 2026

| Optimization | Cost Savings | Speed | Accuracy Impact | Effort |
| --- | --- | --- | --- | --- |
| FP16 (GPU) | 30-50% | 1.5-2x faster | < 0.1% loss | ต่ำ |
| INT8 Dynamic (CPU) | 60-80% | 1.5-2x faster | < 1% loss | ต่ำ |
| INT8 Static (CPU) | 60-80% | 2-3x faster | < 0.5% loss | ปานกลาง |
| TensorRT (GPU) | 40-60% | 2-5x faster | < 0.1% loss | ปานกลาง |
| Graph Optimization | 10-30% | 1.2-1.5x faster | 0% loss | ต่ำ |
| Model Distillation | 70-90% | 3-10x faster | 1-5% loss | สูง |

Quantization

# === ONNX Runtime Quantization ===

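# pip install onnxruntime onnx onnxconverter-common
# (quantization tools ship inside onnxruntime; the separate onnxruntime-tools
#  package is deprecated, and onnxconverter-common is needed for Method 3)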

# from onnxruntime.quantization import quantize_dynamic, quantize_static
# from onnxruntime.quantization import QuantType, QuantFormat, CalibrationDataReader
# import onnxruntime as ort
# import numpy as np
#
# # Method 1: Dynamic Quantization (Easiest)
# quantize_dynamic(
#     model_input="model.onnx",
#     model_output="model_int8_dynamic.onnx",
#     weight_type=QuantType.QInt8,
# )
#
# # Method 2: Static Quantization (Better accuracy)
# class CalibrationData(CalibrationDataReader):
#     def __init__(self, dataset):
#         self.data = iter(dataset)
#     def get_next(self):
#         try:
#             return {"input": next(self.data)}
#         except StopIteration:
#             return None
#
# calibration_dataset = [np.random.randn(1, 3, 224, 224).astype(np.float32)
#                        for _ in range(100)]
# quantize_static(
#     model_input="model.onnx",
#     model_output="model_int8_static.onnx",
#     calibration_data_reader=CalibrationData(calibration_dataset),
#     quant_format=QuantFormat.QDQ,
#     weight_type=QuantType.QInt8,
# )
#
# # Method 3: FP16 Conversion
# from onnxconverter_common import float16
# import onnx
# model = onnx.load("model.onnx")
# model_fp16 = float16.convert_float_to_float16(model)
# onnx.save(model_fp16, "model_fp16.onnx")

from dataclasses import dataclass

@dataclass
class QuantMethod:
    """Display record describing one quantization technique.

    All fields are free-form text meant to be printed as-is; the class adds
    no validation or parsing of its own.

    Attributes:
        method: Name of the technique, e.g. ``"Dynamic INT8"``.
        precision: Which parts of the model use which numeric precision.
        size_reduction: Model-size reduction, e.g. ``"2-4x smaller"``.
        speed_gain: Inference speed-up, e.g. ``"2-3x faster (CPU)"``.
        accuracy_loss: Accuracy degradation, e.g. ``"< 1%"``.
        calibration: Calibration or training data the technique requires.
    """

    method: str
    precision: str
    size_reduction: str
    speed_gain: str
    accuracy_loss: str
    calibration: str

# Comparison table of quantization techniques, one QuantMethod per row.
# Keyword arguments keep each value tied to its field name.
methods = [
    QuantMethod(
        method="Dynamic INT8",
        precision="INT8 Weights, FP32 Activations",
        size_reduction="2-4x smaller",
        speed_gain="1.5-2x faster (CPU)",
        accuracy_loss="< 1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="Static INT8",
        precision="INT8 Weights + Activations",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster (CPU)",
        accuracy_loss="< 0.5%",
        calibration="100-500 samples",
    ),
    QuantMethod(
        method="QAT INT8",
        precision="INT8 (Trained)",
        size_reduction="2-4x smaller",
        speed_gain="2-3x faster",
        accuracy_loss="< 0.3%",
        calibration="Full Training",
    ),
    QuantMethod(
        method="FP16",
        precision="FP16 Weights + Activations",
        size_reduction="2x smaller",
        speed_gain="1.5-2x faster (GPU)",
        accuracy_loss="< 0.1%",
        calibration="ไม่ต้อง",
    ),
    QuantMethod(
        method="INT4 (GPTQ/AWQ)",
        precision="INT4 Weights",
        size_reduction="4-8x smaller",
        speed_gain="2-4x faster",
        accuracy_loss="1-3% (LLM)",
        calibration="Calibration Dataset",
    ),
]

# Print a three-line summary per technique under a single header.
print("=== Quantization Methods ===")
for m in methods:
    print(
        f"  [{m.method}] {m.precision}",
        f"    Size: {m.size_reduction} | Speed: {m.speed_gain}",
        f"    Accuracy Loss: {m.accuracy_loss} | Calibration: {m.calibration}",
        sep="\n",
    )

Graph Optimization & Execution Providers

ONNX Runtime Cost Optimization ลดค่าใช้จ่าย —
# === Graph Optimization & EP Selection ===

# import onnxruntime as ort
#
# # Session Options
# sess_options = ort.SessionOptions()
# sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# sess_options.intra_op_num_threads = 4
# sess_options.inter_op_num_threads = 2
# sess_options.enable_mem_pattern = True
# sess_options.enable_cpu_mem_arena = True
#
# # CPU Inference (Optimized)
# session = ort.InferenceSession(
#     "model_int8.onnx",
#     sess_options,
#     providers=['CPUExecutionProvider']
# )
#
# # GPU Inference (CUDA)
# session = ort.InferenceSession(
#     "model_fp16.onnx",
#     sess_options,
#     providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
# )
#
# # GPU Inference (TensorRT - Fastest)
# session = ort.InferenceSession(
#     "model.onnx",
#     sess_options,
#     providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']
# )

@dataclass
class ExecutionProvider:
    """Display record describing one ONNX Runtime execution provider.

    All fields are free-form text meant to be printed as-is; the class adds
    no validation or parsing of its own.

    Attributes:
        name: Provider identifier, e.g. ``"CPUExecutionProvider"``.
        hardware: Hardware the provider targets.
        speed: Relative speed description, e.g. ``"5-10x CPU"``.
        cost: Rough running-cost description.
        use_case: Workloads the provider is suggested for.
    """

    name: str
    hardware: str
    speed: str
    cost: str
    use_case: str

# Comparison table of execution providers, one ExecutionProvider per row.
# Keyword arguments keep each value tied to its field name.
providers = [
    ExecutionProvider(
        name="CPUExecutionProvider",
        hardware="Intel/AMD CPU",
        speed="Baseline",
        cost="ต่ำสุด ($0.05-0.5/hr)",
        use_case="Low-traffic API INT8 Quantized",
    ),
    ExecutionProvider(
        name="CUDAExecutionProvider",
        hardware="NVIDIA GPU",
        speed="5-10x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="High-throughput FP16 Batch",
    ),
    ExecutionProvider(
        name="TensorrtExecutionProvider",
        hardware="NVIDIA GPU (TensorRT)",
        speed="10-20x CPU",
        cost="ปานกลาง ($0.5-3/hr)",
        use_case="Max Performance Latency-critical",
    ),
    ExecutionProvider(
        name="OpenVINOExecutionProvider",
        hardware="Intel CPU/GPU/VPU",
        speed="2-5x CPU",
        cost="ต่ำ (Intel HW)",
        use_case="Intel Hardware Edge Device",
    ),
    ExecutionProvider(
        name="DirectMLExecutionProvider",
        hardware="Windows GPU (Any)",
        speed="5-10x CPU",
        cost="ปานกลาง",
        use_case="Windows Desktop AMD/Intel/NVIDIA",
    ),
    ExecutionProvider(
        name="CoreMLExecutionProvider",
        hardware="Apple Silicon (M1/M2/M3)",
        speed="5-15x CPU",
        cost="ต่ำ (Apple HW)",
        use_case="macOS iOS Inference",
    ),
]

# Print a three-line summary per provider under a single header.
print("=== Execution Providers ===")
for p in providers:
    print(
        f"  [{p.name}] {p.hardware}",
        f"    Speed: {p.speed} | Cost: {p.cost}",
        f"    Use: {p.use_case}",
        sep="\n",
    )

Cost Analysis

# === Cost Optimization Analysis ===

@dataclass
class CostScenario:
    """Display record describing one before/after cost-saving scenario.

    All fields are free-form text meant to be printed as-is; no amounts are
    parsed or recomputed by the class.

    Attributes:
        scenario: Short label for the change, e.g. ``"On-demand → Spot"``.
        before: Setup and cost before the optimization.
        after: Setup and cost after the optimization.
        monthly_saving: Resulting saving, as text.
        technique: Techniques used to achieve the saving.
    """

    scenario: str
    before: str
    after: str
    monthly_saving: str
    technique: str

# Before/after cost comparisons, one CostScenario per row.
# Keyword arguments keep each value tied to its field name.
scenarios = [
    CostScenario(
        scenario="GPU → CPU (INT8)",
        before="p3.2xlarge GPU $3.06/hr = $2,203/mo",
        after="c5.2xlarge CPU $0.34/hr = $245/mo",
        monthly_saving="$1,958/mo (89% savings)",
        technique="INT8 Static Quantization + Graph Optimization",
    ),
    CostScenario(
        scenario="Large GPU → Small GPU (FP16)",
        before="p3.2xlarge $3.06/hr = $2,203/mo",
        after="g4dn.xlarge $0.526/hr = $379/mo",
        monthly_saving="$1,824/mo (83% savings)",
        technique="FP16 Conversion + TensorRT",
    ),
    CostScenario(
        scenario="Always-on → Auto-scale",
        before="3x g4dn.xlarge 24/7 = $1,137/mo",
        after="0-3x g4dn.xlarge (avg 30%) = $341/mo",
        monthly_saving="$796/mo (70% savings)",
        technique="Kubernetes HPA + Scale to Zero",
    ),
    CostScenario(
        scenario="On-demand → Spot",
        before="g4dn.xlarge on-demand $0.526/hr",
        after="g4dn.xlarge spot ~$0.16/hr",
        monthly_saving="$264/mo (70% savings)",
        technique="Spot Instance + Fallback On-demand",
    ),
    CostScenario(
        scenario="Cloud → Edge",
        before="Cloud Inference $500/mo",
        after="Edge Device (one-time $200)",
        monthly_saving="$500/mo ongoing (after payback)",
        technique="ONNX Runtime + Edge Device (Jetson/RPi)",
    ),
]

# Print each scenario as a blank line followed by five labelled lines.
print("=== Cost Scenarios ===")
for c in scenarios:
    print(
        f"\n  [{c.scenario}]",
        f"    Before: {c.before}",
        f"    After: {c.after}",
        f"    Saving: {c.monthly_saving}",
        f"    Technique: {c.technique}",
        sep="\n",
    )

เคล็ดลับ

  • INT8: เริ่ม Dynamic Quantization ง่ายสุด ลดต้นทุนทันที
  • FP16: ใช้ FP16 บน GPU แทบไม่สูญเสีย Accuracy
  • TensorRT: ใช้ TensorRT EP เร็วสุดบน NVIDIA GPU
  • Scale Zero: ตั้ง Auto-scale to Zero เมื่อไม่มี Traffic
  • Benchmark: วัด Latency Throughput Accuracy ก่อนและหลัง Optimize

ONNX Runtime คืออะไร

ONNX Runtime คือ High-performance Inference Engine แบบ Open Source จาก Microsoft สำหรับรันโมเดลใน ONNX Format บน CPU, GPU และ NPU รองรับ Quantization (INT8, FP16), Graph Optimization และ Execution Provider หลายแบบ

แนะนำเพิ่มเติม — อีบุ๊กการลงทุน SiamCafeBook

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: GCP Cloud Run Troubleshooting แก้ปัญหา

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: GCP Cloud Run Tech Conference 2026

XM Legend · เทรดเดอร์ & ผู้สอน Forex 13 ปี

ผู้ก่อตั้ง SiamCafe ตั้งแต่ปี 1997 · เทรดเดอร์สาย Forex มากกว่า 13 ปี ได้รับการยกย่องเป็น XM Legend · แบ่งปันความรู้ Forex, ไอที, AI และการเทรด จากประสบการณ์จริงในตลาดจริง