SiamCafe · Blog
ONNX Runtime กับ Service Mesh Setup — วิธีใช้
บทความ

ONNX Runtime กับ Service Mesh Setup — วิธีใช้

เผยแพร่ 28 พฤษภาคม 2569

ONNX Runtime สำหรับ ML Inference

ONNX Runtime กับ Service Mesh Setup — วิธีใช้

ONNX Runtime เป็น Inference Engine จาก Microsoft ที่ออกแบบมาให้รัน ML Models ได้เร็วที่สุดบนทุก Platform รองรับ ONNX Format ที่เป็นมาตรฐานกลางระหว่าง ML Frameworks ทำให้ Train ด้วย PyTorch แล้วรันด้วย ONNX Runtime ได้

Service Mesh เช่น Istio ช่วยจัดการ ML Inference Services โดยอัตโนมัติ ทำ Traffic Management ระหว่าง Model Versions, Load Balancing, mTLS, Retry/Circuit Breaker และ Observability โดยไม่ต้องเขียน Code เพิ่ม

ONNX Model Conversion และ Inference

# onnx_inference.py — ONNX Runtime ML Inference Service
# pip install onnxruntime numpy torch onnx flask

import onnxruntime as ort
import numpy as np
import torch
import torch.nn as nn
import onnx
import json
import time
from flask import Flask, request, jsonify

# === 1. Model Conversion ===

class SimpleModel(nn.Module):
    """Example PyTorch classifier used to demonstrate the ONNX export.

    The network is three linear layers with ReLU activations in between,
    a dropout layer after the first activation, and a softmax over
    dimension 1 as the final step.
    """

    def __init__(self, input_size=10, hidden_size=64, output_size=3):
        super().__init__()
        # The position of each module inside the Sequential determines its
        # parameter names ("layers.<index>.weight"), so the order is fixed.
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Softmax(dim=1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the softmax output."""
        probabilities = self.layers(x)
        return probabilities

def export_to_onnx(model, input_shape, output_path):
    """Convert a PyTorch model to an ONNX file and validate the result.

    Args:
        model: PyTorch module to export; it is put into eval mode first.
        input_shape: Shape of the random dummy tensor handed to the exporter.
        output_path: Path the ONNX file is written to.
    """
    model.eval()
    sample = torch.randn(*input_shape)

    # Axis 0 of both the input and the output is declared dynamic under the
    # name "batch_size".
    export_options = {
        "input_names": ["input"],
        "output_names": ["output"],
        "dynamic_axes": {
            "input": {0: "batch_size"},
            "output": {0: "batch_size"},
        },
        "opset_version": 17,
    }
    torch.onnx.export(model, sample, output_path, **export_options)

    # Validate: reload the written file and run the ONNX checker on it.
    onnx.checker.check_model(onnx.load(output_path))
    print(f"Model exported: {output_path}")

# === 2. ONNX Runtime Inference ===

class ONNXInferenceService:
    """Thin wrapper around an ONNX Runtime ``InferenceSession``.

    Only the first model input and the first model output are used.
    """

    def __init__(self, model_path, providers=None):
        """Create the session for ``model_path`` and print its basic info.

        Args:
            model_path: Path of the ONNX model file to load.
            providers: Execution providers handed to ONNX Runtime. When
                omitted, CUDA is listed first and CPU second.
        """
        if providers is None:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]

        self.session = ort.InferenceSession(model_path, providers=providers)
        # Names of the first input / first output, reused by every predict().
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

        # Model info
        print(f"Model: {model_path}")
        print(f"Provider: {self.session.get_providers()}")
        print(f"Input: {self.session.get_inputs()[0].shape}")
        print(f"Output: {self.session.get_outputs()[0].shape}")

    def predict(self, input_data):
        """Run one inference call and return the first output.

        Args:
            input_data: A list or tuple (converted to a float32 array), or
                any other object, which is passed to the session unchanged.

        Returns:
            The first element of the list returned by ``session.run``.
        """
        if isinstance(input_data, (list, tuple)):
            input_data = np.array(input_data, dtype=np.float32)
        outputs = self.session.run(
            [self.output_name],
            {self.input_name: input_data},
        )
        return outputs[0]

    def benchmark(self, input_shape, n=1000, warmup=50):
        """Measure ``predict`` latency on random float32 data and print it.

        Args:
            input_shape: Shape of the random input array.
            n: Number of timed iterations.
            warmup: Number of untimed calls made before timing starts.

        Returns:
            Dict with ``avg_ms``, ``p50_ms``, ``p99_ms`` and
            ``throughput_rps`` (1000 / average latency in ms).
        """
        data = np.random.randn(*input_shape).astype(np.float32)

        # Warm-up calls are not included in the statistics.
        for _ in range(warmup):
            self.predict(data)

        latencies = []
        for _ in range(n):
            start = time.perf_counter()
            self.predict(data)
            # perf_counter() is in seconds; store milliseconds.
            latencies.append((time.perf_counter() - start) * 1000)

        avg = np.mean(latencies)
        p50 = np.percentile(latencies, 50)
        p99 = np.percentile(latencies, 99)
        throughput = 1000 / avg
        print(f"\nBenchmark ({n} iterations):")
        print(f"  Avg: {avg:.2f}ms | P50: {p50:.2f}ms | P99: {p99:.2f}ms")
        print(f"  Throughput: {throughput:.0f} req/s")

        return {
            "avg_ms": float(avg),
            "p50_ms": float(p50),
            "p99_ms": float(p99),
            "throughput_rps": float(throughput),
        }

# === 3. Flask API ===

# Flask application object that the route decorators below attach to.
app = Flask(__name__)
# NOTE: the /predict and /info handlers read a module-level `service`.
# The line below is commented out, so `service` must be defined (for example
# by enabling it) before those endpoints are called.
# service = ONNXInferenceService("model.onnx")

@app.route("/health", methods=["GET"])
def health():
    """Return a static JSON body reporting the service as healthy."""
    status_body = {"status": "healthy"}
    return jsonify(status_body)

@app.route("/predict", methods=["POST"])
def predict():
    """Run inference on the posted JSON and return prediction plus latency.

    The request body must be a JSON object whose "input" field can be
    converted to a float32 array. A missing or unparsable body, a missing
    "input" field, or non-numeric input yields a 400 response with an
    "error" message rather than an unhandled exception.

    Returns:
        JSON with "prediction" (nested list) and "latency_ms" (time spent
        in ``service.predict`` only, rounded to two decimals).
    """
    # silent=True makes get_json return None instead of raising on a
    # missing or invalid JSON body, so the check below can answer with 400.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "input" not in payload:
        return jsonify({
            "error": 'request body must be a JSON object with an "input" field',
        }), 400

    try:
        input_data = np.array(payload["input"], dtype=np.float32)
    except (TypeError, ValueError):
        return jsonify({"error": '"input" must be a numeric array'}), 400

    start = time.perf_counter()
    result = service.predict(input_data)
    latency = (time.perf_counter() - start) * 1000
    return jsonify({
        "prediction": result.tolist(),
        "latency_ms": round(latency, 2),
    })

@app.route("/info", methods=["GET"])
def info():
    """Return the model file name, session providers and first input shape."""
    session = service.session
    providers = session.get_providers()
    # The shape is stringified before being placed in the response.
    first_input_shape = str(session.get_inputs()[0].shape)
    details = {
        "model": "model.onnx",
        "providers": providers,
        "input_shape": first_input_shape,
    }
    return jsonify(details)

# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=8080)

Istio Service Mesh Configuration

ONNX Runtime กับ Service Mesh Setup — วิธีใช้
# === Istio Service Mesh สำหรับ ML Inference ===

# 1. Kubernetes Deployment
# k8s/deployment.yaml
# Runs three replicas of the v1 inference container in the "ml" namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference-v1
  namespace: ml
  labels:
    app: ml-inference
    version: v1
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ml-inference
      version: v1
  template:
    metadata:
      # Pod labels: "app" is what the Service selects on, "version" is what
      # the DestinationRule subsets select on.
      labels:
        app: ml-inference
        version: v1
      annotations:
        # Asks Istio to inject its sidecar proxy into these pods.
        sidecar.istio.io/inject: "true"
    spec:
      containers:
        - name: inference
          image: myregistry/ml-inference:v1
          ports:
            # Same port the Service forwards to and the probes call.
            - containerPort: 8080
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2"
              memory: "2Gi"
              # One GPU per pod; presumably requires the NVIDIA device plugin
              # on the cluster nodes — verify for your cluster.
              nvidia.com/gpu: "1"
          # Both probes call GET /health on port 8080; readiness starts
          # after 10s, liveness after 30s.
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
---
# 2. Service
# Exposes the inference pods inside the "ml" namespace on port 80 and
# forwards to container port 8080.
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
  namespace: ml
spec:
  ports:
    - port: 80
      targetPort: 8080
      name: http
  # The selector has no "version" label, so pods of every version that carry
  # app=ml-inference are endpoints of this Service; the per-version split is
  # done by the VirtualService/DestinationRule.
  selector:
    app: ml-inference
---
# 3. Istio VirtualService — Traffic Management
# Two HTTP routes for host "ml-inference":
#   1) requests carrying header "x-model-version: v2" go to subset v2 only;
#   2) all other requests are split 90% to subset v1 and 10% to subset v2.
# The subsets v1/v2 are defined in the DestinationRule of the same name.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: ml-inference
  namespace: ml
spec:
  hosts:
    - ml-inference
  http:
    # Route 1: explicit opt-in to v2 via request header (exact match).
    - match:
        - headers:
            x-model-version:
              exact: "v2"
      route:
        - destination:
            host: ml-inference
            subset: v2
    # Route 2: default weighted (canary) split.
    - route:
        - destination:
            host: ml-inference
            subset: v1
          weight: 90
        - destination:
            host: ml-inference
            subset: v2
          weight: 10
      # timeout and retries are set on this second route only; the header
      # route above does not declare them.
      # NOTE(review): 3 attempts x 2s per try is 6s, more than the 5s overall
      # timeout — confirm this is intended.
      timeout: 5s
      retries:
        attempts: 3
        perTryTimeout: 2s
        retryOn: 5xx, reset, connect-failure
---
# 4. DestinationRule — Load Balancing + Circuit Breaker
# Traffic policy for host "ml-inference" plus the v1/v2 subsets that the
# VirtualService routes to.
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
  name: ml-inference
  namespace: ml
spec:
  host: ml-inference
  trafficPolicy:
    # Connection-pool limits for TCP connections and HTTP requests.
    connectionPool:
      tcp:
        maxConnections: 100
      http:
        h2UpgradePolicy: DEFAULT
        http1MaxPendingRequests: 100
        http2MaxRequests: 1000
    # Outlier detection (per the Istio DestinationRule reference): a host is
    # ejected after 3 consecutive 5xx errors, hosts are analysed every 10s,
    # the base ejection time is 30s, and at most 50% of hosts may be ejected.
    outlierDetection:
      consecutive5xxErrors: 3
      interval: 10s
      baseEjectionTime: 30s
      maxEjectionPercent: 50
    loadBalancer:
      simple: LEAST_REQUEST
  # Subsets select pods by their "version" label.
  # NOTE(review): only a v1 Deployment is visible in this manifest; subset v2
  # presumably needs a matching Deployment labelled version: v2.
  subsets:
    - name: v1
      labels:
        version: v1
    - name: v2
      labels:
        version: v2

Observability Dashboard

# === Prometheus Metrics สำหรับ ONNX Runtime ===
# pip install prometheus-client

from prometheus_client import Counter, Histogram, Gauge, generate_latest
from flask import Response
import functools

# Metrics
# Number of inference requests, labelled by model, version and outcome.
REQUEST_COUNT = Counter(
    name="ml_inference_requests_total",
    documentation="Total inference requests",
    labelnames=["model", "version", "status"],
)

# Inference latency in seconds, with buckets from 1 ms up to 1 s.
REQUEST_LATENCY = Histogram(
    name="ml_inference_latency_seconds",
    documentation="Inference latency in seconds",
    labelnames=["model", "version"],
    buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0],
)

# Gauge per model/version; it is not set anywhere in the code shown here.
MODEL_LOADED = Gauge(
    name="ml_model_loaded",
    documentation="Whether model is loaded",
    labelnames=["model", "version"],
)

# Batch size distribution per model; it is not observed anywhere in the
# code shown here.
BATCH_SIZE = Histogram(
    name="ml_inference_batch_size",
    documentation="Inference batch size",
    labelnames=["model"],
    buckets=[1, 2, 4, 8, 16, 32, 64],
)

def track_inference(model_name, version):
    """Build a decorator that records inference metrics for a function.

    Each call of the decorated function is timed with ``REQUEST_LATENCY``
    and counted in ``REQUEST_COUNT`` with status "success" or "error".
    Exceptions raised by the function are counted and then re-raised.

    Args:
        model_name: Value for the "model" label.
        version: Value for the "version" label.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # The timer covers the call whether it succeeds or raises.
            with REQUEST_LATENCY.labels(model_name, version).time():
                try:
                    result = func(*args, **kwargs)
                except Exception:
                    REQUEST_COUNT.labels(model_name, version, "error").inc()
                    raise
                else:
                    # Counted outside the try so only failures of func
                    # itself are recorded as errors.
                    REQUEST_COUNT.labels(model_name, version, "success").inc()
                    return result
        return wrapper
    return decorator

# Metrics Endpoint
# @app.route("/metrics")
# def metrics():
#     return Response(generate_latest(), mimetype="text/plain")

# === Grafana Dashboard JSON ===
# One entry per dashboard panel: the panel title and its PromQL expression.
grafana_panels = [
    dict(title=title, expr=expr)
    for title, expr in (
        ("Request Rate", "rate(ml_inference_requests_total[5m])"),
        ("Latency P50", "histogram_quantile(0.5, rate(ml_inference_latency_seconds_bucket[5m]))"),
        ("Latency P99", "histogram_quantile(0.99, rate(ml_inference_latency_seconds_bucket[5m]))"),
        ("Error Rate", "rate(ml_inference_requests_total{status='error'}[5m])"),
        ("Batch Size Avg", "rate(ml_inference_batch_size_sum[5m]) / rate(ml_inference_batch_size_count[5m])"),
    )
]

# Print the panel list, one "title: expression" line per panel.
print("Grafana Dashboard Panels:")
for panel in grafana_panels:
    print("  " + panel["title"] + ": " + panel["expr"])

Best Practices

  • ONNX Opset: ใช้ Opset Version ล่าสุดที่ ONNX Runtime รองรับ เพื่อ Performance ดีที่สุด
  • Execution Providers: ใช้ CUDA EP สำหรับ GPU, TensorRT EP สำหรับ NVIDIA Optimization
  • Service Mesh mTLS: เปิด mTLS ระหว่าง Services เพื่อเข้ารหัสและป้องกัน Data ระหว่างการส่ง
  • Circuit Breaker: ตั้ง Circuit Breaker ป้องกัน Cascade Failure
  • Canary Deployment: ใช้ Istio Traffic Splitting ทดสอบ Model Version ใหม่
  • Metrics: ติดตาม Latency, Throughput, Error Rate สำหรับทุก Model Version

ONNX Runtime คืออะไร

ONNX Runtime คือ High-performance Inference Engine จาก Microsoft สำหรับรัน ML Models ในรูปแบบ ONNX รองรับ CPU, GPU และ NPU บนหลาย Platform และอาจเร็วกว่า Native Framework ได้ราว 2-3 เท่า ทั้งนี้ขึ้นอยู่กับ Model และ Hardware ที่ใช้