
ONNX Runtime Tech Conference 2026 —
ONNX Runtime 2026

ONNX Runtime Tech Conference: Model Optimization, Cross-platform Inference, Hardware Acceleration, Edge Deployment, LLM/GenAI, Quantization, Production
| Feature | ONNX RT 1.x | ONNX RT 2.0 | Impact |
|---|---|---|---|
| LLM Support | Basic | PagedAttention KV-Cache | 3-5x faster LLM inference |
| Quantization | INT8 FP16 | INT4 AWQ GPTQ | 50-75% model size reduction |
| Web GPU | WASM only | WebGPU backend | 10x faster in browser |
| Mobile NPU | Limited | QNN Qualcomm NPU | Native NPU acceleration |
| GenAI Pipeline | Manual | Built-in text generation | Simplified LLM deployment |
| Olive Toolchain | Separate tool | Integrated optimization | One-click model optimization |
Model Conversion and Optimization
=== ONNX Model Export and Optimization ===
PyTorch to ONNX
import torch
import onnx
# Build the model, restore its saved weights and put it in eval mode.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = MyModel()
state_dict = torch.load("model.pth")
model.load_state_dict(state_dict)
model.eval()

# Example input handed to the exporter alongside the model.
dummy_input = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    opset_version=17,
    input_names=["input"],
    output_names=["output"],
    # Axis 0 of both tensors is declared dynamic under the name "batch_size".
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
)
Optimize with ONNX Runtime
import onnxruntime as ort
from onnxruntime.transformers import optimizer
# Settings passed to the transformer optimizer for a BERT-style model.
bert_settings = {
    "model_type": "bert",
    "num_heads": 12,
    "hidden_size": 768,
    "optimization_level": 99,
}

# Optimize the exported graph and write the result to a new file.
optimized = optimizer.optimize_model("model.onnx", **bert_settings)
optimized.save_model_to_file("model_optimized.onnx")
Quantize INT8
from onnxruntime.quantization import quantize_dynamic, QuantType
# Dynamically quantize the optimized model, using QInt8 as the weight type.
source_model = "model_optimized.onnx"
quantized_model = "model_int8.onnx"
quantize_dynamic(source_model, quantized_model, weight_type=QuantType.QInt8)
Olive Toolchain (automated)
olive run --config olive_config.json
{
  "input_model": {
    "type": "PyTorchModel",
    "model_path": "model.pth"
  },
  "systems": {
    "local": {
      "type": "LocalSystem",
      "accelerators": [
        {"device": "gpu"}
      ]
    }
  },
  "passes": {
    "onnx_conversion": {
      "type": "OnnxConversion",
      "opset": 17
    },
    "transformer_opt": {
      "type": "OrtTransformersOptimization",
      "model_type": "bert"
    },
    "quantization": {
      "type": "OnnxDynamicQuantization",
      "weight_type": "QInt8"
    }
  }
}
from dataclasses import dataclass
@dataclass
class ModelBenchmark:
    """Before/after optimization figures for a single model.

    Every field is a pre-formatted display string (for example "438 MB" or
    "12ms"), so the values are meant for printing rather than arithmetic.
    """

    model: str  # model name, e.g. "BERT-base"
    original_size: str  # size before optimization
    optimized_size: str  # size after optimization, e.g. "110 MB (INT8)"
    latency_original: str  # latency before optimization
    latency_optimized: str  # latency after optimization
    accuracy_drop: str  # accuracy loss, e.g. "< 0.5%"
# Sample before/after figures (size, latency, accuracy loss) per model.
benchmarks = [
    ModelBenchmark("BERT-base", "438 MB", "110 MB (INT8)", "12ms", "4ms", "< 0.5%"),
    ModelBenchmark("ResNet-50", "98 MB", "25 MB (INT8)", "8ms", "2ms", "< 0.3%"),
    ModelBenchmark("GPT-2 Small", "548 MB", "137 MB (INT8)", "45ms", "15ms", "< 1%"),
    ModelBenchmark("Whisper Small", "967 MB", "242 MB (INT8)", "800ms", "250ms", "< 0.5%"),
    ModelBenchmark("LLaMA 7B", "13 GB", "3.5 GB (INT4)", "2000ms", "500ms", "< 2%"),
]

print("=== Model Benchmarks ===")
for b in benchmarks:
    # All three lines belong to the loop body: one short report per model.
    print(f" [{b.model}] Size: {b.original_size} → {b.optimized_size}")
    print(f" Latency: {b.latency_original} → {b.latency_optimized}")
    print(f" Accuracy drop: {b.accuracy_drop}")
Cross-platform Inference

=== ONNX Runtime on Multiple Platforms ===
Python Inference
import onnxruntime as ort
import numpy as np
# Execution providers in the order they are handed to the session.
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", providers=providers)

# Feed one random float32 tensor to the model's first declared input.
input_name = session.get_inputs()[0].name
sample = np.random.randn(1, 3, 224, 224).astype(np.float32)
result = session.run(None, {input_name: sample})
C++ Inference
#include <onnxruntime_cxx_api.h>
// Runtime environment, created with warning-level logging and the label "inference".
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "inference");
// Session options; the graph optimization level is set to ORT_ENABLE_ALL.
Ort::SessionOptions options;
options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
// Construct the session for "model.onnx" from the environment and options above.
Ort::Session session(env, "model.onnx", options);
JavaScript (Web)
import * as ort from 'onnxruntime-web';
// Execution providers requested for the browser session, in the order given.
const sessionOptions = { executionProviders: ['webgpu', 'wasm'] };
const session = await ort.InferenceSession.create('model.onnx', sessionOptions);

// Wrap inputData (defined outside this snippet) as a float32 tensor of shape [1, 3, 224, 224].
const inputTensor = new ort.Tensor('float32', inputData, [1, 3, 224, 224]);
const feeds = { input: inputTensor };
const results = await session.run(feeds);
Mobile (React Native)
import { InferenceSession } from 'onnxruntime-react-native';
// Create an inference session from modelPath (defined outside this snippet).
const session = await InferenceSession.create(modelPath);
@dataclass
class PlatformSupport:
    """One row of the platform-support table, held as display strings."""

    platform: str  # platform or language binding, e.g. "Python"
    package: str  # package name(s) for that platform
    providers: str  # space-separated provider names, e.g. "WebGPU WASM"
    use_case: str  # short description of the typical use
# Platform-support table: package, providers and use case per platform.
platforms = [
    PlatformSupport(
        "Python",
        "onnxruntime / onnxruntime-gpu",
        "CUDA TensorRT OpenVINO DirectML",
        "Server inference, batch processing",
    ),
    PlatformSupport("C/C++", "onnxruntime C API", "CUDA TensorRT CoreML", "Embedded, high-performance"),
    PlatformSupport("JavaScript Web", "onnxruntime-web", "WebGPU WASM", "Browser-based inference"),
    PlatformSupport("Node.js", "onnxruntime-node", "CPU CUDA", "Server-side JS"),
    PlatformSupport("iOS", "onnxruntime-objc", "CoreML Neural Engine", "Mobile AI apps"),
    PlatformSupport("Android", "onnxruntime-android", "NNAPI QNN", "Mobile AI apps"),
    PlatformSupport("React Native", "onnxruntime-react-native", "CoreML NNAPI", "Cross-platform mobile"),
]

print("\n=== Platform Support ===")
for p in platforms:
    # All three lines belong to the loop body: one short report per platform.
    print(f" [{p.platform}] Package: {p.package}")
    print(f" Providers: {p.providers}")
    print(f" Use case: {p.use_case}")
Production Deployment
# === Production Deployment Patterns ===
@dataclass
class DeployPattern:
    """One production deployment pattern, described with display strings."""

    pattern: str  # pattern name, e.g. "REST API (FastAPI)"
    architecture: str  # components involved, e.g. "FastAPI + ONNX Runtime + CUDA"
    latency: str  # latency figure as text, e.g. "5-20ms"
    throughput: str  # throughput figure as text, e.g. "100-500 req/s per GPU"
    best_for: str  # the workload this pattern suits
# Deployment patterns with their architecture, latency, throughput and fit.
patterns = [
    DeployPattern(
        "REST API (FastAPI)", "FastAPI + ONNX Runtime + CUDA",
        "5-20ms", "100-500 req/s per GPU", "Standard ML API",
    ),
    DeployPattern(
        "gRPC (Triton)", "Triton Inference Server + ONNX",
        "2-10ms", "500-5000 req/s per GPU", "High-throughput",
    ),
    DeployPattern(
        "Batch Processing", "Spark + ONNX Runtime",
        "Minutes", "Millions/batch", "Offline scoring",
    ),
    DeployPattern(
        "Edge (Mobile)", "ONNX Runtime Mobile + INT8",
        "10-50ms", "20-60 fps", "On-device inference",
    ),
    DeployPattern(
        "Browser (WebGPU)", "onnxruntime-web + WebGPU",
        "20-100ms", "10-30 fps", "Client-side AI",
    ),
    DeployPattern(
        "Serverless", "AWS Lambda + ONNX Runtime",
        "50-500ms (cold start)", "Concurrent scaling", "Variable traffic",
    ),
]

print("Deployment Patterns:")
for d in patterns:
    # All four lines belong to the loop body: one short report per pattern.
    print(f" [{d.pattern}]")
    print(f" Arch: {d.architecture}")
    print(f" Latency: {d.latency} | Throughput: {d.throughput}")
    print(f" Best for: {d.best_for}")
# Monitoring checklist: metric name -> what to track or target for it.
monitoring = {
    "Latency p50/p95/p99": "Track percentile latency per endpoint",
    "Throughput": "Requests per second per instance",
    "GPU Utilization": "Target 60-80% for cost efficiency",
    "Memory Usage": "Track GPU and CPU memory",
    "Error Rate": "< 0.1% target",
    "Model Version": "Track which version is serving",
    "Accuracy Drift": "Compare predictions vs actuals weekly",
}

# Plain string: the header has no placeholders, so no f-prefix is needed.
print("\n\nMonitoring Checklist:")
for k, v in monitoring.items():
    print(f" [{k}]: {v}")
เคล็ดลับ
- Olive: ใช้ Olive Toolchain Optimize อัตโนมัติ ไม่ต้องทำมือ
- INT8: เริ่มด้วย INT8 Quantization ได้ 3-4x speedup แทบไม่เสีย Accuracy
- Dynamic: ใช้ Dynamic Axes สำหรับ Variable Batch Size
- Provider: ทดสอบทุก Execution Provider เลือกที่เร็วสุดบน Hardware
- Profile: ใช้ ONNX Runtime Profiler หาจุด Bottleneck
ONNX Runtime คืออะไร
ONNX Runtime คือ Inference Engine แบบ Open Source จาก Microsoft สำหรับรันโมเดลใน ONNX Format ที่แปลงมาจาก PyTorch หรือ TensorFlow ทำงานแบบ Cross-platform บน Windows, Linux, macOS, iOS, Android และ Web และรองรับ CUDA, TensorRT, DirectML และ OpenVINO