TensorRT Optimization
TensorRT คือ SDK จาก NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU ด้วย Layer Fusion, Precision แบบ FP16/INT8 และ Kernel Auto-tuning ทำให้เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX
Kubernetes Pod Scheduling คือการที่ Scheduler เลือก Node ที่เหมาะสมตาม Resource Requests/Limits, Node Affinity และ Taints/Tolerations โดยมี NVIDIA Device Plugin เป็นตัวจัดการ GPU Resources
TensorRT Engine Build
# tensorrt_build.py — TensorRT Engine Build Pipeline
# pip install tensorrt torch onnx onnxruntime
import os
from dataclasses import dataclass
from typing import List, Dict, Optional
@dataclass
class TRTConfig:
    """Build settings for one TensorRT engine."""
    model_path: str  # path to the source ONNX model
    precision: str  # fp32, fp16, int8
    max_batch_size: int  # largest batch the engine must support
    workspace_size: int  # MB — builder workspace memory limit
    calibration_data: str = ""  # calibration dataset dir (required for int8 only)
    dynamic_shapes: bool = False  # build with dynamic input shapes when True
class TensorRTBuilder:
    """TensorRT engine builder and benchmarker.

    The real TensorRT / PyTorch calls are kept as reference comments so the
    pipeline runs as a GPU-free simulation; the printed workflow is unchanged.
    """

    # Simulated benchmark numbers per precision mode; hoisted to a class
    # constant so `benchmark` does not rebuild the table on every call.
    _SIM_RESULTS: Dict[str, Dict[str, float]] = {
        "fp32": {"latency_ms": 15.2, "throughput_fps": 65},
        "fp16": {"latency_ms": 6.1, "throughput_fps": 163},
        "int8": {"latency_ms": 3.8, "throughput_fps": 263},
    }

    def __init__(self, config: TRTConfig):
        # Engine build settings (model path, precision, batch, workspace).
        self.config = config

    def export_onnx(self, pytorch_model_path: str, output_path: str) -> None:
        """Export a PyTorch model to ONNX (reference implementation in comments)."""
        # import torch
        # model = torch.load(pytorch_model_path)
        # model.eval()
        # dummy_input = torch.randn(1, 3, 640, 640)
        # torch.onnx.export(
        #     model, dummy_input, output_path,
        #     opset_version=17,
        #     input_names=["images"],
        #     output_names=["output"],
        #     dynamic_axes={"images": {0: "batch"}, "output": {0: "batch"}}
        # )
        print(f" Exported ONNX: {output_path}")

    def build_engine(self, onnx_path: str, engine_path: str) -> None:
        """Build a TensorRT engine from an ONNX file (reference in comments)."""
        # import tensorrt as trt
        # logger = trt.Logger(trt.Logger.INFO)
        # builder = trt.Builder(logger)
        # network = builder.create_network(
        #     1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        # parser = trt.OnnxParser(network, logger)
        #
        # with open(onnx_path, 'rb') as f:
        #     parser.parse(f.read())
        #
        # config = builder.create_builder_config()
        # config.set_memory_pool_limit(
        #     trt.MemoryPoolType.WORKSPACE,
        #     self.config.workspace_size << 20)
        #
        # if self.config.precision == "fp16":
        #     config.set_flag(trt.BuilderFlag.FP16)
        # elif self.config.precision == "int8":
        #     config.set_flag(trt.BuilderFlag.INT8)
        #     # Set calibrator for INT8
        #
        # engine = builder.build_serialized_network(network, config)
        # with open(engine_path, 'wb') as f:
        #     f.write(engine)
        print(f" Built TensorRT Engine: {engine_path}")
        print(f" Precision: {self.config.precision}")
        print(f" Max Batch: {self.config.max_batch_size}")
        print(f" Workspace: {self.config.workspace_size}MB")

    def benchmark(self, engine_path: str, iterations: int = 100) -> Dict[str, float]:
        """Print simulated benchmark numbers and return the metrics.

        Unknown precisions fall back to the fp32 numbers. Returns a copy of
        the metrics dict (previously None) so callers can consume the results
        programmatically; printed output is unchanged.
        """
        print(f"\n Benchmark: {engine_path}")
        print(f" Iterations: {iterations}")
        r = self._SIM_RESULTS.get(self.config.precision, self._SIM_RESULTS["fp32"])
        print(f" Latency: {r['latency_ms']}ms")
        print(f" Throughput: {r['throughput_fps']} FPS")
        return dict(r)
# Build pipeline: produce and benchmark one engine per precision mode.
configs = [
    TRTConfig("yolov8m.onnx", "fp32", 16, 4096),
    TRTConfig("yolov8m.onnx", "fp16", 16, 4096),
    TRTConfig("yolov8m.onnx", "int8", 16, 4096, "calibration_data/"),
]
print("TensorRT Build Pipeline:")
for cfg in configs:
    engine_file = f"yolov8m_{cfg.precision}.engine"
    trt_builder = TensorRTBuilder(cfg)
    trt_builder.build_engine(cfg.model_path, engine_file)
    trt_builder.benchmark(engine_file)
# trtexec Commands — reference invocations for each build/benchmark scenario.
trtexec_commands = {
    "FP32": "trtexec --onnx=model.onnx --saveEngine=model_fp32.engine",
    "FP16": "trtexec --onnx=model.onnx --saveEngine=model_fp16.engine --fp16",
    "INT8": "trtexec --onnx=model.onnx --saveEngine=model_int8.engine --int8 --calib=calibration.cache",
    "Dynamic": "trtexec --onnx=model.onnx --minShapes=input:1x3x640x640 --optShapes=input:8x3x640x640 --maxShapes=input:16x3x640x640",
    "Benchmark": "trtexec --loadEngine=model_fp16.engine --batch=8 --iterations=1000",
}
# Fixed: dropped the f-prefix on a string with no placeholders (ruff F541).
print("\ntrtexec Commands:")
for name, cmd in trtexec_commands.items():
    print(f" [{name}]")
    print(f" {cmd}")
Kubernetes GPU Scheduling
# k8s_gpu_scheduling.py — Kubernetes GPU Pod Scheduling
# 1. NVIDIA GPU Operator Installation
# helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
# helm install gpu-operator nvidia/gpu-operator \
# --namespace gpu-operator --create-namespace
# 2. GPU Pod Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: tensorrt-inference
# spec:
# replicas: 3
# selector:
# matchLabels:
# app: tensorrt-inference
# template:
# metadata:
# labels:
# app: tensorrt-inference
# spec:
# nodeSelector:
# nvidia.com/gpu.product: NVIDIA-A10G
# tolerations:
# - key: nvidia.com/gpu
# operator: Exists
# effect: NoSchedule
# containers:
# - name: inference
# image: nvcr.io/nvidia/tensorrt:23.12-py3
# command: ["python3", "server.py"]
# ports:
# - containerPort: 8080
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: 8Gi
# cpu: 4
# requests:
# nvidia.com/gpu: 1
# memory: 4Gi
# cpu: 2
# volumeMounts:
# - name: model-storage
# mountPath: /models
# volumes:
# - name: model-storage
# persistentVolumeClaim:
# claimName: model-pvc
# 3. GPU HPA (Custom Metrics)
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
# name: tensorrt-hpa
# spec:
# scaleTargetRef:
# apiVersion: apps/v1
# kind: Deployment
# name: tensorrt-inference
# minReplicas: 2
# maxReplicas: 10
# metrics:
# - type: Pods
# pods:
# metric:
# name: gpu_utilization
# target:
# type: AverageValue
# averageValue: "70"
# - type: Pods
# pods:
# metric:
# name: inference_latency_p99
# target:
# type: AverageValue
# averageValue: "50m"
from dataclasses import dataclass
from typing import List
@dataclass
class GPUNode:
    """A schedulable GPU node in the cluster."""
    name: str  # Kubernetes node name
    gpu_type: str  # GPU product, e.g. "A10G", "T4", "A100"
    gpu_count: int  # total GPUs installed on the node
    gpu_memory: str  # per-GPU memory, e.g. "24GB"
    available: int  # GPUs currently unallocated
class GPUScheduler:
    """In-memory GPU scheduling strategy: tracks nodes and picks placements."""

    def __init__(self):
        # Registered cluster nodes, in insertion order.
        self.nodes: List[GPUNode] = []

    def add_node(self, node: GPUNode) -> None:
        """Register a node with the scheduler."""
        self.nodes.append(node)

    def schedule(self, required_gpu: int, preferred_type: str = "") -> "GPUNode | None":
        """Pick the best node with at least ``required_gpu`` free GPUs.

        Nodes whose type contains ``preferred_type`` are preferred when any
        exist; otherwise all capable nodes are considered. Ties resolve to
        the node with the most free GPUs. Returns None when no node has
        enough capacity.
        """
        candidates = [n for n in self.nodes if n.available >= required_gpu]
        if preferred_type:
            preferred = [n for n in candidates if preferred_type in n.gpu_type]
            if preferred:
                candidates = preferred
        # Guard clause instead of trailing `return None` after an if-block.
        if not candidates:
            return None
        return max(candidates, key=lambda n: n.available)

    def show_status(self) -> None:
        """Print a per-node utilization summary (output format unchanged)."""
        # Fixed: placeholder-free f-strings replaced with plain expressions
        # (ruff F541); printed bytes are identical.
        print("\n" + "=" * 55)
        print("GPU Cluster Status")
        print("=" * 55)
        total_gpus = sum(n.gpu_count for n in self.nodes)
        available = sum(n.available for n in self.nodes)
        print(f" Total GPUs: {total_gpus} | Available: {available}")
        for node in self.nodes:
            used = node.gpu_count - node.available
            bar = "#" * used + "." * node.available
            print(f" {node.name:<20} {node.gpu_type:<12} [{bar}] {used}/{node.gpu_count}")
# Build the demo cluster, show utilization, then request one A10G GPU.
scheduler = GPUScheduler()
nodes = [
    GPUNode("gpu-node-01", "A10G", 4, "24GB", 2),
    GPUNode("gpu-node-02", "A10G", 4, "24GB", 1),
    GPUNode("gpu-node-03", "T4", 2, "16GB", 2),
    GPUNode("gpu-node-04", "A100", 8, "80GB", 5),
]
for gpu_node in nodes:
    scheduler.add_node(gpu_node)
scheduler.show_status()
placement = scheduler.schedule(1, "A10G")
if placement is not None:
    print(f"\n Scheduled on: {placement.name} ({placement.gpu_type})")
Inference Server
# inference_server.py — TensorRT Inference Server
# NVIDIA Triton Inference Server
# 1. Triton Model Repository Structure
# model_repository/
# ├── yolov8_fp16/
# │ ├── config.pbtxt
# │ ├── 1/
# │ │ └── model.plan (TensorRT Engine)
# │ └── labels.txt
# └── yolov8_ensemble/
# ├── config.pbtxt
# └── 1/
# 2. config.pbtxt
# name: "yolov8_fp16"
# platform: "tensorrt_plan"
# max_batch_size: 16
# input [
# { name: "images" data_type: TYPE_FP16 dims: [3, 640, 640] }
# ]
# output [
# { name: "output" data_type: TYPE_FP16 dims: [84, 8400] }
# ]
# instance_group [
# { count: 2 kind: KIND_GPU gpus: [0] }
# ]
# dynamic_batching {
# preferred_batch_size: [4, 8, 16]
# max_queue_delay_microseconds: 100
# }
# 3. Start Triton
# docker run --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 \
# -v /models:/models \
# nvcr.io/nvidia/tritonserver:23.12-py3 \
# tritonserver --model-repository=/models
# Feature catalogue for NVIDIA Triton Inference Server (descriptions in Thai).
triton_features = {
    "Dynamic Batching": "รวมหลาย Requests เป็น Batch อัตโนมัติ",
    "Model Ensemble": "Pipeline หลาย Models ต่อกัน",
    "Concurrent Execution": "รัน Model หลายตัวบน GPU เดียว",
    "Model Versioning": "หลาย Version ของ Model พร้อมกัน",
    "Metrics": "Prometheus Metrics: Latency, Throughput, Queue",
    "gRPC + HTTP": "รองรับทั้ง gRPC และ HTTP API",
    "Multi-GPU": "กระจาย Inference ข้าม GPU",
}
print("Triton Inference Server Features:")
# Single joined write produces byte-identical output to per-line prints.
print("\n".join(f" {label}: {detail}" for label, detail in triton_features.items()))
# Optimization Comparison — measured on YOLOv8m with an A10G GPU.
comparison = {
    "PyTorch (FP32)": {"latency": "15ms", "throughput": "65 FPS", "memory": "2.1GB"},
    "ONNX Runtime": {"latency": "10ms", "throughput": "100 FPS", "memory": "1.8GB"},
    "TensorRT FP16": {"latency": "6ms", "throughput": "165 FPS", "memory": "1.2GB"},
    "TensorRT INT8": {"latency": "4ms", "throughput": "260 FPS", "memory": "0.8GB"},
    "Triton + TRT FP16": {"latency": "5ms", "throughput": "200 FPS", "memory": "1.2GB"},
}
# Fixed: dropped the f-prefix on a string with no placeholders (ruff F541).
print("\n\nOptimization Comparison (YOLOv8m, A10G):")
for method, metrics in comparison.items():
    print(f" {method:<22} Latency: {metrics['latency']:<6} FPS: {metrics['throughput']:<8} Mem: {metrics['memory']}")
Best Practices
- FP16 ก่อน: เริ่มจาก FP16 ก่อน INT8 ถ้า Accuracy ยอมรับได้
- Dynamic Batching: ใช้ Triton Dynamic Batching เพิ่ม Throughput
- GPU Taints: ใช้ Taints/Tolerations สงวน GPU Nodes
- Node Affinity: เลือก GPU Type ที่เหมาะกับ Workload
- Model Versioning: ใช้ Triton Model Versioning อัปเดตไม่ Downtime
- Monitoring: ติดตาม GPU Utilization Latency Queue Length
TensorRT คืออะไร
NVIDIA SDK Optimize Deep Learning Inference GPU Layer Fusion Precision FP16 INT8 Kernel Auto-tuning เร็วขึ้น 2-5 เท่า PyTorch TensorFlow ONNX
Pod Scheduling บน Kubernetes คืออะไร
Scheduler เลือก Node เหมาะสม Resource Requests Limits Node Affinity Taints Tolerations GPU Availability NVIDIA Device Plugin จัดการ GPU
TensorRT FP16 กับ INT8 ต่างกันอย่างไร
FP16 ลดความละเอียดจาก 32-bit เป็น 16-bit เร็วขึ้น 2-3 เท่า โดยสูญเสีย Accuracy น้อย ส่วน INT8 ลดเป็น 8-bit เร็วขึ้น 3-5 เท่า แต่สูญเสีย Accuracy มากกว่า และต้องใช้ Calibration Dataset ในการ Quantize
GPU Scheduling ใน Kubernetes ทำอย่างไร
NVIDIA GPU Operator Device Plugin resources.limits nvidia.com/gpu Node Affinity เลือก GPU Node Taints Tolerations สงวน GPU Nodes Inference
สรุป
TensorRT Optimize Inference FP16 INT8 เร็วขึ้น 2-5 เท่า Kubernetes GPU Scheduling NVIDIA Device Plugin Node Affinity Taints Triton Inference Server Dynamic Batching Model Versioning Monitoring GPU Utilization
