TensorRT Optimization กับ Pod Scheduling — วิธี
TensorRT Optimization

TensorRT คือ SDK ของ NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU โดยใช้ Layer Fusion, การลด Precision เป็น FP16/INT8 และ Kernel Auto-tuning ช่วยให้ Inference เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX
Kubernetes Pod Scheduling คือการเลือก Node ที่เหมาะสมให้กับ Pod โดยพิจารณาจาก Resource Requests/Limits, Node Affinity และ Taints/Tolerations ส่วน NVIDIA Device Plugin ทำให้ Pod สามารถขอใช้ GPU Resources ได้
อ่านเพิ่ม: ModSecurity WAF Monitoring และ Alerting | SiamCafe Blog · อ่านเพิ่ม: Medusa Commerce Batch Processing Pipeline | SiamCafe Blog · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog
TensorRT Engine Build
# tensorrt_build.py — TensorRT Engine Build Pipeline
# pip install tensorrt torch onnx onnxruntime
import os
from dataclasses import dataclass
from typing import List, Dict, Optional
@dataclass
class TRTConfig:
    """Settings for building one TensorRT engine.

    The first four fields are required and positional; the last two are
    optional.
    """

    model_path: str  # model file handed to the engine builder
    precision: str  # "fp32", "fp16" or "int8"
    max_batch_size: int
    workspace_size: int  # builder workspace limit, in MB
    calibration_data: str = ""  # presumably the INT8 calibration data location — confirm
    dynamic_shapes: bool = False  # not read by any code in this listing
class TensorRTBuilder:
    """Walk through the export -> build -> benchmark steps for one config.

    Every method only prints what it would do; the actual PyTorch/TensorRT
    calls are kept as commented reference code inside each method.
    """

    # Simulated benchmark numbers per precision (not measured by this code).
    _SIMULATED_RESULTS = {
        "fp32": {"latency_ms": 15.2, "throughput_fps": 65},
        "fp16": {"latency_ms": 6.1, "throughput_fps": 163},
        "int8": {"latency_ms": 3.8, "throughput_fps": 263},
    }

    def __init__(self, config: "TRTConfig"):
        """Store the build configuration used by the other methods."""
        self.config = config

    def export_onnx(self, pytorch_model_path: str, output_path: str) -> None:
        """Export a PyTorch model to ONNX (reference code is commented out).

        Args:
            pytorch_model_path: Path of the saved PyTorch model.
            output_path: Destination path of the ONNX file.
        """
        # import torch
        # model = torch.load(pytorch_model_path)
        # model.eval()
        # dummy_input = torch.randn(1, 3, 640, 640)
        # torch.onnx.export(
        #     model, dummy_input, output_path,
        #     opset_version=17,
        #     input_names=["images"],
        #     output_names=["output"],
        #     dynamic_axes={"images": {0: "batch"}, "output": {0: "batch"}},
        # )
        print(f" Exported ONNX: {output_path}")

    def build_engine(self, onnx_path: str, engine_path: str) -> None:
        """Build a TensorRT engine from ONNX (reference code is commented out).

        Prints the engine path together with the precision, max batch size
        and workspace size taken from ``self.config``.

        Args:
            onnx_path: Path of the ONNX model to parse.
            engine_path: Destination path of the serialized engine.
        """
        # import tensorrt as trt
        # logger = trt.Logger(trt.Logger.INFO)
        # builder = trt.Builder(logger)
        # network = builder.create_network(
        #     1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        # parser = trt.OnnxParser(network, logger)
        #
        # with open(onnx_path, "rb") as f:
        #     # parse() returns False on failure; surface the parser errors
        #     # instead of building from a half-populated network.
        #     if not parser.parse(f.read()):
        #         for i in range(parser.num_errors):
        #             print(parser.get_error(i))
        #         raise RuntimeError(f"Failed to parse ONNX: {onnx_path}")
        #
        # config = builder.create_builder_config()
        # config.set_memory_pool_limit(
        #     trt.MemoryPoolType.WORKSPACE,
        #     self.config.workspace_size << 20)  # MB -> bytes
        #
        # if self.config.precision == "fp16":
        #     config.set_flag(trt.BuilderFlag.FP16)
        # elif self.config.precision == "int8":
        #     config.set_flag(trt.BuilderFlag.INT8)
        #     # Set calibrator for INT8
        #
        # engine = builder.build_serialized_network(network, config)
        # # build_serialized_network() returns None when the build fails.
        # if engine is None:
        #     raise RuntimeError("TensorRT engine build failed")
        # with open(engine_path, "wb") as f:
        #     f.write(engine)
        print(f" Built TensorRT Engine: {engine_path}")
        print(f" Precision: {self.config.precision}")
        print(f" Max Batch: {self.config.max_batch_size}")
        print(f" Workspace: {self.config.workspace_size}MB")

    def benchmark(self, engine_path: str, iterations: int = 100) -> Dict[str, float]:
        """Print and return simulated benchmark numbers for the engine.

        Args:
            engine_path: Engine path, used only in the printed report.
            iterations: Iteration count, used only in the printed report.

        Returns:
            A new dict with ``latency_ms`` and ``throughput_fps`` for the
            configured precision. An unknown precision falls back to the
            fp32 numbers.
        """
        print(f"\n Benchmark: {engine_path}")
        print(f" Iterations: {iterations}")
        table = self._SIMULATED_RESULTS
        r = table.get(self.config.precision, table["fp32"])
        print(f" Latency: {r['latency_ms']}ms")
        print(f" Throughput: {r['throughput_fps']} FPS")
        # Return a copy so callers cannot mutate the shared table.
        return dict(r)
# Build Pipeline
# Build and benchmark one engine per precision for the same ONNX model.
configs = [
    TRTConfig("yolov8m.onnx", "fp32", 16, 4096),
    TRTConfig("yolov8m.onnx", "fp16", 16, 4096),
    TRTConfig("yolov8m.onnx", "int8", 16, 4096, "calibration_data/"),
]
print("TensorRT Build Pipeline:")
for config in configs:
    builder = TensorRTBuilder(config)
    # Name the engine after the model file so the name stays correct if a
    # config points at a different model (yields "yolov8m_<precision>.engine"
    # for the configs above).
    model_stem = os.path.splitext(os.path.basename(config.model_path))[0]
    engine_name = f"{model_stem}_{config.precision}.engine"
    builder.build_engine(config.model_path, engine_name)
    builder.benchmark(engine_name)
# trtexec Commands
# Ready-to-copy trtexec invocations, keyed by what each one demonstrates.
trtexec_commands = {
    "FP32": "trtexec --onnx=model.onnx --saveEngine=model_fp32.engine",
    "FP16": "trtexec --onnx=model.onnx --saveEngine=model_fp16.engine --fp16",
    "INT8": "trtexec --onnx=model.onnx --saveEngine=model_int8.engine --int8 --calib=calibration.cache",
    # Saves the engine so the benchmark below has a multi-batch profile to load.
    "Dynamic": "trtexec --onnx=model.onnx --saveEngine=model_dynamic.engine --minShapes=input:1x3x640x640 --optShapes=input:8x3x640x640 --maxShapes=input:16x3x640x640",
    # --batch only applies to implicit-batch engines; engines built from ONNX
    # are explicit-batch, so the batch size is selected with --shapes.
    "Benchmark": "trtexec --loadEngine=model_dynamic.engine --shapes=input:8x3x640x640 --iterations=1000",
}
print("\ntrtexec Commands:")
for name, cmd in trtexec_commands.items():
    print(f" [{name}]")
    print(f" {cmd}")
Kubernetes GPU Scheduling
k8s_gpu_scheduling.py — Kubernetes GPU Pod Scheduling
1. NVIDIA GPU Operator Installation
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm install gpu-operator nvidia/gpu-operator \
--namespace gpu-operator --create-namespace
2. GPU Pod Deployment
# Deployment running three TensorRT inference replicas, one GPU each.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tensorrt-inference
spec:
  replicas: 3
  selector:
    matchLabels:
      app: tensorrt-inference
  template:
    metadata:
      labels:
        app: tensorrt-inference
    spec:
      # Only schedule onto nodes labelled with this GPU product.
      nodeSelector:
        nvidia.com/gpu.product: NVIDIA-A10G
      # Allow placement on nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: inference
          image: nvcr.io/nvidia/tensorrt:23.12-py3
          command: ["python3", "server.py"]
          ports:
            - containerPort: 8080
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: 8Gi
              cpu: 4
            requests:
              nvidia.com/gpu: 1
              memory: 4Gi
              cpu: 2
          volumeMounts:
            - name: model-storage
              mountPath: /models
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง HTTP/3 QUIC Log Management ELK — จัดการ Log ด้วย
3. GPU HPA (Custom Metrics)
# HorizontalPodAutoscaler scaling the tensorrt-inference Deployment on two
# per-pod custom metrics.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tensorrt-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tensorrt-inference
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Pods
      pods:
        metric:
          name: gpu_utilization
        target:
          type: AverageValue
          averageValue: "70"
    - type: Pods
      pods:
        metric:
          name: inference_latency_p99
        target:
          type: AverageValue
          # "50m" is a Kubernetes quantity meaning 0.050 of the metric's unit
          # (50 ms if the metric is reported in seconds — confirm).
          averageValue: "50m"
from dataclasses import dataclass
from typing import List
@dataclass
class GPUNode:
    """One GPU node as seen by the scheduler."""

    name: str
    gpu_type: str  # GPU model name, e.g. "A10G"
    gpu_count: int  # total GPUs on the node
    gpu_memory: str  # memory size as a display string, e.g. "24GB"
    available: int  # GPUs currently free (used = gpu_count - available)
class GPUScheduler:
    """Choose a GPU node for a workload from an in-memory list of nodes."""

    def __init__(self):
        """Start with no registered nodes."""
        self.nodes: List["GPUNode"] = []

    def add_node(self, node: "GPUNode"):
        """Register *node* as a scheduling candidate."""
        self.nodes.append(node)

    def schedule(self, required_gpu: int, preferred_type: str = ""):
        """Select a suitable node for a workload.

        Args:
            required_gpu: Number of free GPUs the node must have.
            preferred_type: Preferred GPU type. It is a preference, not a
                requirement: when no node of that type has enough free
                GPUs, every node with enough free GPUs is considered.

        Returns:
            The candidate with the most free GPUs, or ``None`` when no node
            has ``required_gpu`` GPUs available. This only selects a node;
            it does not change the node's ``available`` count.
        """
        candidates = [n for n in self.nodes if n.available >= required_gpu]
        if preferred_type:
            # Compare whole type names: a substring test would let "A10"
            # match "A100" and pick the wrong hardware.
            wanted = preferred_type.casefold()
            preferred = [n for n in candidates if n.gpu_type.casefold() == wanted]
            if preferred:
                candidates = preferred
        return max(candidates, key=lambda n: n.available, default=None)

    def show_status(self):
        """Print cluster GPU totals and one usage bar per node."""
        rule = "=" * 55
        print(f"\n{rule}")
        print("GPU Cluster Status")
        print(rule)
        total_gpus = sum(n.gpu_count for n in self.nodes)
        available = sum(n.available for n in self.nodes)
        print(f" Total GPUs: {total_gpus} | Available: {available}")
        for node in self.nodes:
            used = node.gpu_count - node.available
            # "#" marks a GPU in use, "." a free one.
            bar = "#" * used + "." * node.available
            print(f" {node.name:<20} {node.gpu_type:<12} [{bar}] {used}/{node.gpu_count}")
# Demo: register a small cluster, print its status, then place a 1-GPU job
# that prefers an A10G node.
scheduler = GPUScheduler()
nodes = [
    GPUNode("gpu-node-01", "A10G", 4, "24GB", 2),
    GPUNode("gpu-node-02", "A10G", 4, "24GB", 1),
    GPUNode("gpu-node-03", "T4", 2, "16GB", 2),
    GPUNode("gpu-node-04", "A100", 8, "80GB", 5),
]
for node in nodes:
    scheduler.add_node(node)
scheduler.show_status()
best = scheduler.schedule(1, "A10G")
# schedule() returns None when no node has enough free GPUs.
if best is not None:
    print(f"\n Scheduled on: {best.name} ({best.gpu_type})")
Inference Server
inference_server.py — TensorRT Inference Server
NVIDIA Triton Inference Server
เนื้อหาเกี่ยวข้อง — อ่านต่อ: Go Fiber High Availability HA Setup
1. Triton Model Repository Structure
model_repository/
├── yolov8_fp16/
│   ├── config.pbtxt
│   ├── 1/
│   │   └── model.plan (TensorRT Engine)
│   └── labels.txt
└── yolov8_ensemble/
    ├── config.pbtxt
    └── 1/
2. config.pbtxt
# Triton model configuration for the "yolov8_fp16" model.
name: "yolov8_fp16"
# The model is served as a serialized TensorRT engine.
platform: "tensorrt_plan"
# With max_batch_size > 0, the dims below omit the batch dimension.
max_batch_size: 16
input [
{ name: "images" data_type: TYPE_FP16 dims: [3, 640, 640] }
]
output [
{ name: "output" data_type: TYPE_FP16 dims: [84, 8400] }
]
# Two execution instances of the model, both on GPU 0.
instance_group [
{ count: 2 kind: KIND_GPU gpus: [0] }
]
# Requests may wait in the queue up to 100 microseconds to form a batch of
# a preferred size.
dynamic_batching {
preferred_batch_size: [4, 8, 16]
max_queue_delay_microseconds: 100
}
3. Start Triton
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง LXC vs Docker เลือก Container Technology อะไรดี
docker run --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 \
-v /models:/models \
nvcr.io/nvidia/tritonserver:23.12-py3 \
tritonserver --model-repository=/models
# Triton Inference Server features mapped to a one-line description.
triton_features = {
    "Dynamic Batching": "รวมหลาย Requests เป็น Batch อัตโนมัติ",
    "Model Ensemble": "Pipeline หลาย Models ต่อกัน",
    "Concurrent Execution": "รัน Model หลายตัวบน GPU เดียว",
    "Model Versioning": "หลาย Version ของ Model พร้อมกัน",
    "Metrics": "Prometheus Metrics: Latency, Throughput, Queue",
    "gRPC + HTTP": "รองรับทั้ง gRPC และ HTTP API",
    "Multi-GPU": "กระจาย Inference ข้าม GPU",
}
print("Triton Inference Server Features:")
for feature, desc in triton_features.items():
    print(f" {feature}: {desc}")
Optimization Comparison
# Latency / throughput / memory figures per inference stack, as display
# strings for the report printed below.
comparison = {
    "PyTorch (FP32)": {"latency": "15ms", "throughput": "65 FPS", "memory": "2.1GB"},
    "ONNX Runtime": {"latency": "10ms", "throughput": "100 FPS", "memory": "1.8GB"},
    "TensorRT FP16": {"latency": "6ms", "throughput": "165 FPS", "memory": "1.2GB"},
    "TensorRT INT8": {"latency": "4ms", "throughput": "260 FPS", "memory": "0.8GB"},
    "Triton + TRT FP16": {"latency": "5ms", "throughput": "200 FPS", "memory": "1.2GB"},
}
print("\n\nOptimization Comparison (YOLOv8m, A10G):")
for method, metrics in comparison.items():
    print(f" {method:<22} Latency: {metrics['latency']:<6} FPS: {metrics['throughput']:<8} Mem: {metrics['memory']}")
Best Practices
- FP16 ก่อน: เริ่มจาก FP16 ก่อน INT8 ถ้า Accuracy ยอมรับได้
- Dynamic Batching: ใช้ Triton Dynamic Batching เพิ่ม Throughput
- GPU Taints: ใช้ Taints/Tolerations สงวน GPU Nodes
- Node Affinity: เลือก GPU Type ที่เหมาะกับ Workload
- Model Versioning: ใช้ Triton Model Versioning อัปเดตไม่ Downtime
- Monitoring: ติดตาม GPU Utilization Latency Queue Length
TensorRT คืออะไร
TensorRT เป็น SDK ของ NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU โดยใช้ Layer Fusion, การลด Precision เป็น FP16/INT8 และ Kernel Auto-tuning ช่วยให้ Inference เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX





