SiamCafe.net Blog
Technology

TensorRT Optimization Pod Scheduling

tensorrt optimization pod scheduling
TensorRT Optimization Pod Scheduling | SiamCafe Blog
2026-02-21· อ. บอม — SiamCafe.net· 8,076 คำ

TensorRT Optimization

TensorRT คือ SDK จาก NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU ด้วยเทคนิค Layer Fusion, การลด Precision เป็น FP16/INT8 และ Kernel Auto-tuning ทำให้ Inference เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX

Kubernetes Pod Scheduling คือกระบวนการเลือก Node ที่เหมาะสมให้แต่ละ Pod โดยพิจารณา Resource Requests/Limits, Node Affinity และ Taints/Tolerations ส่วน NVIDIA Device Plugin เป็นตัวทำให้ Kubernetes มองเห็นและจัดสรร GPU Resources ได้

TensorRT Engine Build

# tensorrt_build.py — TensorRT Engine Build Pipeline
# pip install tensorrt torch onnx onnxruntime

import os
from dataclasses import dataclass
from typing import List, Dict, Optional

@dataclass
class TRTConfig:
    """Configuration for building one TensorRT engine from an ONNX model."""

    model_path: str  # path to the source ONNX model file
    precision: str  # fp32, fp16, int8
    max_batch_size: int  # largest batch size the engine is built for
    workspace_size: int  # MB — builder workspace memory pool limit
    calibration_data: str = ""  # calibration sample directory (needed for int8)
    dynamic_shapes: bool = False  # build with dynamic input shapes when True

class TensorRTBuilder:
    """Build and benchmark TensorRT engines for a given TRTConfig.

    The real TensorRT / ONNX export calls are sketched in comments; the
    methods themselves only report what would happen (simulation).
    """

    # Simulated benchmark numbers per precision mode.
    _SIM_RESULTS = {
        "fp32": {"latency_ms": 15.2, "throughput_fps": 65},
        "fp16": {"latency_ms": 6.1, "throughput_fps": 163},
        "int8": {"latency_ms": 3.8, "throughput_fps": 263},
    }

    def __init__(self, config: TRTConfig):
        self.config = config

    def export_onnx(self, pytorch_model_path: str, output_path: str):
        """Export a PyTorch model to ONNX (reference recipe in comments)."""
        # Real flow:
        #   model = torch.load(pytorch_model_path); model.eval()
        #   torch.onnx.export(model, torch.randn(1, 3, 640, 640), output_path,
        #                     opset_version=17, input_names=["images"],
        #                     output_names=["output"],
        #                     dynamic_axes={"images": {0: "batch"},
        #                                   "output": {0: "batch"}})
        print(f"  Exported ONNX: {output_path}")

    def build_engine(self, onnx_path: str, engine_path: str):
        """Build a serialized TensorRT engine from an ONNX file."""
        # Real flow:
        #   logger = trt.Logger(trt.Logger.INFO)
        #   builder = trt.Builder(logger)
        #   network = builder.create_network(
        #       1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        #   trt.OnnxParser(network, logger).parse(open(onnx_path, "rb").read())
        #   cfg = builder.create_builder_config()
        #   cfg.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
        #                             self.config.workspace_size << 20)
        #   if self.config.precision == "fp16": cfg.set_flag(trt.BuilderFlag.FP16)
        #   if self.config.precision == "int8": cfg.set_flag(trt.BuilderFlag.INT8)
        #       # plus an INT8 calibrator
        #   open(engine_path, "wb").write(
        #       builder.build_serialized_network(network, cfg))
        print(f"  Built TensorRT Engine: {engine_path}")
        print(f"    Precision: {self.config.precision}")
        print(f"    Max Batch: {self.config.max_batch_size}")
        print(f"    Workspace: {self.config.workspace_size}MB")

    def benchmark(self, engine_path: str, iterations: int = 100):
        """Report simulated latency/throughput for the configured precision."""
        print(f"\n  Benchmark: {engine_path}")
        print(f"    Iterations: {iterations}")

        # Unknown precision modes fall back to the fp32 baseline.
        metrics = self._SIM_RESULTS.get(self.config.precision,
                                        self._SIM_RESULTS["fp32"])
        print(f"    Latency: {metrics['latency_ms']}ms")
        print(f"    Throughput: {metrics['throughput_fps']} FPS")

# Build Pipeline: one engine per precision mode, benchmarked right after build.
configs = [
    TRTConfig("yolov8m.onnx", "fp32", 16, 4096),
    TRTConfig("yolov8m.onnx", "fp16", 16, 4096),
    TRTConfig("yolov8m.onnx", "int8", 16, 4096, "calibration_data/"),
]

print("TensorRT Build Pipeline:")
for trt_config in configs:
    engine_file = f"yolov8m_{trt_config.precision}.engine"
    engine_builder = TensorRTBuilder(trt_config)
    engine_builder.build_engine(trt_config.model_path, engine_file)
    engine_builder.benchmark(engine_file)

# Canonical trtexec invocations for each build/benchmark scenario.
trtexec_commands = {
    "FP32": "trtexec --onnx=model.onnx --saveEngine=model_fp32.engine",
    "FP16": "trtexec --onnx=model.onnx --saveEngine=model_fp16.engine --fp16",
    "INT8": "trtexec --onnx=model.onnx --saveEngine=model_int8.engine --int8 --calib=calibration.cache",
    "Dynamic": "trtexec --onnx=model.onnx --minShapes=input:1x3x640x640 --optShapes=input:8x3x640x640 --maxShapes=input:16x3x640x640",
    "Benchmark": "trtexec --loadEngine=model_fp16.engine --batch=8 --iterations=1000",
}

print("\ntrtexec Commands:")
for label, command in trtexec_commands.items():
    print(f"  [{label}]")
    print(f"    {command}")

Kubernetes GPU Scheduling

# k8s_gpu_scheduling.py — Kubernetes GPU Pod Scheduling

# 1. NVIDIA GPU Operator Installation
# helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
# helm install gpu-operator nvidia/gpu-operator \
#   --namespace gpu-operator --create-namespace

# 2. GPU Pod Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: tensorrt-inference
# spec:
#   replicas: 3
#   selector:
#     matchLabels:
#       app: tensorrt-inference
#   template:
#     metadata:
#       labels:
#         app: tensorrt-inference
#     spec:
#       nodeSelector:
#         nvidia.com/gpu.product: NVIDIA-A10G
#       tolerations:
#       - key: nvidia.com/gpu
#         operator: Exists
#         effect: NoSchedule
#       containers:
#       - name: inference
#         image: nvcr.io/nvidia/tensorrt:23.12-py3
#         command: ["python3", "server.py"]
#         ports:
#         - containerPort: 8080
#         resources:
#           limits:
#             nvidia.com/gpu: 1
#             memory: 8Gi
#             cpu: 4
#           requests:
#             nvidia.com/gpu: 1
#             memory: 4Gi
#             cpu: 2
#         volumeMounts:
#         - name: model-storage
#           mountPath: /models
#       volumes:
#       - name: model-storage
#         persistentVolumeClaim:
#           claimName: model-pvc

# 3. GPU HPA (Custom Metrics)
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
#   name: tensorrt-hpa
# spec:
#   scaleTargetRef:
#     apiVersion: apps/v1
#     kind: Deployment
#     name: tensorrt-inference
#   minReplicas: 2
#   maxReplicas: 10
#   metrics:
#   - type: Pods
#     pods:
#       metric:
#         name: gpu_utilization
#       target:
#         type: AverageValue
#         averageValue: "70"
#   - type: Pods
#     pods:
#       metric:
#         name: inference_latency_p99
#       target:
#         type: AverageValue
#         averageValue: "50m"

from dataclasses import dataclass
from typing import List

@dataclass
class GPUNode:
    """A Kubernetes node with schedulable GPUs."""

    name: str  # node name, e.g. "gpu-node-01"
    gpu_type: str  # GPU product name, e.g. "A10G"
    gpu_count: int  # total GPUs installed on the node
    gpu_memory: str  # per-GPU memory as a display string, e.g. "24GB"
    available: int  # GPUs not currently allocated to any pod

class GPUScheduler:
    """Pick a node for a GPU workload using a most-available-first strategy."""

    def __init__(self):
        self.nodes: List[GPUNode] = []

    def add_node(self, node: GPUNode):
        """Register a node with the scheduler."""
        self.nodes.append(node)

    def schedule(self, required_gpu: int, preferred_type: str = ""):
        """Return the node with the most free GPUs that fits the request.

        Nodes whose gpu_type contains preferred_type win over the rest;
        returns None when no node has enough free GPUs.
        """
        fitting = [node for node in self.nodes if node.available >= required_gpu]
        if preferred_type:
            matching = [node for node in fitting if preferred_type in node.gpu_type]
            if matching:
                fitting = matching
        if not fitting:
            return None
        return max(fitting, key=lambda node: node.available)

    def show_status(self):
        """Print cluster totals plus a per-node utilization bar."""
        separator = "=" * 55
        print(f"\n{separator}")
        print("GPU Cluster Status")
        print(f"{separator}")
        total = sum(node.gpu_count for node in self.nodes)
        free = sum(node.available for node in self.nodes)
        print(f"  Total GPUs: {total} | Available: {free}")

        for node in self.nodes:
            in_use = node.gpu_count - node.available
            usage_bar = "#" * in_use + "." * node.available
            print(f"  {node.name:<20} {node.gpu_type:<12} [{usage_bar}] {in_use}/{node.gpu_count}")

# Demo: populate a small mixed-GPU cluster, show utilization, then
# schedule a single-GPU workload that prefers A10G nodes.
scheduler = GPUScheduler()
nodes = [
    GPUNode("gpu-node-01", "A10G", 4, "24GB", 2),
    GPUNode("gpu-node-02", "A10G", 4, "24GB", 1),
    GPUNode("gpu-node-03", "T4", 2, "16GB", 2),
    GPUNode("gpu-node-04", "A100", 8, "80GB", 5),
]

for gpu_node in nodes:
    scheduler.add_node(gpu_node)

scheduler.show_status()

best = scheduler.schedule(1, "A10G")
if best is not None:
    print(f"\n  Scheduled on: {best.name} ({best.gpu_type})")

Inference Server

# inference_server.py — TensorRT Inference Server
# NVIDIA Triton Inference Server

# 1. Triton Model Repository Structure
# model_repository/
# ├── yolov8_fp16/
# │   ├── config.pbtxt
# │   ├── 1/
# │   │   └── model.plan  (TensorRT Engine)
# │   └── labels.txt
# └── yolov8_ensemble/
#     ├── config.pbtxt
#     └── 1/

# 2. config.pbtxt
# name: "yolov8_fp16"
# platform: "tensorrt_plan"
# max_batch_size: 16
# input [
#   { name: "images" data_type: TYPE_FP16 dims: [3, 640, 640] }
# ]
# output [
#   { name: "output" data_type: TYPE_FP16 dims: [84, 8400] }
# ]
# instance_group [
#   { count: 2 kind: KIND_GPU gpus: [0] }
# ]
# dynamic_batching {
#   preferred_batch_size: [4, 8, 16]
#   max_queue_delay_microseconds: 100
# }

# 3. Start Triton
# docker run --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 \
#   -v /models:/models \
#   nvcr.io/nvidia/tritonserver:23.12-py3 \
#   tritonserver --model-repository=/models

# Key capabilities of NVIDIA Triton Inference Server (Thai descriptions).
triton_features = {
    "Dynamic Batching": "รวมหลาย Requests เป็น Batch อัตโนมัติ",
    "Model Ensemble": "Pipeline หลาย Models ต่อกัน",
    "Concurrent Execution": "รัน Model หลายตัวบน GPU เดียว",
    "Model Versioning": "หลาย Version ของ Model พร้อมกัน",
    "Metrics": "Prometheus Metrics: Latency, Throughput, Queue",
    "gRPC + HTTP": "รองรับทั้ง gRPC และ HTTP API",
    "Multi-GPU": "กระจาย Inference ข้าม GPU",
}

print("Triton Inference Server Features:")
for feature_name, detail in triton_features.items():
    print(f"  {feature_name}: {detail}")

# Optimization Comparison: illustrative numbers for YOLOv8m on an A10G.
comparison = {
    "PyTorch (FP32)": {"latency": "15ms", "throughput": "65 FPS", "memory": "2.1GB"},
    "ONNX Runtime": {"latency": "10ms", "throughput": "100 FPS", "memory": "1.8GB"},
    "TensorRT FP16": {"latency": "6ms", "throughput": "165 FPS", "memory": "1.2GB"},
    "TensorRT INT8": {"latency": "4ms", "throughput": "260 FPS", "memory": "0.8GB"},
    "Triton + TRT FP16": {"latency": "5ms", "throughput": "200 FPS", "memory": "1.2GB"},
}

print("\n\nOptimization Comparison (YOLOv8m, A10G):")
for backend, stats in comparison.items():
    print(f"  {backend:<22} Latency: {stats['latency']:<6} FPS: {stats['throughput']:<8} Mem: {stats['memory']}")

Best Practices

TensorRT คืออะไร

TensorRT คือ SDK ของ NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU ใช้ Layer Fusion, Precision FP16/INT8 และ Kernel Auto-tuning ช่วยให้เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX

Pod Scheduling บน Kubernetes คืออะไร

Pod Scheduling คือการที่ Scheduler เลือก Node ที่เหมาะสมให้ Pod โดยพิจารณา Resource Requests/Limits, Node Affinity, Taints/Tolerations และ GPU Availability โดยมี NVIDIA Device Plugin เป็นตัวจัดการ GPU บนแต่ละ Node

TensorRT FP16 กับ INT8 ต่างกันอย่างไร

FP16 ลดความละเอียดจาก 32-bit เหลือ 16-bit เร็วขึ้น 2-3 เท่า โดยสูญเสีย Accuracy น้อย ส่วน INT8 ลดเหลือ 8-bit เร็วขึ้น 3-5 เท่า แต่สูญเสีย Accuracy มากกว่า และต้องใช้ Calibration Dataset

GPU Scheduling ใน Kubernetes ทำอย่างไร

ติดตั้ง NVIDIA GPU Operator และ Device Plugin แล้วประกาศ resources.limits ด้วย nvidia.com/gpu ใช้ Node Affinity เพื่อเลือก GPU Node ที่ต้องการ และใช้ Taints/Tolerations เพื่อสงวน GPU Nodes ไว้สำหรับงาน Inference

สรุป

TensorRT ช่วย Optimize Inference ด้วย FP16/INT8 ให้เร็วขึ้น 2-5 เท่า ฝั่ง Kubernetes ใช้ NVIDIA Device Plugin, Node Affinity และ Taints จัดการ GPU Scheduling ส่วน Triton Inference Server เพิ่ม Dynamic Batching กับ Model Versioning และควร Monitor GPU Utilization อย่างต่อเนื่อง

📖 บทความที่เกี่ยวข้อง

TensorRT Optimization Cloud Native Designอ่านบทความ → TensorRT Optimization Security Hardening ป้องกันแฮกอ่านบทความ → GraphQL Federation Pod Schedulingอ่านบทความ → ClickHouse Analytics Pod Schedulingอ่านบทความ → BigQuery Scheduled Query Pod Schedulingอ่านบทความ →

📚 ดูบทความทั้งหมด →