TensorRT Optimization
TensorRT คือ SDK จาก NVIDIA สำหรับ Optimize Deep Learning Inference บน GPU ด้วย Layer Fusion, Precision แบบ FP16/INT8 และ Kernel Auto-tuning ทำให้เร็วขึ้น 2-5 เท่า รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX
Kubernetes Pod Scheduling คือการที่ Scheduler เลือก Node ที่เหมาะสมตาม Resource Requests/Limits, Node Affinity และ Taints/Tolerations โดยมี NVIDIA Device Plugin เป็นตัวจัดการ GPU Resources
TensorRT Engine Build
# tensorrt_build.py — TensorRT Engine Build Pipeline
# pip install tensorrt torch onnx onnxruntime
import os
from dataclasses import dataclass
from typing import List, Dict, Optional
@dataclass
class TRTConfig:
    """Build settings for one TensorRT engine."""
    model_path: str  # path to the source ONNX model
    precision: str  # fp32, fp16, int8
    max_batch_size: int  # largest batch the engine must support
    workspace_size: int  # MB — builder workspace memory limit
    calibration_data: str = ""  # calibration dataset dir (required for int8 only)
    dynamic_shapes: bool = False  # build with dynamic input shapes when True
class TensorRTBuilder:
    """TensorRT engine builder and benchmarker.

    The real TensorRT / PyTorch calls are kept as reference comments so the
    pipeline runs as a GPU-free simulation; the printed workflow is unchanged.
    """

    # Simulated benchmark numbers per precision mode; hoisted to a class
    # constant so `benchmark` does not rebuild the table on every call.
    _SIM_RESULTS: Dict[str, Dict[str, float]] = {
        "fp32": {"latency_ms": 15.2, "throughput_fps": 65},
        "fp16": {"latency_ms": 6.1, "throughput_fps": 163},
        "int8": {"latency_ms": 3.8, "throughput_fps": 263},
    }

    def __init__(self, config: TRTConfig):
        # Engine build settings (model path, precision, batch, workspace).
        self.config = config

    def export_onnx(self, pytorch_model_path: str, output_path: str) -> None:
        """Export a PyTorch model to ONNX (reference implementation in comments)."""
        # import torch
        # model = torch.load(pytorch_model_path)
        # model.eval()
        # dummy_input = torch.randn(1, 3, 640, 640)
        # torch.onnx.export(
        #     model, dummy_input, output_path,
        #     opset_version=17,
        #     input_names=["images"],
        #     output_names=["output"],
        #     dynamic_axes={"images": {0: "batch"}, "output": {0: "batch"}}
        # )
        print(f" Exported ONNX: {output_path}")

    def build_engine(self, onnx_path: str, engine_path: str) -> None:
        """Build a TensorRT engine from an ONNX file (reference in comments)."""
        # import tensorrt as trt
        # logger = trt.Logger(trt.Logger.INFO)
        # builder = trt.Builder(logger)
        # network = builder.create_network(
        #     1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        # parser = trt.OnnxParser(network, logger)
        #
        # with open(onnx_path, 'rb') as f:
        #     parser.parse(f.read())
        #
        # config = builder.create_builder_config()
        # config.set_memory_pool_limit(
        #     trt.MemoryPoolType.WORKSPACE,
        #     self.config.workspace_size << 20)
        #
        # if self.config.precision == "fp16":
        #     config.set_flag(trt.BuilderFlag.FP16)
        # elif self.config.precision == "int8":
        #     config.set_flag(trt.BuilderFlag.INT8)
        #     # Set calibrator for INT8
        #
        # engine = builder.build_serialized_network(network, config)
        # with open(engine_path, 'wb') as f:
        #     f.write(engine)
        print(f" Built TensorRT Engine: {engine_path}")
        print(f" Precision: {self.config.precision}")
        print(f" Max Batch: {self.config.max_batch_size}")
        print(f" Workspace: {self.config.workspace_size}MB")

    def benchmark(self, engine_path: str, iterations: int = 100) -> Dict[str, float]:
        """Print simulated benchmark numbers and return the metrics.

        Unknown precisions fall back to the fp32 numbers. Returns a copy of
        the metrics dict (previously None) so callers can consume the results
        programmatically; printed output is unchanged.
        """
        print(f"\n Benchmark: {engine_path}")
        print(f" Iterations: {iterations}")
        r = self._SIM_RESULTS.get(self.config.precision, self._SIM_RESULTS["fp32"])
        print(f" Latency: {r['latency_ms']}ms")
        print(f" Throughput: {r['throughput_fps']} FPS")
        return dict(r)
# Build pipeline: produce and benchmark one engine per precision mode.
configs = [
    TRTConfig("yolov8m.onnx", "fp32", 16, 4096),
    TRTConfig("yolov8m.onnx", "fp16", 16, 4096),
    TRTConfig("yolov8m.onnx", "int8", 16, 4096, "calibration_data/"),
]
print("TensorRT Build Pipeline:")
for cfg in configs:
    engine_file = f"yolov8m_{cfg.precision}.engine"
    trt_builder = TensorRTBuilder(cfg)
    trt_builder.build_engine(cfg.model_path, engine_file)
    trt_builder.benchmark(engine_file)
# trtexec Commands — reference invocations for each build/benchmark scenario.
trtexec_commands = {
    "FP32": "trtexec --onnx=model.onnx --saveEngine=model_fp32.engine",
    "FP16": "trtexec --onnx=model.onnx --saveEngine=model_fp16.engine --fp16",
    "INT8": "trtexec --onnx=model.onnx --saveEngine=model_int8.engine --int8 --calib=calibration.cache",
    "Dynamic": "trtexec --onnx=model.onnx --minShapes=input:1x3x640x640 --optShapes=input:8x3x640x640 --maxShapes=input:16x3x640x640",
    "Benchmark": "trtexec --loadEngine=model_fp16.engine --batch=8 --iterations=1000",
}
# Fixed: dropped the f-prefix on a string with no placeholders (ruff F541).
print("\ntrtexec Commands:")
for name, cmd in trtexec_commands.items():
    print(f" [{name}]")
    print(f" {cmd}")
Kubernetes GPU Scheduling
# k8s_gpu_scheduling.py — Kubernetes GPU Pod Scheduling
# 1. NVIDIA GPU Operator Installation
# helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
# helm install gpu-operator nvidia/gpu-operator \
# --namespace gpu-operator --create-namespace
# 2. GPU Pod Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: tensorrt-inference
# spec:
# replicas: 3
# selector:
# matchLabels:
# app: tensorrt-inference
# template:
# metadata:
# labels:
# app: tensorrt-inference
# spec:
# nodeSelector:
# nvidia.com/gpu.product: NVIDIA-A10G
# tolerations:
# - key: nvidia.com/gpu
# operator: Exists
# effect: NoSchedule
# containers:
# - name: inference
# image: nvcr.io/nvidia/tensorrt:23.12-py3
# command: ["python3", "server.py"]
# ports:
# - containerPort: 8080
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: 8Gi
# cpu: 4
# requests:
# nvidia.com/gpu: 1
# memory: 4Gi
# cpu: 2
# volumeMounts:
# - name: model-storage
# mountPath: /models
# volumes:
# - name: model-storage
# persistentVolumeClaim:
# claimName: model-pvc
# 3. GPU HPA (Custom Metrics)
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
# name: tensorrt-hpa
# spec:
# scaleTargetRef:
# apiVersion: apps/v1
# kind: Deployment
# name: tensorrt-inference
# minReplicas: 2
# maxReplicas: 10
# metrics:
# - type: Pods
# pods:
# metric:
# name: gpu_utilization
# target:
# type: AverageValue
# averageValue: "70"
# - type: Pods
# pods:
# metric:
# name: inference_latency_p99
# target:
# type: AverageValue
# averageValue: "50m"
from dataclasses import dataclass
from typing import List
@dataclass
class GPUNode:
    """A schedulable GPU node in the cluster."""
    name: str  # Kubernetes node name
    gpu_type: str  # GPU product, e.g. "A10G", "T4", "A100"
    gpu_count: int  # total GPUs installed on the node
    gpu_memory: str  # per-GPU memory, e.g. "24GB"
    available: int  # GPUs currently unallocated
class GPUScheduler:
    """In-memory GPU scheduling strategy: tracks nodes and picks placements."""

    def __init__(self):
        # Registered cluster nodes, in insertion order.
        self.nodes: List[GPUNode] = []

    def add_node(self, node: GPUNode) -> None:
        """Register a node with the scheduler."""
        self.nodes.append(node)

    def schedule(self, required_gpu: int, preferred_type: str = "") -> "GPUNode | None":
        """Pick the best node with at least ``required_gpu`` free GPUs.

        Nodes whose type contains ``preferred_type`` are preferred when any
        exist; otherwise all capable nodes are considered. Ties resolve to
        the node with the most free GPUs. Returns None when no node has
        enough capacity.
        """
        candidates = [n for n in self.nodes if n.available >= required_gpu]
        if preferred_type:
            preferred = [n for n in candidates if preferred_type in n.gpu_type]
            if preferred:
                candidates = preferred
        # Guard clause instead of trailing `return None` after an if-block.
        if not candidates:
            return None
        return max(candidates, key=lambda n: n.available)

    def show_status(self) -> None:
        """Print a per-node utilization summary (output format unchanged)."""
        # Fixed: placeholder-free f-strings replaced with plain expressions
        # (ruff F541); printed bytes are identical.
        print("\n" + "=" * 55)
        print("GPU Cluster Status")
        print("=" * 55)
        total_gpus = sum(n.gpu_count for n in self.nodes)
        available = sum(n.available for n in self.nodes)
        print(f" Total GPUs: {total_gpus} | Available: {available}")
        for node in self.nodes:
            used = node.gpu_count - node.available
            bar = "#" * used + "." * node.available
            print(f" {node.name:<20} {node.gpu_type:<12} [{bar}] {used}/{node.gpu_count}")
# Build the demo cluster, show utilization, then request one A10G GPU.
scheduler = GPUScheduler()
nodes = [
    GPUNode("gpu-node-01", "A10G", 4, "24GB", 2),
    GPUNode("gpu-node-02", "A10G", 4, "24GB", 1),
    GPUNode("gpu-node-03", "T4", 2, "16GB", 2),
    GPUNode("gpu-node-04", "A100", 8, "80GB", 5),
]
for gpu_node in nodes:
    scheduler.add_node(gpu_node)
scheduler.show_status()
placement = scheduler.schedule(1, "A10G")
if placement is not None:
    print(f"\n Scheduled on: {placement.name} ({placement.gpu_type})")
Inference Server
# inference_server.py — TensorRT Inference Server
# NVIDIA Triton Inference Server
# 1. Triton Model Repository Structure
# model_repository/
# ├── yolov8_fp16/
# │ ├── config.pbtxt
# │ ├── 1/
# │ │ └── model.plan (TensorRT Engine)
# │ └── labels.txt
# └── yolov8_ensemble/
# ├── config.pbtxt
# └── 1/
# 2. config.pbtxt
# name: "yolov8_fp16"
# platform: "tensorrt_plan"
# max_batch_size: 16
# input [
# { name: "images" data_type: TYPE_FP16 dims: [3, 640, 640] }
# ]
# output [
# { name: "output" data_type: TYPE_FP16 dims: [84, 8400] }
# ]
# instance_group [
# { count: 2 kind: KIND_GPU gpus: [0] }
# ]
# dynamic_batching {
# preferred_batch_size: [4, 8, 16]
# max_queue_delay_microseconds: 100
# }
# 3. Start Triton
# docker run --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 \
# -v /models:/models \
# nvcr.io/nvidia/tritonserver:23.12-py3 \
# tritonserver --model-repository=/models
# Feature catalogue for NVIDIA Triton Inference Server (descriptions in Thai).
triton_features = {
    "Dynamic Batching": "รวมหลาย Requests เป็น Batch อัตโนมัติ",
    "Model Ensemble": "Pipeline หลาย Models ต่อกัน",
    "Concurrent Execution": "รัน Model หลายตัวบน GPU เดียว",
    "Model Versioning": "หลาย Version ของ Model พร้อมกัน",
    "Metrics": "Prometheus Metrics: Latency, Throughput, Queue",
    "gRPC + HTTP": "รองรับทั้ง gRPC และ HTTP API",
    "Multi-GPU": "กระจาย Inference ข้าม GPU",
}
print("Triton Inference Server Features:")
# Single joined write produces byte-identical output to per-line prints.
print("\n".join(f" {label}: {detail}" for label, detail in triton_features.items()))
# Optimization Comparison — measured on YOLOv8m with an A10G GPU.
comparison = {
    "PyTorch (FP32)": {"latency": "15ms", "throughput": "65 FPS", "memory": "2.1GB"},
    "ONNX Runtime": {"latency": "10ms", "throughput": "100 FPS", "memory": "1.8GB"},
    "TensorRT FP16": {"latency": "6ms", "throughput": "165 FPS", "memory": "1.2GB"},
    "TensorRT INT8": {"latency": "4ms", "throughput": "260 FPS", "memory": "0.8GB"},
    "Triton + TRT FP16": {"latency": "5ms", "throughput": "200 FPS", "memory": "1.2GB"},
}
# Fixed: dropped the f-prefix on a string with no placeholders (ruff F541).
print("\n\nOptimization Comparison (YOLOv8m, A10G):")
for method, metrics in comparison.items():
    print(f" {method:<22} Latency: {metrics['latency']:<6} FPS: {metrics['throughput']:<8} Mem: {metrics['memory']}")
Best Practices
- FP16 ก่อน: เริ่มจาก FP16 ก่อน INT8 ถ้า Accuracy ยอมรับได้
- Dynamic Batching: ใช้ Triton Dynamic Batching เพิ่ม Throughput
- GPU Taints: ใช้ Taints/Tolerations สงวน GPU Nodes
- Node Affinity: เลือก GPU Type ที่เหมาะกับ Workload
- Model Versioning: ใช้ Triton Model Versioning อัปเดตไม่ Downtime
- Monitoring: ติดตาม GPU Utilization Latency Queue Length
TensorRT คืออะไร
NVIDIA SDK Optimize Deep Learning Inference GPU Layer Fusion Precision FP16 INT8 Kernel Auto-tuning เร็วขึ้น 2-5 เท่า PyTorch TensorFlow ONNX
Pod Scheduling บน Kubernetes คืออะไร
Scheduler เลือก Node เหมาะสม Resource Requests Limits Node Affinity Taints Tolerations GPU Availability NVIDIA Device Plugin จัดการ GPU
TensorRT FP16 กับ INT8 ต่างกันอย่างไร
FP16 ลดความละเอียดจาก 32-bit เป็น 16-bit เร็วขึ้น 2-3 เท่า โดยสูญเสีย Accuracy น้อย ส่วน INT8 ลดเป็น 8-bit เร็วขึ้น 3-5 เท่า แต่สูญเสีย Accuracy มากกว่า และต้องใช้ Calibration Dataset ในการ Quantize
GPU Scheduling ใน Kubernetes ทำอย่างไร
NVIDIA GPU Operator Device Plugin resources.limits nvidia.com/gpu Node Affinity เลือก GPU Node Taints Tolerations สงวน GPU Nodes Inference
สรุป
TensorRT Optimize Inference FP16 INT8 เร็วขึ้น 2-5 เท่า Kubernetes GPU Scheduling NVIDIA Device Plugin Node Affinity Taints Triton Inference Server Dynamic Batching Model Versioning Monitoring GPU Utilization
