TensorRT Remote Work
TensorRT Optimization Remote Work GPU Inference Model Conversion INT8 Calibration SSH Tunnel JupyterLab VS Code Remote Tailscale Production
| Precision | Model Size | Speedup | Accuracy Loss | Use Case |
|---|---|---|---|---|
| FP32 | 100% (baseline) | 1x | 0% | Development, baseline |
| FP16 | 50% | 2-3x | < 0.1% | Production default |
| INT8 | 25% | 3-5x | < 1% | High throughput, edge |
| INT4 (TRT-LLM) | 12.5% | 4-8x | 1-3% | LLM inference |
Remote Development Environment
# === Remote GPU Server Setup ===
# Install NVIDIA Driver + CUDA + TensorRT
# sudo apt update && sudo apt install -y nvidia-driver-535
# sudo apt install -y nvidia-cuda-toolkit
# pip install tensorrt   # NOTE: the old "nvidia-tensorrt" pip package is deprecated; the "tensorrt" wheel is the supported install
# Install JupyterLab
# pip install jupyterlab ipywidgets
# jupyter lab --ip 0.0.0.0 --port 8888 --no-browser
# SSH config (~/.ssh/config on local machine)
# Host gpu-server
# HostName 192.168.1.100 # or public IP / Tailscale IP
# User dev
# Port 22
# IdentityFile ~/.ssh/id_rsa
# LocalForward 8888 localhost:8888 # JupyterLab
# LocalForward 6006 localhost:6006 # TensorBoard
# LocalForward 3000 localhost:3000 # Grafana
# ServerAliveInterval 60
# ServerAliveCountMax 3
# VS Code Remote SSH
# 1. Install "Remote - SSH" extension
# 2. Ctrl+Shift+P > "Remote-SSH: Connect to Host"
# 3. Select "gpu-server"
# 4. Install Python extension on remote
# Tailscale VPN (zero-config)
# curl -fsSL https://tailscale.com/install.sh | sh
# sudo tailscale up
# ssh dev@gpu-server # Use Tailscale hostname
# tmux for long-running jobs
# tmux new -s training
# python train.py # Start training
# Ctrl+B, D # Detach
# tmux attach -t training # Re-attach later
from dataclasses import dataclass


@dataclass
class RemoteTool:
    """One remote-development tool: what it is for, its port, and a setup hint."""
    tool: str     # tool name shown in the report
    purpose: str  # one-line description
    port: str     # TCP port, or "SSH"/"N/A" when not port-based
    setup: str    # shorthand install/configuration recipe


# Catalogue of the remote-work tooling described in the notes above.
_TOOL_ROWS = [
    ("SSH", "Remote terminal access", "22", "ssh-keygen + authorized_keys"),
    ("JupyterLab", "Interactive notebooks", "8888", "pip install jupyterlab"),
    ("VS Code Remote", "Full IDE remote", "SSH", "Remote-SSH extension"),
    ("TensorBoard", "Training visualization", "6006", "tensorboard --logdir runs/"),
    ("Grafana", "GPU monitoring dashboard", "3000", "Docker + nvidia-smi exporter"),
    ("Tailscale", "Secure VPN access", "N/A", "curl install + tailscale up"),
    ("tmux", "Persistent sessions", "N/A", "apt install tmux"),
]
tools = [RemoteTool(*row) for row in _TOOL_ROWS]

print("=== Remote Tools ===")
for entry in tools:
    # Two-line summary per tool, same layout as before.
    print(f" [{entry.tool}] Purpose: {entry.purpose}")
    print(f" Port: {entry.port} | Setup: {entry.setup}")
TensorRT Conversion
# === Model Conversion Pipeline ===
# Step 1: PyTorch to ONNX
# import torch
# model = MyModel().eval().cuda()
# dummy = torch.randn(1, 3, 224, 224).cuda()
# torch.onnx.export(model, dummy, "model.onnx",
# input_names=["input"], output_names=["output"],
# dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
# opset_version=17)
# Step 2: ONNX to TensorRT (CLI)
# trtexec --onnx=model.onnx \
# --saveEngine=model_fp16.trt \
# --fp16 \
# --minShapes=input:1x3x224x224 \
# --optShapes=input:8x3x224x224 \
# --maxShapes=input:32x3x224x224 \
# --workspace=4096  # NOTE: deprecated in TensorRT >= 8.4; use --memPoolSize=workspace:4096MiB instead
# Step 3: INT8 with Calibration
# trtexec --onnx=model.onnx \
# --saveEngine=model_int8.trt \
# --int8 \
# --calib=calibration_cache.bin \
# --minShapes=input:1x3x224x224 \
# --optShapes=input:8x3x224x224 \
# --maxShapes=input:32x3x224x224
# Python API — Custom INT8 Calibrator
# import tensorrt as trt
# import numpy as np
# import pycuda.driver as cuda
# import pycuda.autoinit  # needed: the calibrator below calls cuda.mem_alloc / cuda.memcpy_htod
#
# class MyCalibrator(trt.IInt8EntropyCalibrator2):
# def __init__(self, data_loader, cache_file="calib.cache"):
# super().__init__()
# self.data_loader = iter(data_loader)
# self.cache_file = cache_file
# self.batch_size = 8
# self.device_input = cuda.mem_alloc(
# self.batch_size * 3 * 224 * 224 * 4)
#
# def get_batch_size(self):
# return self.batch_size
#
# def get_batch(self, names):
# try:
# batch = next(self.data_loader)
# cuda.memcpy_htod(self.device_input, batch.numpy())
# return [int(self.device_input)]
# except StopIteration:
# return None
@dataclass
class ConversionResult:
    """Per-model TensorRT conversion benchmark: latencies and speedups as display strings."""
    model: str         # model name
    original: str      # FP32 baseline latency
    fp16: str          # FP16 engine latency
    int8: str          # INT8 engine latency ("N/A" when not converted)
    speedup_fp16: str  # FP16 speedup vs FP32
    speedup_int8: str  # INT8 speedup vs FP32


# Measured on a single T4 GPU; values are display strings, not numbers.
_BENCH_ROWS = [
    ("ResNet-50", "4.2ms", "1.8ms", "1.1ms", "2.3x", "3.8x"),
    ("BERT-base", "6.5ms", "3.1ms", "1.9ms", "2.1x", "3.4x"),
    ("YOLOv8-L", "12.3ms", "5.2ms", "3.1ms", "2.4x", "4.0x"),
    ("Whisper-small", "450ms", "180ms", "95ms", "2.5x", "4.7x"),
    ("Stable Diffusion", "8.5s", "3.2s", "N/A", "2.7x", "N/A"),
]
results = [ConversionResult(*row) for row in _BENCH_ROWS]

print("\n=== Benchmark Results (T4 GPU) ===")
for bench in results:
    # Latency line followed by the speedup line, same layout as before.
    print(f" [{bench.model}] FP32: {bench.original} | FP16: {bench.fp16} | INT8: {bench.int8}")
    print(f" Speedup: FP16 {bench.speedup_fp16} | INT8 {bench.speedup_int8}")
Production Deployment
# === Production Inference Server ===
# Triton Inference Server with TensorRT
# docker run --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 \
# -v $(pwd)/model_repository:/models \
# nvcr.io/nvidia/tritonserver:24.01-py3 \
# tritonserver --model-repository=/models
# model_repository/
# └── my_model/
# ├── config.pbtxt
# └── 1/
# └── model.plan # TensorRT engine
# config.pbtxt
# name: "my_model"
# platform: "tensorrt_plan"
# max_batch_size: 32
# input [{
# name: "input"
# data_type: TYPE_FP32
# dims: [3, 224, 224]
# }]
# output [{
# name: "output"
# data_type: TYPE_FP32
# dims: [1000]
# }]
# instance_group [{
# count: 2
# kind: KIND_GPU
# }]
# dynamic_batching {
# max_queue_delay_microseconds: 100
# }
@dataclass
class MonitorMetric:
    """A production GPU-inference monitoring metric: target range, alert rule, and tooling."""
    metric: str  # what is being measured
    target: str  # healthy operating range
    alert: str   # condition that should page/alert
    tool: str    # where the metric comes from


# Monitoring checklist for the Triton/TensorRT deployment described above.
_METRIC_ROWS = [
    ("GPU Utilization", "60-80%", "> 90% or < 30%", "nvidia-smi + Prometheus"),
    ("GPU Temperature", "< 80°C", "> 85°C", "nvidia-smi exporter"),
    ("GPU Memory", "< 90%", "> 95%", "nvidia-smi exporter"),
    ("Inference Latency p99", "< 10ms (ResNet)", "> 20ms", "Triton metrics"),
    ("Throughput", "> 500 req/s", "< 300 req/s", "Triton metrics"),
    ("Error Rate", "< 0.1%", "> 0.5%", "Application metrics"),
    ("Power Usage", "< 250W (T4)", "> 280W", "nvidia-smi"),
]
metrics = [MonitorMetric(*row) for row in _METRIC_ROWS]

print("Monitoring:")
for gauge in metrics:
    # Target line followed by the alert/tool line, same layout as before.
    print(f" [{gauge.metric}] Target: {gauge.target}")
    print(f" Alert: {gauge.alert} | Tool: {gauge.tool}")
เคล็ดลับ
- FP16: เริ่มด้วย FP16 ก่อน ได้ 2-3x speedup แทบไม่เสีย Accuracy
- Tailscale: ใช้ Tailscale VPN เข้าถึง GPU Server จากทุกที่ ไม่ต้อง Port Forward
- tmux: รัน Training ใน tmux ปิด SSH ได้โดย Training ไม่หยุด
- Profile: ใช้ trtexec --verbose วิเคราะห์ Layer Performance
- Cache: เก็บ TensorRT Engine File ไว้ ไม่ต้อง Build ใหม่ทุกครั้ง
TensorRT คืออะไร
NVIDIA SDK Inference Graph Optimization Precision FP32 FP16 INT8 Kernel Auto-tuning ONNX PyTorch TensorFlow 2-5x Production Edge Autonomous
ตั้ง Remote Work Setup อย่างไร
GPU Server CUDA TensorRT SSH JupyterLab VS Code Remote SSH Tunnel tmux Tailscale WireGuard VPN Prometheus Grafana Monitor
แปลง Model เป็น TensorRT อย่างไร
ONNX Export trtexec --onnx --saveEngine FP16 INT8 Dynamic Shape minShapes optShapes maxShapes Accuracy Latency Throughput GPU
INT8 Calibration ทำอย่างไร
Calibration Dataset 500-1000 samples trtexec --int8 --calib Python Calibrator Entropy MinMax Percentile Accuracy < 1% Mixed Precision Layer Sensitivity
สรุป
TensorRT Optimization Remote Work GPU SSH JupyterLab VS Code Tailscale ONNX FP16 INT8 Calibration Triton Inference Server Monitoring Production Deployment