TensorRT Home Lab
TensorRT Optimization Home Lab NVIDIA GPU Inference ONNX FP16 INT8 Quantization Engine Building Benchmark Latency Throughput Production Deployment
| GPU | VRAM | TensorRT FP16 | INT8 Support | Price (THB) | เหมาะกับ |
|---|---|---|---|---|---|
| RTX 3060 | 12GB | ดี | มี | 10,000-13,000 | Entry home lab |
| RTX 3090 | 24GB | ดีมาก | มี | 25,000-35,000 | Large models |
| RTX 4070 Ti | 12GB | ดีมาก | มี | 20,000-25,000 | Best value |
| RTX 4090 | 24GB | ดีที่สุด | มี | 55,000-70,000 | Pro home lab |
| Tesla T4 | 16GB | ดี | Optimized | 15,000-20,000 used | Server inference |
| A100 40GB | 40GB | ดีที่สุด | Optimized | 200,000+ | Enterprise |
Model Conversion
# === TensorRT Model Conversion ===
# Step 1: PyTorch → ONNX
# import torch
# import torchvision.models as models
#
# model = models.resnet50(pretrained=True).eval().cuda()
# dummy_input = torch.randn(1, 3, 224, 224).cuda()
#
# torch.onnx.export(
# model, dummy_input, "resnet50.onnx",
# input_names=["input"],
# output_names=["output"],
# dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
# opset_version=17
# )
# Step 2: ONNX → TensorRT Engine
# trtexec command (note: --workspace is deprecated since TensorRT 8.4; prefer --memPoolSize=workspace:4096M):
# trtexec --onnx=resnet50.onnx \
# --saveEngine=resnet50_fp16.engine \
# --fp16 \
# --workspace=4096 \
# --minShapes=input:1x3x224x224 \
# --optShapes=input:8x3x224x224 \
# --maxShapes=input:32x3x224x224 \
# --verbose
# Step 3: INT8 with Calibration
# trtexec --onnx=resnet50.onnx \
# --saveEngine=resnet50_int8.engine \
# --int8 \
# --calib=calibration_cache.bin \
# --workspace=4096
# Torch-TensorRT (direct conversion)
# import torch_tensorrt
#
# trt_model = torch_tensorrt.compile(model,
# inputs=[torch_tensorrt.Input(
# min_shape=[1, 3, 224, 224],
# opt_shape=[8, 3, 224, 224],
# max_shape=[32, 3, 224, 224],
# dtype=torch.half
# )],
# enabled_precisions={torch.half}
# )
# torch.jit.save(trt_model, "resnet50_trt.ts")
from dataclasses import dataclass
@dataclass
class ConversionMethod:
    """One route for turning a trained model into a TensorRT engine."""

    method: str      # short label shown in the report
    pipeline: str    # conversion path, e.g. framework -> intermediate -> engine
    precision: str   # precisions supported by this route
    complexity: str  # difficulty rating (Thai: easy / medium / hard)
    best_for: str    # scenario where this route shines


# Conversion routes covered in this lab, in the order they are printed.
_METHOD_ROWS = [
    ("ONNX → trtexec", "PyTorch → ONNX → TensorRT", "FP32 FP16 INT8", "ง่าย",
     "Standard models, CLI-based workflow"),
    ("Torch-TensorRT", "PyTorch → TensorRT direct", "FP32 FP16", "ง่าย",
     "PyTorch-native, quick prototyping"),
    ("TensorRT Python API", "Manual network building", "FP32 FP16 INT8", "ยาก",
     "Custom layers, maximum control"),
    ("TF-TRT", "TensorFlow → TensorRT", "FP32 FP16 INT8", "ปานกลาง",
     "TensorFlow models, SavedModel format"),
]
methods = [ConversionMethod(*row) for row in _METHOD_ROWS]

print("=== Conversion Methods ===")
for entry in methods:
    # Three-line summary per conversion route.
    print(f" [{entry.method}] Pipeline: {entry.pipeline}")
    print(f" Precision: {entry.precision} | Complexity: {entry.complexity}")
    print(f" Best for: {entry.best_for}")
INT8 Calibration
# === INT8 Calibration ===
# import tensorrt as trt
# import pycuda.driver as cuda
# import numpy as np
# import os  # needed by read_calibration_cache (os.path.exists)
#
# class Int8Calibrator(trt.IInt8EntropyCalibrator2):
# def __init__(self, data_loader, cache_file="calibration.cache"):
# super().__init__()
# self.data_loader = data_loader
# self.cache_file = cache_file
# self.batch_iter = iter(data_loader)
# self.device_input = cuda.mem_alloc(1 * 3 * 224 * 224 * 4)
#
# def get_batch_size(self):
# return 1
#
# def get_batch(self, names):
# try:
# batch = next(self.batch_iter)
# cuda.memcpy_htod(self.device_input, batch.numpy().ravel())
# return [int(self.device_input)]
# except StopIteration:
# return None
#
# def read_calibration_cache(self):
# if os.path.exists(self.cache_file):
# with open(self.cache_file, "rb") as f:
# return f.read()
# return None
#
# def write_calibration_cache(self, cache):
# with open(self.cache_file, "wb") as f:
# f.write(cache)
@dataclass
class PrecisionResult:
    """Measured inference stats for one precision mode of ResNet50."""

    precision: str     # FP32 / FP16 / INT8 / INT8 Mixed
    model_size: str    # serialized engine size on disk
    latency_ms: float  # per-batch latency in milliseconds
    throughput: str    # images per second
    accuracy: str      # top-1 accuracy after conversion
    memory_mb: int     # VRAM footprint while serving


# Benchmark numbers gathered on an RTX 4070 Ti; the FP32 row is the baseline.
results = [
    PrecisionResult("FP32", "98 MB", 8.5, "118 img/s", "76.13%", 450),
    PrecisionResult("FP16", "49 MB", 3.2, "312 img/s", "76.11%", 280),
    PrecisionResult("INT8", "25 MB", 1.8, "555 img/s", "75.89%", 180),
    PrecisionResult("INT8 Mixed", "35 MB", 2.1, "476 img/s", "76.05%", 210),
]

print("=== Precision Comparison (ResNet50, RTX 4070 Ti) ===")
for row in results:
    print(f" [{row.precision}] Size: {row.model_size} | Latency: {row.latency_ms}ms")
    print(f" Throughput: {row.throughput} | Accuracy: {row.accuracy} | VRAM: {row.memory_mb}MB")

# Speedups relative to the FP32 baseline latency (same literals as the table,
# so the ratios are bit-identical to hard-coded 8.5/3.2 and 8.5/1.8).
_baseline_ms = results[0].latency_ms
speedup_fp16 = _baseline_ms / results[1].latency_ms
speedup_int8 = _baseline_ms / results[2].latency_ms
print(f"\n FP16 Speedup: {speedup_fp16:.1f}x vs FP32")
print(f" INT8 Speedup: {speedup_int8:.1f}x vs FP32")
Home Lab Setup
# === Home Lab Configuration ===
@dataclass
class LabComponent:
    """One line item of the TensorRT home-lab build."""

    component: str  # part category (GPU, CPU, ...)
    spec: str       # chosen model / capacity
    purpose: str    # why this part matters for TensorRT work
    cost: str       # approximate price in THB, or "Free"/"Included"


lab = [
    LabComponent("GPU", "RTX 4070 Ti 12GB", "TensorRT inference engine", "22,000 THB"),
    LabComponent("CPU", "AMD Ryzen 7 5800X", "Data preprocessing, model export", "8,000 THB"),
    LabComponent("RAM", "64GB DDR4 3200MHz", "Large model loading, batch processing", "5,000 THB"),
    LabComponent("Storage", "1TB NVMe SSD + 2TB HDD", "Models on SSD, datasets on HDD", "3,500 THB"),
    LabComponent("OS", "Ubuntu 22.04 LTS", "CUDA + TensorRT native support", "Free"),
    LabComponent("Software", "Docker + NVIDIA Container Toolkit", "Isolated TensorRT environments", "Free"),
    LabComponent("Network", "1Gbps Ethernet + SSH", "Remote access, model serving", "Included"),
    LabComponent("UPS", "600VA", "Protect GPU from power loss", "2,500 THB"),
]

# Approximate total in THB: sum of the priced items above
# (22,000 + 8,000 + 5,000 + 3,500 + 2,500).
total_cost = 41000

print("Home Lab Build:")
# Loop variable renamed from `l`: PEP 8 (E741) forbids the ambiguous
# single-letter names l/O/I, which are easily confused with 1/0.
for item in lab:
    print(f" [{item.component}] {item.spec}")
    print(f" Purpose: {item.purpose} | Cost: {item.cost}")
print(f"\n Total: ~{total_cost:,} THB")
# Docker Setup
# docker run --gpus all -it --rm \
# -v $(pwd)/models:/models \
# -v $(pwd)/data:/data \
# nvcr.io/nvidia/tensorrt:23.12-py3 \
# bash
# nvidia-smi # Check GPU
# trtexec --onnx=/models/resnet50.onnx --fp16 --saveEngine=/models/resnet50.engine
# Ordered checklist for standing up the TensorRT environment; dict insertion
# order preserves the step sequence 1-8.
setup_steps = {
    "1. Install NVIDIA Driver": "sudo apt install nvidia-driver-535",
    "2. Install Docker": "curl -fsSL https://get.docker.com | sh",
    "3. Install NVIDIA Container": "apt install nvidia-container-toolkit",
    "4. Pull TensorRT Image": "docker pull nvcr.io/nvidia/tensorrt:23.12-py3",
    "5. Run Container": "docker run --gpus all -it nvcr.io/nvidia/tensorrt:23.12-py3",
    "6. Test GPU": "nvidia-smi inside container",
    "7. Convert Model": "trtexec --onnx=model.onnx --fp16 --saveEngine=model.engine",
    "8. Benchmark": "trtexec --loadEngine=model.engine --batch=8",
}

# Plain string literal: original used an f-string with no placeholders (F541).
print("\n\nSetup Steps:")
for step, command in setup_steps.items():
    print(f" [{step}]: {command}")
เคล็ดลับ
- FP16: เริ่มด้วย FP16 ก่อน ง่ายและเร็วขึ้น 2-3 เท่า
- INT8: ใช้ INT8 เมื่อต้องการ Throughput สูงสุด ตรวจ Accuracy
- Docker: ใช้ NVIDIA TensorRT Docker Image ลดปัญหา Version
- Dynamic: ใช้ Dynamic Shapes สำหรับ Variable Batch Size
- Benchmark: ใช้ trtexec benchmark ทุก Model ก่อน Deploy
TensorRT คืออะไร
NVIDIA SDK High-performance Inference GPU PyTorch TensorFlow ONNX Engine Layer Fusion Kernel Auto-tuning FP16 INT8 เร็วขึ้น 2-10 เท่า Memory น้อยลง
ตั้ง Home Lab อย่างไร
RTX 3060 12GB ขึ้นไป Ubuntu 22.04 CUDA cuDNN TensorRT Docker nvidia/cuda RAM 32GB SSD NVMe nvidia-smi Jupyter SSH Remote
แปลง Model เป็น TensorRT อย่างไร
PyTorch ONNX export trtexec Engine FP16 INT8 Torch-TensorRT direct TensorRT Python API Dynamic Shapes trtexec benchmark Throughput Latency
INT8 Quantization ทำอย่างไร
ลด 4 เท่า เร็ว 2-4 เท่า Calibration Dataset 500-1000 ภาพ Scale Factor Cache Accuracy Drop 1% Mixed Precision Layer-wise Control
สรุป
TensorRT Optimization Home Lab NVIDIA GPU ONNX FP16 INT8 Quantization Calibration Engine trtexec Docker Benchmark Latency Throughput Production Deployment
