TensorRT Optimization Home Lab Setup
TensorRT Home Lab

TensorRT Optimization Home Lab NVIDIA GPU Inference ONNX FP16 INT8 Quantization Engine Building Benchmark Latency Throughput Production Deployment
| GPU | VRAM | TensorRT FP16 | INT8 Support | Price (THB) | เหมาะกับ |
|---|---|---|---|---|---|
| RTX 3060 | 12GB | ดี | มี | 10,000-13,000 | Entry home lab |
| RTX 3090 | 24GB | ดีมาก | มี | 25,000-35,000 | Large models |
| RTX 4070 Ti | 12GB | ดีมาก | มี | 20,000-25,000 | Best value |
| RTX 4090 | 24GB | ดีที่สุด | มี | 55,000-70,000 | Pro home lab |
| Tesla T4 | 16GB | ดี | Optimized | 15,000-20,000 used | Server inference |
| A100 40GB | 40GB | ดีที่สุด | Optimized | 200,000+ | Enterprise |
Model Conversion
=== TensorRT Model Conversion ===
อ่านเพิ่ม: Medusa Commerce Batch Processing Pipeline | SiamCafe Blog · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog · อ่านเพิ่ม: ModSecurity WAF Monitoring และ Alerting | SiamCafe Blog
Step 1: PyTorch → ONNX
import torch
import torchvision.models as models

# Load an ImageNet-pretrained ResNet-50 in inference mode on the GPU.
# `pretrained=True` is deprecated in torchvision's multi-weight API;
# IMAGENET1K_V1 is the weight set that flag used to select, so the exported
# network is unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1).eval().cuda()

# A single 224x224 RGB sample is enough to trace the graph for export.
dummy_input = torch.randn(1, 3, 224, 224).cuda()

torch.onnx.export(
    model,
    dummy_input,
    "resnet50.onnx",
    input_names=["input"],
    output_names=["output"],
    # Dimension 0 is exported as a named dynamic axis so the engine can later
    # be built for a range of batch sizes (see the min/opt/max shapes below).
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=17,
)
Step 2: ONNX → TensorRT Engine
trtexec command:
# Build an FP16 engine with a dynamic batch range of 1..32 (tuned for 8).
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
  --saveEngine=resnet50_fp16.engine \
  --fp16 \
  --memPoolSize=workspace:4096 \
  --minShapes=input:1x3x224x224 \
  --optShapes=input:8x3x224x224 \
  --maxShapes=input:32x3x224x224 \
  --verbose
Step 3: INT8 with Calibration
# Build an INT8 engine from an existing calibration cache.
# calibration.cache is the default file name written by the Int8Calibrator
# class shown later in this article; adjust if you used another cache_file.
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
  --saveEngine=resnet50_int8.engine \
  --int8 \
  --calib=calibration.cache \
  --memPoolSize=workspace:4096
Torch-TensorRT (direct conversion)
import torch_tensorrt

# Compile the eager ResNet-50 straight to a TensorRT-backed module.
# NOTE: `model` and `torch` come from the Step 1 snippet above.
trt_model = torch_tensorrt.compile(
    model,
    # Force the TorchScript frontend: torch.jit.save() below needs a
    # ScriptModule, which the default frontend of Torch-TensorRT 2.x does not
    # return.
    ir="ts",
    inputs=[
        torch_tensorrt.Input(
            # Same dynamic batch range as the trtexec build: 1..32, tuned for 8.
            min_shape=[1, 3, 224, 224],
            opt_shape=[8, 3, 224, 224],
            max_shape=[32, 3, 224, 224],
            dtype=torch.half,
        )
    ],
    enabled_precisions={torch.half},
)

torch.jit.save(trt_model, "resnet50_trt.ts")
from dataclasses import dataclass


@dataclass
class ConversionMethod:
    """One route for converting a trained model into a TensorRT engine."""

    method: str      # display name of the route
    pipeline: str    # conversion stages, source framework first
    precision: str   # space-separated precisions listed for the route
    complexity: str  # difficulty label (Thai text, printed as-is)
    best_for: str    # short description of the typical use case


methods = [
    ConversionMethod(
        "ONNX → trtexec", "PyTorch → ONNX → TensorRT", "FP32 FP16 INT8", "ง่าย",
        "Standard models, CLI-based workflow",
    ),
    ConversionMethod(
        "Torch-TensorRT", "PyTorch → TensorRT direct", "FP32 FP16", "ง่าย",
        "PyTorch-native, quick prototyping",
    ),
    ConversionMethod(
        "TensorRT Python API", "Manual network building", "FP32 FP16 INT8", "ยาก",
        "Custom layers, maximum control",
    ),
    ConversionMethod(
        "TF-TRT", "TensorFlow → TensorRT", "FP32 FP16 INT8", "ปานกลาง",
        "TensorFlow models, SavedModel format",
    ),
]

# Print a three-line summary per conversion route.
print("=== Conversion Methods ===")
for m in methods:
    print(f" [{m.method}] Pipeline: {m.pipeline}")
    print(f" Precision: {m.precision} | Complexity: {m.complexity}")
    print(f" Best for: {m.best_for}")
INT8 Calibration
=== INT8 Calibration ===
import tensorrt as trt
import pycuda.autoinit  # noqa: F401  -- creates the CUDA context mem_alloc needs
import pycuda.driver as cuda
import numpy as np


class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that streams batches from a data loader to TensorRT.

    Args:
        data_loader: Iterable yielding one calibration batch per step. Each
            item may be the image batch itself or an ``(images, labels)``
            pair, in which case only the first element is used.
        cache_file: Path the calibration cache is read from and written to.
        batch_size: Number of samples in every batch the loader yields.
        input_shape: Per-sample input shape; defaults to ``(3, 224, 224)``.
    """

    def __init__(self, data_loader, cache_file="calibration.cache",
                 batch_size=1, input_shape=(3, 224, 224)):
        super().__init__()
        self.data_loader = data_loader
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.batch_iter = iter(data_loader)
        # One float32 batch; the device buffer is allocated once and reused.
        self.batch_nbytes = (
            batch_size * int(np.prod(input_shape)) * np.dtype(np.float32).itemsize
        )
        self.device_input = cuda.mem_alloc(self.batch_nbytes)

    def get_batch_size(self):
        """Return the number of samples in each calibration batch."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next batch to the device and return its pointer.

        Args:
            names: Input tensor names passed in by the caller (unused; a
                single input buffer is assumed).

        Returns:
            A one-element list holding the device address of the batch, or
            ``None`` once the data loader is exhausted.

        Raises:
            ValueError: If a batch does not fill the device buffer exactly,
                e.g. a short final batch.
        """
        # Keep the try body minimal: only exhaustion of the loader ends
        # calibration; conversion errors below must surface.
        try:
            batch = next(self.batch_iter)
        except StopIteration:
            return None

        # Loaders over labelled datasets yield (images, labels); keep images.
        if isinstance(batch, (tuple, list)):
            batch = batch[0]
        # Tensor-like batches expose .numpy(); plain arrays pass through.
        if hasattr(batch, "numpy"):
            batch = batch.numpy()

        # The device buffer is sized for float32, so force dtype and layout.
        host = np.ascontiguousarray(batch, dtype=np.float32)
        if host.nbytes != self.batch_nbytes:
            raise ValueError(
                f"calibration batch is {host.nbytes} bytes, "
                f"expected {self.batch_nbytes}"
            )

        cuda.memcpy_htod(self.device_input, host)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache exists."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
@dataclass
class PrecisionResult:
    """Benchmark figures for one precision mode of the same model."""

    precision: str     # precision label, e.g. "FP16"
    model_size: str    # engine size with unit, e.g. "49 MB"
    latency_ms: float  # latency in milliseconds
    throughput: str    # throughput with unit, e.g. "312 img/s"
    accuracy: str      # accuracy as a percentage string
    memory_mb: int     # VRAM use in MB


results = [
    PrecisionResult("FP32", "98 MB", 8.5, "118 img/s", "76.13%", 450),
    PrecisionResult("FP16", "49 MB", 3.2, "312 img/s", "76.11%", 280),
    PrecisionResult("INT8", "25 MB", 1.8, "555 img/s", "75.89%", 180),
    PrecisionResult("INT8 Mixed", "35 MB", 2.1, "476 img/s", "76.05%", 210),
]

print("=== Precision Comparison (ResNet50, RTX 4070 Ti) ===")
for r in results:
    print(f" [{r.precision}] Size: {r.model_size} | Latency: {r.latency_ms}ms")
    print(f" Throughput: {r.throughput} | Accuracy: {r.accuracy} | VRAM: {r.memory_mb}MB")

# Derive the speedups from the table so they cannot drift from the latencies.
latency_by_precision = {r.precision: r.latency_ms for r in results}
fp32_latency = latency_by_precision["FP32"]
speedup_fp16 = fp32_latency / latency_by_precision["FP16"]
speedup_int8 = fp32_latency / latency_by_precision["INT8"]
print(f"\n FP16 Speedup: {speedup_fp16:.1f}x vs FP32")
print(f" INT8 Speedup: {speedup_int8:.1f}x vs FP32")
Home Lab Setup
# === Home Lab Configuration ===
@dataclass
class LabComponent:
    """One line item of the home lab build."""

    component: str  # category, e.g. "GPU"
    spec: str       # concrete part or software chosen
    purpose: str    # what the item is used for in the lab
    cost: str       # price label such as "22,000 THB", "Free" or "Included"


lab = [
    LabComponent("GPU", "RTX 4070 Ti 12GB", "TensorRT inference engine", "22,000 THB"),
    LabComponent("CPU", "AMD Ryzen 7 5800X", "Data preprocessing, model export", "8,000 THB"),
    LabComponent("RAM", "64GB DDR4 3200MHz", "Large model loading, batch processing", "5,000 THB"),
    LabComponent("Storage", "1TB NVMe SSD + 2TB HDD", "Models on SSD, datasets on HDD", "3,500 THB"),
    LabComponent("OS", "Ubuntu 22.04 LTS", "CUDA + TensorRT native support", "Free"),
    LabComponent("Software", "Docker + NVIDIA Container Toolkit", "Isolated TensorRT environments", "Free"),
    LabComponent("Network", "1Gbps Ethernet + SSH", "Remote access, model serving", "Included"),
    LabComponent("UPS", "600VA", "Protect GPU from power loss", "2,500 THB"),
]


def _cost_thb(cost: str) -> int:
    """Return the leading THB amount of a cost label, or 0 if it has none."""
    amount = cost.partition(" ")[0].replace(",", "")
    return int(amount) if amount.isdigit() else 0


# Sum the priced items so the total always matches the list (41,000 THB here).
total_cost = sum(_cost_thb(part.cost) for part in lab)

print("Home Lab Build:")
for part in lab:
    print(f" [{part.component}] {part.spec}")
    print(f" Purpose: {part.purpose} | Cost: {part.cost}")
print(f"\n Total: ~{total_cost:,} THB")
# Docker Setup
# docker run --gpus all -it --rm \
# -v $(pwd)/models:/models \
# -v $(pwd)/data:/data \
# nvcr.io/nvidia/tensorrt:23.12-py3 \
# bash
# nvidia-smi # Check GPU
# trtexec --onnx=/models/resnet50.onnx --fp16 --saveEngine=/models/resnet50.engine
# Ordered checklist: step label -> shell command (or instruction) to run.
setup_steps = {
    "1. Install NVIDIA Driver": "sudo apt install nvidia-driver-535",
    "2. Install Docker": "curl -fsSL https://get.docker.com | sh",
    "3. Install NVIDIA Container": "sudo apt install nvidia-container-toolkit",
    "4. Pull TensorRT Image": "docker pull nvcr.io/nvidia/tensorrt:23.12-py3",
    "5. Run Container": "docker run --gpus all -it nvcr.io/nvidia/tensorrt:23.12-py3",
    "6. Test GPU": "nvidia-smi inside container",
    # Build with an explicit batch range so the batch-8 benchmark in step 8
    # falls inside the engine's optimization profile.
    "7. Convert Model": (
        "trtexec --onnx=model.onnx --fp16"
        " --minShapes=input:1x3x224x224"
        " --optShapes=input:8x3x224x224"
        " --maxShapes=input:32x3x224x224"
        " --saveEngine=model.engine"
    ),
    # --batch only applies to implicit-batch engines; engines built from ONNX
    # are explicit-batch, so the batch size is selected with --shapes.
    "8. Benchmark": "trtexec --loadEngine=model.engine --shapes=input:8x3x224x224",
}

print("\n\nSetup Steps:")
for step, command in setup_steps.items():
    print(f" [{step}]: {command}")
เคล็ดลับ
- FP16: เริ่มด้วย FP16 ก่อน ง่ายและเร็วขึ้น 2-3 เท่า
- INT8: ใช้ INT8 เมื่อต้องการ Throughput สูงสุด ตรวจ Accuracy
- Docker: ใช้ NVIDIA TensorRT Docker Image ลดปัญหา Version
- Dynamic: ใช้ Dynamic Shapes สำหรับ Variable Batch Size
- Benchmark: ใช้ trtexec benchmark ทุก Model ก่อน Deploy
TensorRT คืออะไร
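TensorRT คือ SDK ของ NVIDIA สำหรับ High-performance Inference บน GPU รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX โดยแปลงโมเดลเป็น Engine ที่ผ่านการทำ Layer Fusion และ Kernel Auto-tuning พร้อมรองรับ FP16 และ INT8 ช่วยให้เร็วขึ้น 2-10 เท่า และใช้ Memory น้อยลง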





