SiamCafe.net Blog
Technology

TensorRT Optimization Home Lab Setup

tensorrt optimization home lab setup
TensorRT Optimization Home Lab Setup | SiamCafe Blog
2025-07-16 · อ. บอม — SiamCafe.net · 11,104 คำ

TensorRT Home Lab

TensorRT Optimization Home Lab NVIDIA GPU Inference ONNX FP16 INT8 Quantization Engine Building Benchmark Latency Throughput Production Deployment

| GPU         | VRAM | TensorRT FP16 | INT8 Support | Price (THB)        | เหมาะกับ         |
|-------------|------|---------------|--------------|--------------------|------------------|
| RTX 3060    | 12GB | ดี            | มี           | 10,000-13,000      | Entry home lab   |
| RTX 3090    | 24GB | ดีมาก         | มี           | 25,000-35,000      | Large models     |
| RTX 4070 Ti | 12GB | ดีมาก         | มี           | 20,000-25,000      | Best value       |
| RTX 4090    | 24GB | ดีที่สุด        | มี           | 55,000-70,000      | Pro home lab     |
| Tesla T4    | 16GB | ดี            | Optimized    | 15,000-20,000 used | Server inference |
| A100 40GB   | 40GB | ดีที่สุด        | Optimized    | 200,000+           | Enterprise       |

Model Conversion

# === TensorRT Model Conversion ===

# Step 1: PyTorch → ONNX
# import torch
# import torchvision.models as models
#
# model = models.resnet50(pretrained=True).eval().cuda()
# dummy_input = torch.randn(1, 3, 224, 224).cuda()
#
# torch.onnx.export(
#     model, dummy_input, "resnet50.onnx",
#     input_names=["input"],
#     output_names=["output"],
#     dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
#     opset_version=17
# )

# Step 2: ONNX → TensorRT Engine
# trtexec command:
# trtexec --onnx=resnet50.onnx \
#         --saveEngine=resnet50_fp16.engine \
#         --fp16 \
#         --workspace=4096 \
#         --minShapes=input:1x3x224x224 \
#         --optShapes=input:8x3x224x224 \
#         --maxShapes=input:32x3x224x224 \
#         --verbose

# Step 3: INT8 with Calibration
# trtexec --onnx=resnet50.onnx \
#         --saveEngine=resnet50_int8.engine \
#         --int8 \
#         --calib=calibration_cache.bin \
#         --workspace=4096

# Torch-TensorRT (direct conversion)
# import torch_tensorrt
#
# trt_model = torch_tensorrt.compile(model,
#     inputs=[torch_tensorrt.Input(
#         min_shape=[1, 3, 224, 224],
#         opt_shape=[8, 3, 224, 224],
#         max_shape=[32, 3, 224, 224],
#         dtype=torch.half
#     )],
#     enabled_precisions={torch.half}
# )
# torch.jit.save(trt_model, "resnet50_trt.ts")

from dataclasses import dataclass

@dataclass
class ConversionMethod:
    """One route for converting a trained model into a TensorRT engine.

    All fields are display strings used verbatim in the comparison printout.
    """
    method: str      # short label, e.g. "ONNX → trtexec"
    pipeline: str    # end-to-end toolchain description
    precision: str   # precision modes this route supports
    complexity: str  # difficulty in Thai ("ง่าย" = easy, "ยาก" = hard)
    best_for: str    # recommended use case

# Raw rows kept as tuples; expanded into dataclass instances below.
_METHOD_ROWS = (
    ("ONNX → trtexec", "PyTorch → ONNX → TensorRT", "FP32 FP16 INT8", "ง่าย",
     "Standard models, CLI-based workflow"),
    ("Torch-TensorRT", "PyTorch → TensorRT direct", "FP32 FP16", "ง่าย",
     "PyTorch-native, quick prototyping"),
    ("TensorRT Python API", "Manual network building", "FP32 FP16 INT8", "ยาก",
     "Custom layers, maximum control"),
    ("TF-TRT", "TensorFlow → TensorRT", "FP32 FP16 INT8", "ปานกลาง",
     "TensorFlow models, SavedModel format"),
)
methods = [ConversionMethod(*row) for row in _METHOD_ROWS]

print("=== Conversion Methods ===")
for entry in methods:
    print(f"  [{entry.method}] Pipeline: {entry.pipeline}")
    print(f"    Precision: {entry.precision} | Complexity: {entry.complexity}")
    print(f"    Best for: {entry.best_for}")

INT8 Calibration

# === INT8 Calibration ===

# import os
# import tensorrt as trt
# import pycuda.driver as cuda
# import numpy as np
#
# class Int8Calibrator(trt.IInt8EntropyCalibrator2):
#     def __init__(self, data_loader, cache_file="calibration.cache"):
#         super().__init__()
#         self.data_loader = data_loader
#         self.cache_file = cache_file
#         self.batch_iter = iter(data_loader)
#         self.device_input = cuda.mem_alloc(1 * 3 * 224 * 224 * 4)
#
#     def get_batch_size(self):
#         return 1
#
#     def get_batch(self, names):
#         try:
#             batch = next(self.batch_iter)
#             cuda.memcpy_htod(self.device_input, batch.numpy().ravel())
#             return [int(self.device_input)]
#         except StopIteration:
#             return None
#
#     def read_calibration_cache(self):
#         if os.path.exists(self.cache_file):
#             with open(self.cache_file, "rb") as f:
#                 return f.read()
#         return None
#
#     def write_calibration_cache(self, cache):
#         with open(self.cache_file, "wb") as f:
#             f.write(cache)

@dataclass
class PrecisionResult:
    """Benchmark row for one precision mode (ResNet50 on an RTX 4070 Ti)."""
    precision: str   # precision label, e.g. "FP32"
    model_size: str  # serialized engine size as display text
    latency_ms: float
    throughput: str  # images/second as display text
    accuracy: str    # top-1 accuracy as display text
    memory_mb: int   # peak VRAM use

results = [
    PrecisionResult(*row)
    for row in (
        ("FP32", "98 MB", 8.5, "118 img/s", "76.13%", 450),
        ("FP16", "49 MB", 3.2, "312 img/s", "76.11%", 280),
        ("INT8", "25 MB", 1.8, "555 img/s", "75.89%", 180),
        ("INT8 Mixed", "35 MB", 2.1, "476 img/s", "76.05%", 210),
    )
]

print("=== Precision Comparison (ResNet50, RTX 4070 Ti) ===")
for row in results:
    print(f"  [{row.precision}] Size: {row.model_size} | Latency: {row.latency_ms}ms")
    print(f"    Throughput: {row.throughput} | Accuracy: {row.accuracy} | VRAM: {row.memory_mb}MB")

# Speedups relative to the FP32 baseline; the latencies are the same
# literals as the table rows, so the ratios are unchanged (8.5/3.2, 8.5/1.8).
speedup_fp16 = results[0].latency_ms / results[1].latency_ms
speedup_int8 = results[0].latency_ms / results[2].latency_ms
print(f"\n  FP16 Speedup: {speedup_fp16:.1f}x vs FP32")
print(f"  INT8 Speedup: {speedup_int8:.1f}x vs FP32")

Home Lab Setup

# === Home Lab Configuration ===

@dataclass
class LabComponent:
    """A single hardware/software line item in the home-lab build."""
    component: str  # part category, e.g. "GPU"
    spec: str       # concrete model/spec chosen
    purpose: str    # role it plays in the TensorRT workflow
    cost: str       # price as display text (THB amount, "Free", or "Included")

lab = [
    LabComponent("GPU", "RTX 4070 Ti 12GB", "TensorRT inference engine", "22,000 THB"),
    LabComponent("CPU", "AMD Ryzen 7 5800X", "Data preprocessing, model export", "8,000 THB"),
    LabComponent("RAM", "64GB DDR4 3200MHz", "Large model loading, batch processing", "5,000 THB"),
    LabComponent("Storage", "1TB NVMe SSD + 2TB HDD", "Models on SSD, datasets on HDD", "3,500 THB"),
    LabComponent("OS", "Ubuntu 22.04 LTS", "CUDA + TensorRT native support", "Free"),
    LabComponent("Software", "Docker + NVIDIA Container Toolkit", "Isolated TensorRT environments", "Free"),
    LabComponent("Network", "1Gbps Ethernet + SSH", "Remote access, model serving", "Included"),
    LabComponent("UPS", "600VA", "Protect GPU from power loss", "2,500 THB"),
]

# Sum of the THB-priced items above: 22,000 + 8,000 + 5,000 + 3,500 + 2,500.
total_cost = 41000
print("Home Lab Build:")
# Was `for l in lab:` — PEP 8 forbids `l` as a name (indistinguishable from 1/I).
for item in lab:
    print(f"  [{item.component}] {item.spec}")
    print(f"    Purpose: {item.purpose} | Cost: {item.cost}")
print(f"\n  Total: ~{total_cost:,} THB")

# Docker Setup
# docker run --gpus all -it --rm \
#   -v $(pwd)/models:/models \
#   -v $(pwd)/data:/data \
#   nvcr.io/nvidia/tensorrt:23.12-py3 \
#   bash

# nvidia-smi  # Check GPU
# trtexec --onnx=/models/resnet50.onnx --fp16 --saveEngine=/models/resnet50.engine

# Ordered checklist: from a bare Ubuntu install to a benchmarked engine.
setup_steps = {
    "1. Install NVIDIA Driver": "sudo apt install nvidia-driver-535",
    "2. Install Docker": "curl -fsSL https://get.docker.com | sh",
    "3. Install NVIDIA Container": "apt install nvidia-container-toolkit",
    "4. Pull TensorRT Image": "docker pull nvcr.io/nvidia/tensorrt:23.12-py3",
    "5. Run Container": "docker run --gpus all -it nvcr.io/nvidia/tensorrt:23.12-py3",
    "6. Test GPU": "nvidia-smi inside container",
    "7. Convert Model": "trtexec --onnx=model.onnx --fp16 --saveEngine=model.engine",
    "8. Benchmark": "trtexec --loadEngine=model.engine --batch=8",
}

# Fixed ruff F541: the header was an f-string with no placeholders.
print("\n\nSetup Steps:")
for step, command in setup_steps.items():
    print(f"  [{step}]: {command}")

เคล็ดลับ

TensorRT คืออะไร

NVIDIA SDK High-performance Inference GPU PyTorch TensorFlow ONNX Engine Layer Fusion Kernel Auto-tuning FP16 INT8 เร็วขึ้น 2-10 เท่า Memory น้อยลง

ตั้ง Home Lab อย่างไร

RTX 3060 12GB ขึ้นไป Ubuntu 22.04 CUDA cuDNN TensorRT Docker nvidia/cuda RAM 32GB SSD NVMe nvidia-smi Jupyter SSH Remote

แปลง Model เป็น TensorRT อย่างไร

PyTorch ONNX export trtexec Engine FP16 INT8 Torch-TensorRT direct TensorRT Python API Dynamic Shapes trtexec benchmark Throughput Latency

INT8 Quantization ทำอย่างไร

ลด 4 เท่า เร็ว 2-4 เท่า Calibration Dataset 500-1000 ภาพ Scale Factor Cache Accuracy Drop 1% Mixed Precision Layer-wise Control

สรุป

TensorRT Optimization Home Lab NVIDIA GPU ONNX FP16 INT8 Quantization Calibration Engine trtexec Docker Benchmark Latency Throughput Production Deployment

📖 บทความที่เกี่ยวข้อง

- OSPF Area Design Home Lab Setup — อ่านบทความ →
- TensorRT Optimization Learning Path Roadmap — อ่านบทความ →
- TensorRT Optimization Metric Collection — อ่านบทความ →
- SASE Framework Home Lab Setup — อ่านบทความ →
- TensorRT Optimization Network Segmentation — อ่านบทความ →

📚 ดูบทความทั้งหมด →