TensorRT Optimization Home Lab Setup

TensorRT Home Lab

TensorRT Optimization Home Lab NVIDIA GPU Inference ONNX FP16 INT8 Quantization Engine Building Benchmark Latency Throughput Production Deployment

GPU	VRAM	TensorRT FP16	INT8 Support	Price (THB)	เหมาะกับ
RTX 3060	12GB	ดี	มี	10,000-13,000	Entry home lab
RTX 3090	24GB	ดีมาก	มี	25,000-35,000	Large models
RTX 4070 Ti	12GB	ดีมาก	มี	20,000-25,000	Best value
RTX 4090	24GB	ดีที่สุด	มี	55,000-70,000	Pro home lab
Tesla T4	16GB	ดี	Optimized	15,000-20,000 used	Server inference
A100 40GB	40GB	ดีที่สุด	Optimized	200,000+	Enterprise

Model Conversion

=== TensorRT Model Conversion ===

อ่านเพิ่ม: Medusa Commerce Batch Processing Pipeline | SiamCafe Blog · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog · อ่านเพิ่ม: ModSecurity WAF Monitoring และ Alerting | SiamCafe Blog

Step 1: PyTorch → ONNX

import torch

import torchvision.models as models

# Load an ImageNet-pretrained ResNet-50, switch it to inference mode and move
# it to the GPU. `weights=` replaces the deprecated `pretrained=True` flag;
# IMAGENET1K_V1 is the checkpoint that `pretrained=True` selected.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1).eval().cuda()

# Example input used to trace the model during export (batch of one 224x224 RGB image).
dummy_input = torch.randn(1, 3, 224, 224).cuda()

torch.onnx.export(
    model,
    dummy_input,
    "resnet50.onnx",
    input_names=["input"],
    output_names=["output"],
    # Mark axis 0 of both tensors as dynamic so the exported graph is not
    # pinned to the batch size of `dummy_input`.
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=17,
)

Step 2: ONNX → TensorRT Engine

trtexec command:

# Build an FP16 engine from the ONNX model with a dynamic batch dimension
# (min 1, tuned for 8, max 32). Every continued line must follow its "\"
# directly: a blank line after "\" ends the command.
# --memPoolSize=workspace:<MiB> supersedes the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
  --saveEngine=resnet50_fp16.engine \
  --fp16 \
  --memPoolSize=workspace:4096 \
  --minShapes=input:1x3x224x224 \
  --optShapes=input:8x3x224x224 \
  --maxShapes=input:32x3x224x224 \
  --verbose

Step 3: INT8 with Calibration

# Build an INT8 engine from the ONNX model, reading quantization scales from
# the calibration cache file given to --calib.
# --memPoolSize=workspace:<MiB> supersedes the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
  --saveEngine=resnet50_int8.engine \
  --int8 \
  --calib=calibration_cache.bin \
  --memPoolSize=workspace:4096

Torch-TensorRT (direct conversion)

import torch_tensorrt

# Compile the PyTorch module directly with Torch-TensorRT, using FP16 and a
# dynamic batch dimension (min 1, tuned for 8, max 32).
trt_model = torch_tensorrt.compile(
    model,
    # Request the TorchScript frontend explicitly so the result is a
    # TorchScript module that torch.jit.save can serialize below.
    ir="ts",
    inputs=[
        torch_tensorrt.Input(
            min_shape=[1, 3, 224, 224],
            opt_shape=[8, 3, 224, 224],
            max_shape=[32, 3, 224, 224],
            dtype=torch.half,
        )
    ],
    enabled_precisions={torch.half},
)

# Persist the compiled module so it can be reloaded with torch.jit.load.
torch.jit.save(trt_model, "resnet50_trt.ts")

from dataclasses import dataclass

@dataclass
class ConversionMethod:
    """One route for converting a trained model into a TensorRT engine."""

    method: str      # short name of the conversion route
    pipeline: str    # sequence of formats/tools the model passes through
    precision: str   # space-separated precisions listed for this route
    complexity: str  # difficulty label shown in the report
    best_for: str    # typical use case

# Catalogue of the conversion routes compared in this section.
methods = [
    ConversionMethod("ONNX → trtexec", "PyTorch → ONNX → TensorRT", "FP32 FP16 INT8", "ง่าย",
                     "Standard models, CLI-based workflow"),
    ConversionMethod("Torch-TensorRT", "PyTorch → TensorRT direct", "FP32 FP16", "ง่าย",
                     "PyTorch-native, quick prototyping"),
    ConversionMethod("TensorRT Python API", "Manual network building", "FP32 FP16 INT8", "ยาก",
                     "Custom layers, maximum control"),
    ConversionMethod("TF-TRT", "TensorFlow → TensorRT", "FP32 FP16 INT8", "ปานกลาง",
                     "TensorFlow models, SavedModel format"),
]

# Print a three-line summary for every conversion route.
print("=== Conversion Methods ===")
for m in methods:
    print(f" [{m.method}] Pipeline: {m.pipeline}")
    print(f" Precision: {m.precision} | Complexity: {m.complexity}")
    print(f" Best for: {m.best_for}")

INT8 Calibration

=== INT8 Calibration ===

import tensorrt as trt

import pycuda.driver as cuda

import numpy as np

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds batches from a data loader to TensorRT.

    Batches are copied into one reusable device buffer. Calibration results
    are cached on disk so later engine builds can skip calibration.

    NOTE(review): pycuda needs an active CUDA context before ``mem_alloc`` is
    called (for example via ``import pycuda.autoinit``) — confirm one is
    created elsewhere.
    """

    def __init__(self, data_loader, cache_file="calibration.cache",
                 batch_size=1, input_shape=(3, 224, 224)):
        """Set up the batch iterator and allocate the device input buffer.

        Args:
            data_loader: Iterable yielding batches that expose ``.numpy()``.
            cache_file: Path used to read and write the calibration cache.
            batch_size: Number of samples in every calibration batch.
            input_shape: Per-sample shape, excluding the batch dimension.
        """
        # The TensorRT base class must be initialised before the object is used.
        super().__init__()
        self.data_loader = data_loader
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.batch_iter = iter(data_loader)
        # Batches are uploaded as float32, i.e. 4 bytes per element.
        self.input_nbytes = (
            batch_size * int(np.prod(input_shape)) * np.dtype(np.float32).itemsize
        )
        self.device_input = cuda.mem_alloc(self.input_nbytes)

    def get_batch_size(self):
        """Return the batch size used for calibration."""
        return self.batch_size

    def get_batch(self, names):
        """Upload the next batch and return its device pointer.

        Returns:
            A one-element list holding the device buffer address, or ``None``
            once the data loader is exhausted.

        Raises:
            ValueError: If the batch does not match the allocated buffer size.
        """
        try:
            batch = next(self.batch_iter)
        except StopIteration:
            return None

        # memcpy_htod needs a contiguous host buffer of the expected dtype.
        host = np.ascontiguousarray(batch.numpy(), dtype=np.float32)
        if host.nbytes != self.input_nbytes:
            # Copying a different size would overrun or underfill the buffer.
            raise ValueError(
                f"calibration batch is {host.nbytes} bytes, "
                f"expected {self.input_nbytes}"
            )
        cuda.memcpy_htod(self.device_input, host)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache exists."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache produced by TensorRT to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

@dataclass
class PrecisionResult:
    """Benchmark figures for one precision mode of the same model."""

    precision: str     # precision label, e.g. "FP16"
    model_size: str    # engine size as a display string
    latency_ms: float  # latency in milliseconds
    throughput: str    # throughput as a display string
    accuracy: str      # accuracy as a display string
    memory_mb: int     # memory use in MB (printed as VRAM in the report)

# Benchmark table, one row per precision mode.
results = [
    PrecisionResult("FP32", "98 MB", 8.5, "118 img/s", "76.13%", 450),
    PrecisionResult("FP16", "49 MB", 3.2, "312 img/s", "76.11%", 280),
    PrecisionResult("INT8", "25 MB", 1.8, "555 img/s", "75.89%", 180),
    PrecisionResult("INT8 Mixed", "35 MB", 2.1, "476 img/s", "76.05%", 210),
]

print("=== Precision Comparison (ResNet50, RTX 4070 Ti) ===")
for r in results:
    print(f" [{r.precision}] Size: {r.model_size} | Latency: {r.latency_ms}ms")
    print(f" Throughput: {r.throughput} | Accuracy: {r.accuracy} | VRAM: {r.memory_mb}MB")

# Derive the speedups from the table so they cannot drift from its latencies.
latency_by_precision = {r.precision: r.latency_ms for r in results}
speedup_fp16 = latency_by_precision["FP32"] / latency_by_precision["FP16"]
speedup_int8 = latency_by_precision["FP32"] / latency_by_precision["INT8"]

print(f"\n FP16 Speedup: {speedup_fp16:.1f}x vs FP32")
print(f" INT8 Speedup: {speedup_int8:.1f}x vs FP32")

Home Lab Setup

# === Home Lab Configuration ===

@dataclass
class LabComponent:
    """A single line item in the home-lab build list."""

    component: str  # category label, e.g. "GPU"
    spec: str       # concrete part or product chosen
    purpose: str    # what the item is used for in the lab
    cost: str       # display string, e.g. "22,000 THB" or "Free"

# Build list for the reference home lab: (component, spec, purpose, cost).
lab = [
    LabComponent(*row)
    for row in (
        ("GPU", "RTX 4070 Ti 12GB", "TensorRT inference engine", "22,000 THB"),
        ("CPU", "AMD Ryzen 7 5800X", "Data preprocessing, model export", "8,000 THB"),
        ("RAM", "64GB DDR4 3200MHz", "Large model loading, batch processing", "5,000 THB"),
        ("Storage", "1TB NVMe SSD + 2TB HDD", "Models on SSD, datasets on HDD", "3,500 THB"),
        ("OS", "Ubuntu 22.04 LTS", "CUDA + TensorRT native support", "Free"),
        ("Software", "Docker + NVIDIA Container Toolkit", "Isolated TensorRT environments", "Free"),
        ("Network", "1Gbps Ethernet + SSH", "Remote access, model serving", "Included"),
        ("UPS", "600VA", "Protect GPU from power loss", "2,500 THB"),
    )
]

# Sum of the priced items above: 22,000 + 8,000 + 5,000 + 3,500 + 2,500.
total_cost = 41000

print("Home Lab Build:")
for item in lab:
    heading = f"  [{item.component}] {item.spec}"
    details = f"    Purpose: {item.purpose} | Cost: {item.cost}"
    print(heading)
    print(details)
print(f"\n  Total: ~{total_cost:,} THB")

# Docker Setup
# docker run --gpus all -it --rm \
#   -v $(pwd)/models:/models \
#   -v $(pwd)/data:/data \
#   nvcr.io/nvidia/tensorrt:23.12-py3 \
#   bash

# nvidia-smi  # Check GPU
# trtexec --onnx=/models/resnet50.onnx --fp16 --saveEngine=/models/resnet50.engine

# Ordered checklist mapping each step label to the command or action to run.
setup_steps = dict(
    (
        ("1. Install NVIDIA Driver", "sudo apt install nvidia-driver-535"),
        ("2. Install Docker", "curl -fsSL https://get.docker.com | sh"),
        ("3. Install NVIDIA Container", "apt install nvidia-container-toolkit"),
        ("4. Pull TensorRT Image", "docker pull nvcr.io/nvidia/tensorrt:23.12-py3"),
        ("5. Run Container", "docker run --gpus all -it nvcr.io/nvidia/tensorrt:23.12-py3"),
        ("6. Test GPU", "nvidia-smi inside container"),
        ("7. Convert Model", "trtexec --onnx=model.onnx --fp16 --saveEngine=model.engine"),
        ("8. Benchmark", "trtexec --loadEngine=model.engine --batch=8"),
    )
)

# Print the checklist in insertion order, one step per line.
print("\n\nSetup Steps:")
for step, command in setup_steps.items():
    print(f"  [{step}]: {command}")

เคล็ดลับ

FP16: เริ่มด้วย FP16 ก่อน ง่ายและเร็วขึ้น 2-3 เท่า
INT8: ใช้ INT8 เมื่อต้องการ Throughput สูงสุด ตรวจ Accuracy
Docker: ใช้ NVIDIA TensorRT Docker Image ลดปัญหา Version
Dynamic: ใช้ Dynamic Shapes สำหรับ Variable Batch Size
Benchmark: ใช้ trtexec benchmark ทุก Model ก่อน Deploy

TensorRT คืออะไร

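TensorRT คือ SDK ของ NVIDIA สำหรับ High-performance Inference บน GPU รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX โดยแปลงโมเดลเป็น Engine ที่ปรับแต่งด้วย Layer Fusion, Kernel Auto-tuning และ Precision แบบ FP16/INT8 ทำให้ Inference เร็วขึ้น 2-10 เท่า และใช้ Memory น้อยลง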