ai

TensorRT Optimization Home Lab Setup —

TensorRT Optimization Home Lab Setup —

TensorRT Home Lab

TensorRT Optimization Home Lab Setup —

TensorRT Optimization Home Lab NVIDIA GPU Inference ONNX FP16 INT8 Quantization Engine Building Benchmark Latency Throughput Production Deployment

| GPU | VRAM | TensorRT FP16 | INT8 Support | Price (THB) | เหมาะกับ |
|---|---|---|---|---|---|
| RTX 3060 | 12GB | ดี | มี | 10,000-13,000 | Entry home lab |
| RTX 3090 | 24GB | ดีมาก | มี | 25,000-35,000 | Large models |
| RTX 4070 Ti | 12GB | ดีมาก | มี | 20,000-25,000 | Best value |
| RTX 4090 | 24GB | ดีที่สุด | มี | 55,000-70,000 | Pro home lab |
| Tesla T4 | 16GB | ดี | Optimized | 15,000-20,000 used | Server inference |
| A100 40GB | 40GB | ดีที่สุด | Optimized | 200,000+ | Enterprise |

Model Conversion

=== TensorRT Model Conversion ===

อ่านเพิ่ม: Medusa Commerce Batch Processing Pipeline | SiamCafe Blog · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog · อ่านเพิ่ม: ModSecurity WAF Monitoring และ Alerting | SiamCafe Blog

Step 1: PyTorch → ONNX

import torch

import torchvision.models as models

# Load an ImageNet-pretrained ResNet-50 in eval mode on the GPU.
# `pretrained=True` is deprecated in torchvision's multi-weight API;
# IMAGENET1K_V1 is the weight set that flag used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1).eval().cuda()

# Example input used to trace the graph during export.
dummy_input = torch.randn(1, 3, 224, 224).cuda()

# Export to ONNX. Axis 0 of both tensors is marked dynamic ("batch") so the
# exported graph is not fixed to the batch size of `dummy_input`.
torch.onnx.export(
    model,
    dummy_input,
    "resnet50.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=17,
)

Step 2: ONNX → TensorRT Engine

trtexec command:

# Build an FP16 engine with a dynamic batch dimension (min 1, opt 8, max 32).
# Every continued line must end in a backslash with no blank line after it.
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
    --saveEngine=resnet50_fp16.engine \
    --fp16 \
    --memPoolSize=workspace:4096 \
    --minShapes=input:1x3x224x224 \
    --optShapes=input:8x3x224x224 \
    --maxShapes=input:32x3x224x224 \
    --verbose

Step 3: INT8 with Calibration

# Build an INT8 engine using a previously generated calibration cache.
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
trtexec --onnx=resnet50.onnx \
    --saveEngine=resnet50_int8.engine \
    --int8 \
    --calib=calibration_cache.bin \
    --memPoolSize=workspace:4096

Torch-TensorRT (direct conversion)

import torch_tensorrt

# Compile the PyTorch module directly to TensorRT with a dynamic batch
# dimension (min 1, opt 8, max 32) and FP16 kernels enabled.
# ir="ts" pins the TorchScript frontend so the result can be passed to
# torch.jit.save below.
trt_model = torch_tensorrt.compile(
    model,
    ir="ts",
    inputs=[
        torch_tensorrt.Input(
            min_shape=[1, 3, 224, 224],
            opt_shape=[8, 3, 224, 224],
            max_shape=[32, 3, 224, 224],
            dtype=torch.half,
        )
    ],
    enabled_precisions={torch.half},
)

# Persist the compiled module to disk.
torch.jit.save(trt_model, "resnet50_trt.ts")

from dataclasses import dataclass

@dataclass
class ConversionMethod:
    """One route for converting a trained model into a TensorRT engine."""

    method: str  # display name of the route, e.g. "Torch-TensorRT"
    pipeline: str  # conversion chain, e.g. "PyTorch → ONNX → TensorRT"
    precision: str  # space-separated precisions, e.g. "FP32 FP16 INT8"
    complexity: str  # free-text difficulty label
    best_for: str  # short description of the use case this route suits

# Catalogue of the conversion routes compared in this section.
methods = [
    ConversionMethod(
        "ONNX → trtexec", "PyTorch → ONNX → TensorRT", "FP32 FP16 INT8", "ง่าย",
        "Standard models, CLI-based workflow",
    ),
    ConversionMethod(
        "Torch-TensorRT", "PyTorch → TensorRT direct", "FP32 FP16", "ง่าย",
        "PyTorch-native, quick prototyping",
    ),
    ConversionMethod(
        "TensorRT Python API", "Manual network building", "FP32 FP16 INT8", "ยาก",
        "Custom layers, maximum control",
    ),
    ConversionMethod(
        "TF-TRT", "TensorFlow → TensorRT", "FP32 FP16 INT8", "ปานกลาง",
        "TensorFlow models, SavedModel format",
    ),
]

# Print a three-line summary for each route.
print("=== Conversion Methods ===")
for m in methods:
    print(f" [{m.method}] Pipeline: {m.pipeline}")
    print(f" Precision: {m.precision} | Complexity: {m.complexity}")
    print(f" Best for: {m.best_for}")

INT8 Calibration

=== INT8 Calibration ===

import tensorrt as trt

import pycuda.driver as cuda

import numpy as np

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds batches from a data loader for INT8 builds.

    Args:
        data_loader: Iterable yielding batches; each batch must expose
            ``.numpy()`` (as the original code already required).
        cache_file: Path used to read and write the calibration cache.
        batch_size: Number of samples per calibration batch (default 1).
        input_shape: Per-sample shape, used only to size the device buffer
            (default ``(3, 224, 224)``).
    """

    def __init__(self, data_loader, cache_file="calibration.cache",
                 batch_size=1, input_shape=(3, 224, 224)):
        super().__init__()
        self.data_loader = data_loader
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.batch_iter = iter(data_loader)
        # One float32 batch; the defaults give 1 * 3 * 224 * 224 * 4 bytes.
        self._nbytes = (
            int(batch_size) * int(np.prod(input_shape)) * np.dtype(np.float32).itemsize
        )
        # NOTE(review): cuda.mem_alloc presumably needs an active CUDA context
        # (e.g. via pycuda.autoinit) — confirm the caller sets one up.
        self.device_input = cuda.mem_alloc(self._nbytes)

    def get_batch_size(self):
        """Return the batch size the device buffer was sized for."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next batch to the device and return its pointer list.

        Returns ``None`` once the data loader is exhausted.

        Raises:
            ValueError: If a batch is larger than the device buffer.
        """
        # Keep the try body minimal: only the iterator can signal exhaustion.
        try:
            batch = next(self.batch_iter)
        except StopIteration:
            return None

        # Force a contiguous float32 buffer so the byte count is predictable.
        host = np.ascontiguousarray(batch.numpy(), dtype=np.float32).ravel()
        if host.nbytes > self._nbytes:
            raise ValueError(
                f"calibration batch is {host.nbytes} bytes but the device "
                f"buffer holds only {self._nbytes}"
            )
        cuda.memcpy_htod(self.device_input, host)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache exists."""
        # EAFP: avoids needing the `os` module, which this file never imports.
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

@dataclass
class PrecisionResult:
    """Benchmark figures for one precision mode of the same model."""

    precision: str  # precision label, e.g. "FP16"
    model_size: str  # formatted size, e.g. "49 MB"
    latency_ms: float  # latency in milliseconds
    throughput: str  # formatted throughput, e.g. "312 img/s"
    accuracy: str  # formatted accuracy, e.g. "76.11%"
    memory_mb: int  # memory in MB (printed as VRAM in the report)

# Benchmark figures per precision mode (ResNet50 on an RTX 4070 Ti).
results = [
    PrecisionResult("FP32", "98 MB", 8.5, "118 img/s", "76.13%", 450),
    PrecisionResult("FP16", "49 MB", 3.2, "312 img/s", "76.11%", 280),
    PrecisionResult("INT8", "25 MB", 1.8, "555 img/s", "75.89%", 180),
    PrecisionResult("INT8 Mixed", "35 MB", 2.1, "476 img/s", "76.05%", 210),
]

print("=== Precision Comparison (ResNet50, RTX 4070 Ti) ===")
for r in results:
    print(f" [{r.precision}] Size: {r.model_size} | Latency: {r.latency_ms}ms")
    print(f" Throughput: {r.throughput} | Accuracy: {r.accuracy} | VRAM: {r.memory_mb}MB")

# Derive speedups from the table above so they cannot drift from it.
latency_by_precision = {entry.precision: entry.latency_ms for entry in results}
baseline_latency = latency_by_precision["FP32"]
speedup_fp16 = baseline_latency / latency_by_precision["FP16"]
speedup_int8 = baseline_latency / latency_by_precision["INT8"]

print(f"\n FP16 Speedup: {speedup_fp16:.1f}x vs FP32")
print(f" INT8 Speedup: {speedup_int8:.1f}x vs FP32")

Home Lab Setup

# === Home Lab Configuration ===

@dataclass
class LabComponent:
    """A single line item in the home-lab build list."""

    # What kind of part this is, e.g. "GPU" or "Storage".
    component: str
    # The concrete model or specification chosen.
    spec: str
    # Why the part is in the build.
    purpose: str
    # Preformatted price text, e.g. "22,000 THB" or "Free".
    cost: str

# Parts list for the reference build, as (component, spec, purpose, cost) rows.
lab = [
    LabComponent(*row)
    for row in (
        ("GPU", "RTX 4070 Ti 12GB", "TensorRT inference engine", "22,000 THB"),
        ("CPU", "AMD Ryzen 7 5800X", "Data preprocessing, model export", "8,000 THB"),
        ("RAM", "64GB DDR4 3200MHz", "Large model loading, batch processing", "5,000 THB"),
        ("Storage", "1TB NVMe SSD + 2TB HDD", "Models on SSD, datasets on HDD", "3,500 THB"),
        ("OS", "Ubuntu 22.04 LTS", "CUDA + TensorRT native support", "Free"),
        ("Software", "Docker + NVIDIA Container Toolkit", "Isolated TensorRT environments", "Free"),
        ("Network", "1Gbps Ethernet + SSH", "Remote access, model serving", "Included"),
        ("UPS", "600VA", "Protect GPU from power loss", "2,500 THB"),
    )
]

# Hand-maintained total of the priced rows above.
total_cost = 41_000

print("Home Lab Build:")
for part in lab:
    print(f"  [{part.component}] {part.spec}")
    print(f"    Purpose: {part.purpose} | Cost: {part.cost}")
print(f"\n  Total: ~{total_cost:,} THB")

# Docker Setup
# docker run --gpus all -it --rm \
#   -v $(pwd)/models:/models \
#   -v $(pwd)/data:/data \
#   nvcr.io/nvidia/tensorrt:23.12-py3 \
#   bash

# nvidia-smi  # Check GPU
# trtexec --onnx=/models/resnet50.onnx --fp16 --saveEngine=/models/resnet50.engine

# Ordered checklist: step label -> command (or instruction) to run.
# NOTE(review): step 3 has no `sudo` while step 1 does — confirm intended.
# NOTE(review): trtexec documents --batch for implicit-batch engines only;
# an ONNX-built engine presumably needs --shapes instead — confirm step 8.
setup_steps = {
    "1. Install NVIDIA Driver": "sudo apt install nvidia-driver-535",
    "2. Install Docker": "curl -fsSL https://get.docker.com | sh",
    "3. Install NVIDIA Container": "apt install nvidia-container-toolkit",
    "4. Pull TensorRT Image": "docker pull nvcr.io/nvidia/tensorrt:23.12-py3",
    "5. Run Container": "docker run --gpus all -it nvcr.io/nvidia/tensorrt:23.12-py3",
    "6. Test GPU": "nvidia-smi inside container",
    "7. Convert Model": "trtexec --onnx=model.onnx --fp16 --saveEngine=model.engine",
    "8. Benchmark": "trtexec --loadEngine=model.engine --batch=8",
}

# Two blank lines, the heading, then one "[label]: command" line per step.
print("\n\nSetup Steps:")
for step, command in setup_steps.items():
    print(f"  [{step}]: {command}")

เคล็ดลับ

  • FP16: เริ่มด้วย FP16 ก่อน ง่ายและเร็วขึ้น 2-3 เท่า
  • INT8: ใช้ INT8 เมื่อต้องการ Throughput สูงสุด ตรวจ Accuracy
  • Docker: ใช้ NVIDIA TensorRT Docker Image ลดปัญหา Version
  • Dynamic: ใช้ Dynamic Shapes สำหรับ Variable Batch Size
  • Benchmark: ใช้ trtexec benchmark ทุก Model ก่อน Deploy

TensorRT คืออะไร

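TensorRT คือ SDK ของ NVIDIA สำหรับ High-performance Inference บน GPU รองรับโมเดลจาก PyTorch, TensorFlow และ ONNX โดยแปลงโมเดลเป็น Engine ที่ผ่านการทำ Layer Fusion และ Kernel Auto-tuning พร้อมรองรับ FP16 และ INT8 ทำให้ Inference เร็วขึ้น 2-10 เท่า และใช้ Memory น้อยลง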

XM Legend · เทรดเดอร์ & ผู้สอน Forex 13 ปี

ผู้ก่อตั้ง SiamCafe ตั้งแต่ปี 1997 · เทรดเดอร์สาย Forex มากกว่า 13 ปี ได้รับการยกย่องเป็น XM Legend · แบ่งปันความรู้ Forex, ไอที, AI และการเทรด จากประสบการณ์จริงในตลาดจริง