Computer Vision YOLO Scaling Strategy: How to Scale

2025-09-29 · Ajarn Bom — SiamCafe.net · 8,895 words

YOLO Object Detection

YOLO (You Only Look Once) is a family of real-time object detection models that finds every object in an image with a single forward pass. YOLOv8 from Ultralytics supports detection, segmentation, and pose estimation, and is widely used in CCTV analytics and autonomous driving.

YOLO can be scaled at several levels: pick a model size (Nano through XLarge), accelerate inference on GPU with TensorRT, scale out with Kubernetes, batch-process many images at once, or push small models to edge devices such as Jetson and Raspberry Pi. The table below compares the model sizes.

Model     Parameters   mAP    Speed (ms)   Best For
YOLOv8n   3.2M         37.3   1.2          Edge / Mobile
YOLOv8s   11.2M        44.9   2.1          Edge / Light Server
YOLOv8m   25.9M        50.2   4.7          Server
YOLOv8l   43.7M        52.9   7.1          Server GPU
YOLOv8x   68.2M        53.9   10.8         High Accuracy
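
If you want to sanity-check these numbers yourself, the Ultralytics API can print each checkpoint's layer count, parameters, and GFLOPs. A quick sketch (each call downloads the weights on first use):

# compare_sizes.py — print a summary for each model size (sketch)
from ultralytics import YOLO

for size in ["n", "s", "m", "l", "x"]:
    model = YOLO(f"yolov8{size}.pt")  # downloads weights on first use
    model.info()                      # layers, parameters, GFLOPs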

YOLO Inference Pipeline

# yolo_pipeline.py — YOLO Inference Pipeline
# pip install ultralytics

from ultralytics import YOLO
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Detection:
    class_name: str
    confidence: float
    bbox: Tuple[int, int, int, int]  # x1, y1, x2, y2

class YOLOPipeline:
    """YOLO Inference Pipeline with Scaling Options"""

    def __init__(self, model_size: str = "n", device: str = "cpu"):
        self.model_name = f"yolov8{model_size}.pt"
        self.device = device
        # self.model = YOLO(self.model_name)
        # self.model.to(device)
        print(f"  Loaded {self.model_name} on {device}")

    def detect(self, image_path: str, conf: float = 0.25) -> List[Detection]:
        """ตรวจจับวัตถุในภาพ"""
        # results = self.model(image_path, conf=conf, device=self.device)
        # detections = []
        # for result in results:
        #     for box in result.boxes:
        #         detections.append(Detection(
        #             class_name=result.names[int(box.cls)],
        #             confidence=float(box.conf),
        #             bbox=tuple(map(int, box.xyxy[0])),
        #         ))
        # return detections
        print(f"  Detecting objects in {image_path}")
        return []

    def batch_detect(self, image_paths: List[str],
                     batch_size: int = 8) -> dict:
        """Batch Detection — ประมวลผลหลายภาพพร้อมกัน"""
        results = {}
        for i in range(0, len(image_paths), batch_size):
            batch = image_paths[i:i+batch_size]
            # batch_results = self.model(batch, device=self.device)
            for path in batch:
                results[path] = self.detect(path)
        return results

    def export_tensorrt(self):
        """Export เป็น TensorRT Engine"""
        # self.model.export(format="engine", half=True, device=0)
        print(f"  Exported {self.model_name} to TensorRT FP16")

    def export_onnx(self):
        """Export เป็น ONNX"""
        # self.model.export(format="onnx", opset=12, simplify=True)
        print(f"  Exported {self.model_name} to ONNX")

# Scaling Options
scaling_options = {
    "Model Size": {
        "description": "เลือกขนาด Model ตามความต้องการ",
        "options": {
            "Nano (n)": "Edge Device, Mobile, Raspberry Pi",
            "Small (s)": "Edge Server, Jetson, Light Workload",
            "Medium (m)": "Server, General Purpose",
            "Large (l)": "GPU Server, High Accuracy",
            "XLarge (x)": "Multi-GPU, Maximum Accuracy",
        },
    },
    "Export Format": {
        "description": "Export Model เป็นรูปแบบที่เร็วขึ้น",
        "options": {
            "TensorRT": "NVIDIA GPU เร็วขึ้น 2-5x (FP16/INT8)",
            "ONNX": "Cross-platform CPU/GPU",
            "OpenVINO": "Intel CPU/GPU เร็วขึ้น 2-3x",
            "CoreML": "Apple Silicon M1/M2/M3",
            "TFLite": "Mobile Android/iOS Edge TPU",
        },
    },
    "Batch Processing": {
        "description": "ประมวลผลหลายภาพพร้อมกัน",
        "options": {
            "Batch Size 8": "GPU Memory 4-8 GB",
            "Batch Size 16": "GPU Memory 8-16 GB",
            "Batch Size 32": "GPU Memory 16-24 GB",
        },
    },
}

print("YOLO Scaling Options:")
for category, info in scaling_options.items():
    print(f"\n  [{category}]")
    print(f"  {info['description']}")
    for opt, desc in info["options"].items():
        print(f"    {opt}: {desc}")

Kubernetes Scaling

# yolo-deployment.yaml — Kubernetes Deployment + HPA for YOLO
apiVersion: apps/v1
kind: Deployment
metadata:
  name: yolo-inference
spec:
  replicas: 3
  selector:
    matchLabels:
      app: yolo-inference
  template:
    metadata:
      labels:
        app: yolo-inference
    spec:
      containers:
      - name: yolo
        image: ultralytics/yolov8:latest-gpu
        ports:
        - containerPort: 8080
        resources:
          limits:
            nvidia.com/gpu: 1
            memory: 8Gi
            cpu: 4
          requests:
            memory: 4Gi
            cpu: 2
        env:
        - name: MODEL_SIZE
          value: "m"
        - name: DEVICE
          value: "cuda"
        - name: BATCH_SIZE
          value: "8"
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: yolo-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: yolo-inference
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Pods
    pods:
      metric:
        name: inference_queue_length
      target:
        type: AverageValue
        averageValue: "5"

# scaling_tiers.py — YOLO scaling tiers by deployment size
from dataclasses import dataclass
from typing import List

@dataclass
class ScalingTier:
    name: str
    replicas: str
    gpu: str
    model: str
    throughput: str
    cost: str

class YOLOScaling:
    """YOLO Scaling Strategy"""

    def __init__(self):
        self.tiers: List[ScalingTier] = []

    def add_tier(self, tier: ScalingTier):
        self.tiers.append(tier)

    def show_tiers(self):
        print(f"\n{'='*60}")
        print(f"YOLO Scaling Tiers")
        print(f"{'='*60}")

        for tier in self.tiers:
            print(f"\n  [{tier.name}]")
            print(f"    Replicas: {tier.replicas}")
            print(f"    GPU: {tier.gpu}")
            print(f"    Model: {tier.model}")
            print(f"    Throughput: {tier.throughput}")
            print(f"    Cost: {tier.cost}")

scaling = YOLOScaling()

tiers = [
    ScalingTier("Edge", "1", "Jetson Nano/Orin", "YOLOv8n",
                "15-30 FPS", "Low ($200-500)"),
    ScalingTier("Small", "1-2 Pods", "T4 GPU", "YOLOv8s TensorRT",
                "50-100 FPS", "Medium ($0.5/hr)"),
    ScalingTier("Medium", "3-5 Pods", "A10G GPU", "YOLOv8m TensorRT",
                "100-300 FPS", "Medium ($1/hr)"),
    ScalingTier("Large", "5-10 Pods", "A100 GPU", "YOLOv8l TensorRT",
                "300-1000 FPS", "High ($3/hr)"),
    ScalingTier("Enterprise", "10+ Pods HPA", "Multi-A100", "YOLOv8x TensorRT",
                "1000+ FPS", "High ($10+/hr)"),
]

for tier in tiers:
    scaling.add_tier(tier)

scaling.show_tiers()

# Architecture
architecture = {
    "Input": "Camera Stream / Image Upload / Video File",
    "Queue": "Redis / Kafka — Buffer Requests",
    "Inference": "YOLO Pods (GPU) — TensorRT FP16",
    "Post-processing": "NMS, Tracking, Business Logic",
    "Output": "API Response / WebSocket / Storage",
    "Monitoring": "Prometheus + Grafana — FPS, Latency, GPU Util",
}

print(f"\n\nInference Architecture:")
for layer, desc in architecture.items():
    print(f"  {layer}: {desc}")

Performance Optimization

# optimization.py — YOLO Performance Optimization
optimizations = {
    "TensorRT FP16": {
        "speedup": "2-3x faster",
        "accuracy_loss": "< 0.5% mAP",
        "command": "yolo export model=yolov8m.pt format=engine half=True",
        "requirement": "NVIDIA GPU (Compute Capability >= 7.0)",
    },
    "TensorRT INT8": {
        "speedup": "3-5x faster",
        "accuracy_loss": "1-2% mAP",
        "command": "yolo export model=yolov8m.pt format=engine int8=True data=coco.yaml",
        "requirement": "Calibration Dataset needed",
    },
    "ONNX Runtime": {
        "speedup": "1.5-2x faster (CPU)",
        "accuracy_loss": "0%",
        "command": "yolo export model=yolov8m.pt format=onnx simplify=True",
        "requirement": "onnxruntime / onnxruntime-gpu",
    },
    "OpenVINO": {
        "speedup": "2-3x faster (Intel CPU)",
        "accuracy_loss": "< 0.5%",
        "command": "yolo export model=yolov8m.pt format=openvino half=True",
        "requirement": "Intel CPU/iGPU",
    },
    "Image Resize": {
        "speedup": "640->320 = 2-4x faster",
        "accuracy_loss": "5-10% mAP",
        "command": "model.predict(source, imgsz=320)",
        "requirement": "ลด Input Resolution",
    },
    "Batch Inference": {
        "speedup": "2-4x throughput",
        "accuracy_loss": "0%",
        "command": "model.predict(sources, batch=16)",
        "requirement": "GPU Memory เพียงพอ",
    },
}

print("YOLO Performance Optimizations:")
for name, info in optimizations.items():
    print(f"\n  [{name}]")
    print(f"    Speedup: {info['speedup']}")
    print(f"    Accuracy Loss: {info['accuracy_loss']}")
    print(f"    Command: {info['command']}")
    print(f"    Requirement: {info['requirement']}")

# GPU Comparison
gpus = {
    "Jetson Nano": {"VRAM": "4GB", "model": "YOLOv8n", "speed": "15 FPS", "cost": "$200"},
    "Jetson Orin": {"VRAM": "8-32GB", "model": "YOLOv8n", "speed": "60 FPS", "cost": "$500-2000"},
    "T4": {"VRAM": "16GB", "model": "YOLOv8m", "speed": "80 FPS", "cost": "$0.5/hr (Cloud)"},
    "A10G": {"VRAM": "24GB", "model": "YOLOv8m", "speed": "150 FPS", "cost": "$1/hr (Cloud)"},
    "RTX 4090": {"VRAM": "24GB", "model": "YOLOv8m", "speed": "200 FPS", "cost": "$1,599"},
    "A100": {"VRAM": "80GB", "model": "YOLOv8l", "speed": "250 FPS", "cost": "$3/hr (Cloud)"},
}

print("\n\nGPU Comparison for YOLO:")
for gpu, info in gpus.items():
    print(f"  {gpu:<14} VRAM: {info['VRAM']:<8} "
          f"{info['model']}: {info['speed']:<10} Cost: {info['cost']}")

FAQ

What is YOLO?

YOLO (You Only Look Once) is a real-time object detection approach that detects every object in a single pass over the image. YOLOv8 from Ultralytics covers detection, segmentation, and pose estimation, and is used in CCTV analytics, autonomous driving, and quality inspection.

How do you scale YOLO?

Pick a model size (Nano through XLarge) to match the hardware, optimize the runtime with TensorRT, run multiple pods on Kubernetes, batch multiple images per forward pass, and put a queue in front of the inference service to buffer requests.

What is TensorRT?

TensorRT is NVIDIA's SDK for optimizing deep learning models on GPUs. It applies layer fusion, reduced precision (FP16/INT8), and kernel auto-tuning; exporting YOLO to TensorRT typically makes inference 2-5x faster.

Can YOLO run on edge devices?

Yes. Export YOLOv8 Nano or Small to ONNX or TFLite for Jetson, Raspberry Pi, or Coral Edge TPU, to OpenVINO for Intel hardware, or to CoreML for Apple Silicon. Expect a trade-off between accuracy and speed.
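
A minimal edge-export sketch with the Ultralytics export API (pick the format that matches the target device):

# edge_export.py — export YOLOv8n for edge targets (sketch)
from ultralytics import YOLO

model = YOLO("yolov8n.pt")                     # Nano: smallest, edge-friendly
model.export(format="onnx", simplify=True)     # Jetson / generic runtimes
model.export(format="tflite", int8=True)       # Android / Coral Edge TPU
model.export(format="coreml")                  # Apple Silicon
model.export(format="openvino", half=True)     # Intel CPU/iGPU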

Summary

YOLO delivers real-time object detection and scales at several levels: choose a model size that fits the hardware, export to TensorRT FP16/INT8 for a 2-5x speedup, scale out with Kubernetes HPA and batch processing, push Nano models to edge devices such as Jetson, buffer requests with a Redis or Kafka queue, and monitor FPS, latency, and GPU utilization with Prometheus.
