Image Segmentation Container Orchestration
บทความนี้ครอบคลุม Image Segmentation ใน Computer Vision — การแบ่งภาพออกเป็น Object หรือ Region แบบ Semantic, Instance และ Panoptic ด้วยโมเดล U-Net, DeepLab, Mask R-CNN และ SAM — และการรันบน Container Orchestration ด้วย Kubernetes และ GPU สำหรับ Production
| Model | Type | VRAM | Speed | Use Case |
|---|---|---|---|---|
| U-Net | Semantic | 2-4GB | Fast | Medical, Satellite |
| DeepLab V3+ | Semantic | 4-6GB | Medium | Scene Parsing |
| Mask R-CNN | Instance | 6-8GB | Slow | Object Detection |
| SAM | Panoptic | 8-16GB | Medium | General Purpose |
| YOLO v8 Seg | Instance | 4-6GB | Fast | Real-time |
Docker Setup
# === Image Segmentation Docker Setup ===
# Dockerfile
# FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
#
# RUN apt-get update && apt-get install -y \
# python3 python3-pip libgl1-mesa-glx libglib2.0-0 \
# && rm -rf /var/lib/apt/lists/*
#
# WORKDIR /app
# COPY requirements.txt .
# RUN pip3 install --no-cache-dir -r requirements.txt
# COPY . .
#
# EXPOSE 8080
# CMD ["python3", "server.py"]
# requirements.txt
# torch==2.1.0
# torchvision==0.16.0
# opencv-python-headless==4.8.0
# fastapi==0.104.0
# uvicorn==0.24.0
# Pillow==10.1.0
# numpy==1.24.0
# server.py — FastAPI Inference Server
# from fastapi import FastAPI, UploadFile, File
# from PIL import Image
# import torch
# import torchvision.transforms as T
# import io
# import numpy as np
#
# app = FastAPI()
# model = None
#
# @app.on_event("startup")  # NOTE(review): deprecated in FastAPI >= 0.93 — prefer the lifespan context manager
# async def load_model():
# global model
# model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101',
# pretrained=True).eval().cuda()
#
# @app.post("/segment")
# async def segment(file: UploadFile = File(...)):
# img = Image.open(io.BytesIO(await file.read())).convert("RGB")
# transform = T.Compose([
# T.ToTensor(),
# T.Normalize(mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
# ])
# input_tensor = transform(img).unsqueeze(0).cuda()
# with torch.no_grad():
# output = model(input_tensor)['out'][0]
# mask = output.argmax(0).cpu().numpy()
# return {"classes": np.unique(mask).tolist(),
# "shape": list(mask.shape)}
#
# @app.get("/health")
# async def health():
# return {"status": "ok", "gpu": torch.cuda.is_available()}
# docker build -t seg-server:latest .
# docker run --gpus all -p 8080:8080 seg-server:latest
# docker-compose.yml
# version: '3.8'  # NOTE: the top-level "version" key is obsolete in Compose v2 and may be removed
# services:
# seg-worker-1:
# build: .
# runtime: nvidia
# environment:
# - NVIDIA_VISIBLE_DEVICES=0
# ports:
# - "8080:8080"
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
# interval: 30s
# timeout: 10s
# retries: 3
#
# nginx:
# image: nginx:alpine
# ports:
# - "80:80"
# volumes:
# - ./nginx.conf:/etc/nginx/nginx.conf
from dataclasses import dataclass
from typing import List
@dataclass
class SegModel:
    """Summary record for one image-segmentation model (used for the printed catalog below)."""

    # Human-readable model name, e.g. "U-Net".
    name: str
    # Segmentation category: "Semantic", "Instance", or "Panoptic".
    type: str
    # Approximate GPU memory required, in gigabytes.
    vram_gb: float
    # Rough inference throughput, frames per second.
    fps: float
    # Reported accuracy as a fraction in [0, 1]; rendered as a percentage when printed.
    accuracy: float
# Catalog rows: (name, type, VRAM in GB, FPS, accuracy fraction).
_MODEL_SPECS = [
    ("U-Net", "Semantic", 3.0, 30, 0.92),
    ("DeepLab V3+", "Semantic", 5.0, 15, 0.95),
    ("Mask R-CNN", "Instance", 7.0, 5, 0.93),
    ("SAM ViT-H", "Panoptic", 12.0, 8, 0.97),
    ("YOLO v8 Seg", "Instance", 4.0, 45, 0.90),
]
models = [SegModel(*spec) for spec in _MODEL_SPECS]

print("Image Segmentation Models:")
for model in models:
    summary = (f" {model.name}: {model.type} | VRAM {model.vram_gb}GB | "
               f"{model.fps} FPS | Acc {model.accuracy:.0%}")
    print(summary)
Kubernetes Deployment
# === Kubernetes Segmentation Deployment ===
# segmentation-deployment.yaml
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: seg-worker
# labels:
# app: segmentation
# spec:
# replicas: 3
# selector:
# matchLabels:
# app: segmentation
# template:
# metadata:
# labels:
# app: segmentation
# spec:
# containers:
# - name: seg-server
# image: seg-server:latest
# ports:
# - containerPort: 8080
# resources:
# limits:
# nvidia.com/gpu: 1
# memory: "8Gi"
# cpu: "4"
# requests:
# nvidia.com/gpu: 1
# memory: "4Gi"
# cpu: "2"
# livenessProbe:
# httpGet:
# path: /health
# port: 8080
# initialDelaySeconds: 60
# periodSeconds: 30
# readinessProbe:
# httpGet:
# path: /health
# port: 8080
# initialDelaySeconds: 30
# periodSeconds: 10
# volumeMounts:
# - name: model-cache
# mountPath: /root/.cache
# volumes:
# - name: model-cache
# persistentVolumeClaim:
# claimName: model-cache-pvc
# nodeSelector:
# gpu: "true"
# tolerations:
# - key: nvidia.com/gpu
# operator: Exists
# effect: NoSchedule
#
# ---
# apiVersion: v1
# kind: Service
# metadata:
# name: seg-service
# spec:
# selector:
# app: segmentation
# ports:
# - port: 80
# targetPort: 8080
# type: ClusterIP
#
# ---
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
# name: seg-hpa
# spec:
# scaleTargetRef:
# apiVersion: apps/v1
# kind: Deployment
# name: seg-worker
# minReplicas: 2
# maxReplicas: 8
# metrics:
# - type: Resource
# resource:
# name: cpu
# target:
# type: Utilization
# averageUtilization: 70
# - type: Pods
# pods:
# metric:
# name: inference_queue_length
# target:
# type: AverageValue
# averageValue: "10"
# Triton Inference Server Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: triton-server
# spec:
# replicas: 2
# template:
# spec:
# containers:
# - name: triton
# image: nvcr.io/nvidia/tritonserver:23.10-py3
# args: ["tritonserver", "--model-repository=/models"]
# ports:
# - containerPort: 8000 # HTTP
# - containerPort: 8001 # gRPC
# - containerPort: 8002 # Metrics
# resources:
# limits:
# nvidia.com/gpu: 1
# kubectl apply -f segmentation-deployment.yaml
# kubectl get pods -l app=segmentation
# kubectl logs -f deployment/seg-worker
# kubectl top pods -l app=segmentation
# Ordered stages of the production segmentation pipeline.
_STAGE_ORDER = [
    ("Input", "รับภาพจาก API / Message Queue / S3"),
    ("Preprocess", "Resize, Normalize, Augmentation"),
    ("Inference", "GPU Model Prediction (Batch)"),
    ("Postprocess", "Decode Mask, Filter, Smooth"),
    ("Output", "ส่งผลลัพธ์ API / Store S3 / Kafka"),
]
pipeline_stages = dict(_STAGE_ORDER)

print("\nSegmentation Pipeline:")
for stage, description in _STAGE_ORDER:
    print(f" [{stage}] -> {description}")
Monitoring
# monitoring.py — operational metrics for the segmentation service.
# One row per metric: (name, description, healthy target, alert condition).
_METRIC_ROWS = [
    ("inference_latency_ms", "เวลา Inference ต่อภาพ", "< 200ms", "> 500ms"),
    ("gpu_utilization_pct", "GPU Utilization", "60-80%", "> 95% หรือ < 20%"),
    ("gpu_memory_used_gb", "GPU Memory Usage", "< 80% VRAM", "> 90% VRAM (OOM risk)"),
    ("requests_per_second", "Throughput", "> 10 req/s per GPU", "< 5 req/s"),
    ("queue_length", "Pending Requests", "< 10", "> 50 (need scaling)"),
    ("error_rate_pct", "Error Rate", "< 1%", "> 5%"),
]
metrics = {
    name: {"desc": desc, "target": target, "alert": alert}
    for name, desc, target, alert in _METRIC_ROWS
}

print("Segmentation Service Metrics:")
for metric, info in metrics.items():
    print(f"\n [{metric}]")
    print(f" {info['desc']}")
    print(f" Target: {info['target']}")
    print(f" Alert: {info['alert']}")
# Prometheus Metrics (example)
# from prometheus_client import Histogram, Gauge, Counter
#
# INFERENCE_TIME = Histogram('inference_duration_seconds',
# 'Time for inference')
# GPU_UTIL = Gauge('gpu_utilization_percent', 'GPU utilization')
# REQUESTS = Counter('inference_requests_total', 'Total requests',
# ['model', 'status'])
#
# @INFERENCE_TIME.time()
# def predict(image):
# result = model(image)
# REQUESTS.labels(model='deeplabv3', status='success').inc()
# return result
# Grafana Dashboard
# Panels shown on the Grafana dashboard for the segmentation service.
dashboard_panels = [
    "Inference Latency (p50, p95, p99)",
    "GPU Utilization per Pod",
    "GPU Memory Usage per Pod",
    "Requests per Second (RPS)",
    "Queue Length",
    "Error Rate",
    "Pod Count (Running vs Desired)",
    "Model Version Distribution",
]

# Plain string literal: the original used an f-string with no placeholders (lint F541).
print("\n\nGrafana Dashboard Panels:")
for panel in dashboard_panels:
    print(f" - {panel}")
เคล็ดลับ
- Dynamic Batching: รวม Requests เป็น Batch เพิ่ม GPU Throughput 3-5x
- Model Cache: ใช้ PVC แชร์ Model Cache ไม่ต้อง Download ทุก Pod
- TensorRT: Optimize Model ด้วย TensorRT เร็วขึ้น 2-3x
- Readiness Probe: ตั้ง Probe รอ Model Load เสร็จก่อนรับ Traffic
- Autoscale: Scale ตาม Queue Length ไม่ใช่แค่ CPU
Image Segmentation คืออะไร
Image Segmentation คือเทคนิค Computer Vision สำหรับแบ่งภาพออกเป็น Object หรือ Region แบ่งได้ 3 ประเภทหลักคือ Semantic, Instance และ Panoptic โดยใช้โมเดลเช่น U-Net, DeepLab, Mask R-CNN, SAM และ YOLO Seg
ทำไมต้องรัน Image Segmentation บน Kubernetes
เพราะใน Production มี Requests จำนวนมาก Kubernetes ช่วยจัดการ GPU Resource, Auto-scaling, Health Check, Self-healing และ Rolling Update ทำให้อัปเดต Model ได้โดยไม่มี Downtime
ต้องใช้ GPU เท่าไหร่สำหรับ Image Segmentation
ขึ้นกับโมเดล: U-Net ใช้ 2-4GB, DeepLab 4-6GB, Mask R-CNN 6-8GB และ SAM 8-16GB — RTX 3060 เพียงพอสำหรับ Development ส่วน Production ควรใช้ A100 หรือ H100
Triton Inference Server คืออะไร
Triton คือ Open Source Inference Platform จาก NVIDIA รองรับ TensorFlow, PyTorch, ONNX และ TensorRT พร้อมความสามารถ Multiple Models, Dynamic Batching, Model Versioning และ Prometheus Metrics
สรุป
การรัน Image Segmentation ด้วย Container Orchestration ใช้ Docker และ Kubernetes จัดการ GPU Scheduling สำหรับโมเดล U-Net, DeepLab, Mask R-CNN และ SAM ร่วมกับ Triton Inference Server, Dynamic Batching, Auto-scaling และ TensorRT สำหรับ Production ML Serving
