ML Capacity Planning
MLOps Pipeline Capacity Planning GPU CPU Memory Storage Auto-scaling Cost Optimization Training Inference Resource Sizing Production Operations
| Resource | Training | Inference | Sizing Method | Cost Impact |
|---|---|---|---|---|
| GPU | A100 80GB / H100 | T4 / L4 / A10G | Model size + batch | สูงสุด 70% |
| CPU | 32-64 cores | 4-16 cores | Data preprocessing | ปานกลาง 15% |
| RAM | 128-512 GB | 16-64 GB | Dataset + model | ต่ำ 5% |
| Storage | 1-10 TB SSD | 50-200 GB | Data + checkpoints | ต่ำ 5% |
| Network | 10-100 Gbps | 1-10 Gbps | Multi-GPU comm | ต่ำ 5% |
Training Capacity
# === Training Resource Calculator ===
from dataclasses import dataclass
@dataclass
class ModelSpec:
    """Display record: training resource profile for one model."""

    name: str                     # model name, e.g. "LLaMA 7B"
    params: str                   # parameter count as a display string ("25M", "7B")
    fp32_memory_gb: float         # VRAM for the weights at FP32 (params x 4 bytes; optimizer/activations extra)
    fp16_memory_gb: float         # VRAM for the weights at FP16 (params x 2 bytes)
    min_gpu: str                  # smallest GPU setup that can train the model
    training_time_per_epoch: str  # rough wall-clock figure, free text
    recommended_gpu: str          # comfortable GPU setup for real training runs
# Reference training footprints. The *_memory_gb columns correspond to raw
# weight storage (params x 4 bytes FP32 / x 2 bytes FP16); gradients,
# optimizer state and activations need additional VRAM on top.
# NOTE(review): per-epoch times and GPU picks are ballpark figures — verify
# against current hardware before planning.
models = [
    ModelSpec("ResNet-50", "25M", 0.1, 0.05, "Any GPU 4GB+", "5 min (ImageNet)", "RTX 3060 12GB"),
    ModelSpec("BERT-base", "110M", 0.5, 0.25, "8GB GPU", "4 hours (GLUE)", "RTX 3090 24GB"),
    ModelSpec("GPT-2 Medium", "345M", 1.4, 0.7, "16GB GPU", "8 hours", "A100 40GB"),
    ModelSpec("LLaMA 7B", "7B", 28, 14, "2x A100 40GB", "Days (full)", "4x A100 80GB"),
    ModelSpec("LLaMA 13B", "13B", 52, 26, "4x A100 40GB", "Days (full)", "8x A100 80GB"),
    ModelSpec("LLaMA 70B", "70B", 280, 140, "8x A100 80GB", "Weeks (full)", "16x H100 80GB"),
]
print("=== Training GPU Requirements ===")
for m in models:
print(f" [{m.name}] Params: {m.params}")
print(f" FP32: {m.fp32_memory_gb}GB | FP16: {m.fp16_memory_gb}GB")
print(f" Min GPU: {m.min_gpu}")
print(f" Recommended: {m.recommended_gpu}")
print(f" Time/epoch: {m.training_time_per_epoch}")
# Training Cost Calculator
@dataclass
class GPUCost:
    """Hourly price (USD) of one GPU type under three purchase options."""

    gpu: str          # GPU model + VRAM label
    on_demand: float  # pay-as-you-go $/hr
    spot: float       # interruptible (spot/preemptible) $/hr
    reserved: float   # committed-use / reserved $/hr
# Hourly GPU pricing by purchase option. NOTE(review): values look like an
# AWS price snapshot — re-check current regional rates before budgeting.
gpu_costs = [
    GPUCost("T4 (16GB)", 0.526, 0.158, 0.35),
    GPUCost("A10G (24GB)", 1.006, 0.302, 0.67),
    GPUCost("A100 40GB", 3.673, 1.102, 2.45),
    GPUCost("A100 80GB", 5.12, 1.536, 3.41),
    GPUCost("H100 80GB", 8.10, 2.43, 5.40),
]
print(f"\n\nGPU Pricing (USD/hour, AWS):")
for g in gpu_costs:
saving = (1 - g.spot / g.on_demand) * 100
print(f" [{g.gpu}] On-demand: | Spot: ({saving:.0f}% off) | Reserved: ")
Inference Scaling
# === Inference Capacity Calculator ===
@dataclass
class InferenceCalc:
    """Inputs for sizing one inference deployment."""

    service: str                  # service name (for display)
    qps_target: int               # peak queries/second the deployment must absorb
    latency_p99_ms: int           # latency SLO (display only; not used in the sizing math)
    throughput_per_instance: int  # sustained QPS one replica can serve
    cpu_per_instance: int         # CPU cores requested per replica
    ram_per_instance_gb: int      # RAM (GB) requested per replica
    gpu_per_instance: str         # accelerator per replica, or "None (CPU)"
def calculate_replicas(calc):
    """Size an inference deployment from an InferenceCalc-like record.

    Returns a tuple:
        (min_replicas, replicas_with_headroom, total_cpu_cores, total_ram_gb)
    where the CPU/RAM totals are for the headroom-padded replica count.

    FIX: the original used floor division, which under-provisions whenever
    qps_target is not an exact multiple of throughput_per_instance (e.g.
    250 QPS at 100 QPS/instance yielded 2 replicas = 200 QPS capacity).
    Capacity must round UP. The 30% headroom was likewise truncated by
    int() (replicas=1 gave zero headroom); it is now rounded up too, so
    padded capacity is always >= the advertised 30%.
    """
    tput = calc.throughput_per_instance
    # Ceiling division in pure integers: ceil(a / b) == (a + b - 1) // b.
    replicas = max(1, (calc.qps_target + tput - 1) // tput)
    # 30% headroom, rounded up; integer math avoids float truncation.
    headroom = (replicas * 13 + 9) // 10
    total_cpu = headroom * calc.cpu_per_instance
    total_ram = headroom * calc.ram_per_instance_gb
    return replicas, headroom, total_cpu, total_ram
# Example inference services with their SLO targets and per-instance capacity.
# Fields: service, qps_target, latency_p99_ms, throughput_per_instance,
# cpu_per_instance, ram_per_instance_gb, gpu_per_instance.
services = [
    InferenceCalc("Recommendation API", 500, 50, 100, 4, 8, "T4"),
    InferenceCalc("Fraud Detection", 1000, 20, 200, 8, 16, "None (CPU)"),
    InferenceCalc("Image Classification", 200, 100, 50, 4, 8, "T4"),
    InferenceCalc("LLM Chat API", 50, 2000, 5, 8, 32, "A10G"),
    InferenceCalc("Search Ranking", 2000, 30, 500, 16, 32, "None (CPU)"),
]
print("=== Inference Capacity ===")
for s in services:
replicas, headroom, cpu, ram = calculate_replicas(s)
print(f" [{s.service}] Target: {s.qps_target} QPS | Latency: {s.latency_p99_ms}ms")
print(f" Per instance: {s.throughput_per_instance} QPS | GPU: {s.gpu_per_instance}")
print(f" Min replicas: {replicas} | With 30% headroom: {headroom}")
print(f" Total: {cpu} CPU cores, {ram}GB RAM")
# Kubernetes HPA Config
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
# name: ml-api-hpa
# spec:
# scaleTargetRef:
# apiVersion: apps/v1
# kind: Deployment
# name: ml-api
# minReplicas: 3
# maxReplicas: 20
# metrics:
# - type: Resource
# resource:
# name: cpu
# target:
# type: Utilization
# averageUtilization: 70
# - type: Pods
# pods:
# metric:
# name: requests_per_second
# target:
# type: AverageValue
# averageValue: "100"
Cost Optimization
# === Cost Optimization Strategies ===
@dataclass
class CostStrategy:
    """One cost-reduction lever: expected saving, trade-off, and how to apply it."""

    strategy: str        # short name of the technique
    saving: str          # expected saving, free text (e.g. "60-90% off training")
    trade_off: str       # what you give up in exchange
    implementation: str  # concrete first step / tooling
# Catalogue of cost levers. The "saving" figures are rules of thumb for
# prioritization, not guarantees.
strategies = [
    CostStrategy("Spot/Preemptible GPU", "60-90% off training", "Can be interrupted",
                 "Use for training jobs with checkpointing"),
    CostStrategy("Mixed Precision (FP16)", "50% GPU memory", "Minimal accuracy loss",
                 "torch.cuda.amp.autocast() in PyTorch"),
    CostStrategy("Model Quantization INT8", "75% model size", "1-2% accuracy drop",
                 "TensorRT INT8 or ONNX quantization"),
    CostStrategy("Knowledge Distillation", "50-90% model size", "Requires teacher model",
                 "Train small student from large teacher"),
    CostStrategy("Auto-scaling to zero", "100% idle cost", "Cold start latency",
                 "KEDA scale to 0 when no traffic"),
    CostStrategy("Batch inference", "80% compute cost", "Not real-time",
                 "Process overnight instead of real-time"),
    CostStrategy("Reserved instances", "30-50% base cost", "Commitment required",
                 "1yr or 3yr commitment for baseline"),
    CostStrategy("Right-sizing", "20-40% waste", "Requires monitoring",
                 "Match instance type to actual usage"),
]
print("Cost Optimization:")
for s in strategies:
print(f" [{s.strategy}] Saving: {s.saving}")
print(f" Trade-off: {s.trade_off}")
print(f" How: {s.implementation}")
# Illustrative monthly spend (USD) for a small ML platform, plus the
# post-optimization target. Tuple order is the display order below.
_LINE_ITEMS = (
    ("Training (4x A100 Spot, 100hr/mo)", "$440"),
    ("Inference (3x T4 On-demand 24/7)", "$1,137"),
    ("Storage (2TB S3)", "$46"),
    ("Data Pipeline (Airflow)", "$200"),
    ("MLflow Server", "$100"),
    ("Monitoring", "$50"),
    ("Total", "~$1,973/month"),
    ("After optimization", "~$1,200/month (40% saving)"),
)
monthly = {label: cost for label, cost in _LINE_ITEMS}
print("\n\nMonthly Cost Estimate:")
for label, cost in monthly.items():
    print(f" [{label}]: {cost}")
เคล็ดลับ
- Spot: ใช้ Spot Instance สำหรับ Training + Checkpointing ลด 60-90%
- FP16: ใช้ Mixed Precision ลด GPU Memory 50% เสีย Accuracy น้อยมาก
- HPA: ตั้ง Auto-scaling ตาม QPS ไม่ใช่แค่ CPU Utilization
- Monitor: ติดตาม GPU Utilization ถ้าต่ำกว่า 60% Right-size ลงได้
- Budget: ตั้ง Budget Alert ป้องกันค่าใช้จ่ายเกินงบ
Capacity Planning สำหรับ ML คืออะไร
วางแผน Resource GPU CPU RAM Storage Network Training Inference Peak Load Growth Rate Short-term Long-term Historical Usage Trend
คำนวณ GPU สำหรับ Training อย่างไร
Model Parameters VRAM Multi-GPU Mixed Precision FP16 Gradient Checkpointing DeepSpeed ZeRO Epochs Training Time Spot Instance Cost 60-90%
วางแผน Inference Resource อย่างไร
QPS Latency p99 Concurrent Requests Replicas Throughput Headroom 30% HPA Auto-scale CPU GPU Kubernetes TensorRT ONNX Optimization
ลด Cost อย่างไร
Spot Instance 60-90% Auto-scaling Mixed Precision Quantization INT8 Distillation Batch Inference Reserved Right-sizing Budget Alert Cost Allocation
สรุป
MLOps Pipeline Capacity Planning GPU CPU Memory Storage Auto-scaling HPA Cost Optimization Spot Instance Mixed Precision Quantization Kubernetes Production
