
Stable Diffusion ComfyUI Scaling Strategy วิธี
Stable Diffusion ComfyUI Scaling

ComfyUI เป็น Node-based GUI สำหรับ Stable Diffusion ที่ใช้สร้างภาพ AI ผ่าน Visual Workflow รองรับ SDXL, SD1.5, ControlNet, LoRA และ IPAdapter บทความนี้อธิบายวิธี Scale สำหรับ Production ตั้งแต่ Multi-GPU ไปจนถึง Multi-Node
| Scale Level | Setup | Users | Throughput |
|---|---|---|---|
| Single GPU | 1x RTX 4090 | 1-5 | 2-5 img/min |
| Multi-GPU | 2-4x GPU | 5-20 | 10-20 img/min |
| Docker Compose | 4-8 Workers | 20-50 | 20-40 img/min |
| Kubernetes | Auto-scaling | 50-500+ | 50-200+ img/min |
Docker Setup
=== ComfyUI Docker Setup ===
Dockerfile
# ComfyUI worker image: CUDA 12.1 runtime base with ComfyUI checked out in /app.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Install Python and the fetch tools, then drop the apt lists in the same
# layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    python3 python3-pip git wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN git clone https://github.com/comfyanonymous/ComfyUI.git .

# Install the CUDA 12.1 PyTorch wheels BEFORE requirements.txt. If the
# requirements are installed first, pip pulls its default torch build and the
# cu121 install below it becomes a no-op ("requirement already satisfied").
# NOTE(review): torchaudio is included on the assumption that ComfyUI's
# requirements.txt lists it next to torch/torchvision — confirm against the
# cloned revision.
RUN pip3 install --no-cache-dir torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu121
RUN pip3 install --no-cache-dir -r requirements.txt

EXPOSE 8188
# Listen on all interfaces so the published port is reachable from outside
# the container.
CMD ["python3", "main.py", "--listen", "0.0.0.0", "--port", "8188"]
docker-compose.yml
# Two ComfyUI workers (one GPU each), an nginx load balancer and Redis.
version: '3.8'

services:
  comfyui-worker-1:
    build: .
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=0
    ports:
      - "8188:8188"
    volumes:
      # Models and outputs are shared between all workers through the host.
      - ./models:/app/models
      - ./output:/app/output
    deploy:
      resources:
        reservations:
          devices:
            # Pin the worker to a specific GPU. With `count: 1` Docker picks
            # the device itself, so both workers could end up on the same GPU.
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]

  comfyui-worker-2:
    build: .
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=1
    ports:
      # Same container port as worker 1, published on a different host port.
      - "8189:8188"
    volumes:
      - ./models:/app/models
      - ./output:/app/output
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["1"]
              capabilities: [gpu]

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      # Replaces the image's main nginx configuration file.
      - ./nginx.conf:/etc/nginx/nginx.conf
    depends_on:
      - comfyui-worker-1
      - comfyui-worker-2

  redis:
    image: redis:7-alpine
    ports:
      # NOTE(review): this publishes Redis on the host without authentication;
      # confirm the host port is not reachable from untrusted networks.
      - "6379:6379"
nginx.conf — Load Balancer
# Load balancer in front of the ComfyUI workers.
#
# The compose file mounts this file as the main /etc/nginx/nginx.conf, so it
# must provide the top-level `events` and `http` contexts itself; `upstream`
# and `server` are only valid inside `http`.
events {
    worker_connections 1024;
}

http {
    upstream comfyui_workers {
        # Send each new connection to the worker with the fewest active ones.
        # NOTE(review): a client's WebSocket and its HTTP requests may be
        # routed to different workers — confirm whether sticky routing is
        # needed for progress updates.
        least_conn;
        server comfyui-worker-1:8188;
        server comfyui-worker-2:8188;
    }

    server {
        listen 80;

        location / {
            proxy_pass http://comfyui_workers;
            # HTTP/1.1 plus the Upgrade/Connection headers let WebSocket
            # connections pass through the proxy.
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";
            # Allow long-running requests before nginx gives up on the worker.
            proxy_read_timeout 300s;
        }
    }
}
Commands
# Start every service defined in the compose file in the background.
docker compose up -d
# Follow the log output of the first worker.
docker compose logs -f comfyui-worker-1
# Run four containers of the comfyui-worker-1 service.
# NOTE(review): that service publishes the fixed host port 8188, so additional
# replicas will likely fail to bind it — confirm before relying on this.
docker compose scale comfyui-worker-1=4
# Stop and remove the containers and network created by `up`.
docker compose down
import json
from dataclasses import dataclass
from typing import Dict, List, Optional
@dataclass
class WorkerConfig:
    """Description and current state of a single ComfyUI worker."""

    # Label used to look the worker up again when it is released.
    name: str
    # GPU index shown in the cluster status listing.
    gpu_id: int
    # Port number shown in the cluster status listing.
    port: int
    # VRAM size in GB, shown in the cluster status listing.
    vram_gb: float
    # "idle" or "busy"; a newly created worker starts out idle.
    status: str = "idle"
class ComfyUICluster:
    """In-memory registry of ComfyUI workers with idle/busy bookkeeping."""

    def __init__(self) -> None:
        # Workers are kept in registration order; that order is also the
        # order in which idle workers are handed out.
        self.workers: List["WorkerConfig"] = []

    def add_worker(self, worker: "WorkerConfig") -> None:
        """Register *worker* with the cluster."""
        self.workers.append(worker)

    def get_idle_worker(self) -> Optional["WorkerConfig"]:
        """Claim the first idle worker and mark it busy.

        Returns:
            The claimed worker, or ``None`` when every worker is busy (or no
            worker has been registered).
        """
        for w in self.workers:
            if w.status == "idle":
                w.status = "busy"
                return w
        return None

    def release_worker(self, name: str) -> bool:
        """Mark every worker called *name* as idle again.

        Returns:
            ``True`` if at least one worker matched, ``False`` otherwise, so
            callers can detect a misspelled or unknown worker name.
        """
        released = False
        for w in self.workers:
            if w.name == name:
                w.status = "idle"
                released = True
        return released

    def show_status(self) -> None:
        """Print one status line per registered worker."""
        print("ComfyUI Cluster Status:")
        for w in self.workers:
            print(f" {w.name} | GPU:{w.gpu_id} | Port:{w.port} | "
                  f"VRAM:{w.vram_gb}GB | {w.status}")
# Demo: register four 24 GB workers, one per GPU index, on consecutive ports
# starting at 8188, then print the cluster status.
cluster = ComfyUICluster()
for gpu_index in range(4):
    cluster.add_worker(
        WorkerConfig(f"worker-{gpu_index + 1}", gpu_index, 8188 + gpu_index, 24.0)
    )
cluster.show_status()
Kubernetes Deployment
=== Kubernetes ComfyUI Deployment ===
comfyui-deployment.yaml
# Deployment: three ComfyUI worker pods, one GPU each, sharing a models volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: comfyui-worker
  labels:
    app: comfyui
spec:
  replicas: 3
  selector:
    matchLabels:
      app: comfyui
  template:
    metadata:
      labels:
        app: comfyui
    spec:
      containers:
        - name: comfyui
          image: comfyui:latest
          ports:
            - containerPort: 8188
          resources:
            # One GPU per pod; memory may grow from 8Gi up to 16Gi.
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
            requests:
              nvidia.com/gpu: 1
              memory: "8Gi"
          volumeMounts:
            - name: models
              mountPath: /app/models
          env:
            # NOTE(review): presumably read by the image's entrypoint —
            # confirm that the image actually uses this variable.
            - name: COMFYUI_LISTEN
              value: "0.0.0.0"
      volumes:
        # Model files come from a shared claim instead of the image.
        - name: models
          persistentVolumeClaim:
            claimName: comfyui-models-pvc
      # Schedule only onto nodes labelled gpu=true ...
      nodeSelector:
        gpu: "true"
      # ... and tolerate the NoSchedule taint keyed nvidia.com/gpu.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
---
# Service: cluster-internal entry point on port 80 that forwards to port 8188
# of every pod labelled app=comfyui.
apiVersion: v1
kind: Service
metadata:
  name: comfyui-service
spec:
  selector:
    app: comfyui
  ports:
    - port: 80
      targetPort: 8188
  type: ClusterIP
---
# HorizontalPodAutoscaler: keeps the comfyui-worker Deployment between 2 and
# 10 replicas, targeting an average queue_length of 5 per pod.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: comfyui-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: comfyui-worker
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # NOTE(review): a Pods metric is served by the custom metrics API, so
    # queue_length presumably has to be exported through a metrics adapter —
    # confirm one is installed in the cluster.
    - type: Pods
      pods:
        metric:
          name: queue_length
        target:
          type: AverageValue
          averageValue: "5"
kubectl commands
# Create or update the Deployment, Service and HPA from the manifest.
kubectl apply -f comfyui-deployment.yaml
# List the worker pods (selected by the app=comfyui label).
kubectl get pods -l app=comfyui
# Manually set the Deployment to five replicas.
# NOTE(review): the HPA in the manifest targets the same Deployment and may
# change the replica count again — confirm which one should be in control.
kubectl scale deployment comfyui-worker --replicas=5
# Follow the logs of the Deployment.
kubectl logs -f deployment/comfyui-worker
# Show resource usage for the worker pods.
kubectl top pods -l app=comfyui
Queue System with Redis
import time
from typing import Optional
class ImageQueue:
    """Priority job queue for ComfyUI image-generation requests.

    This implementation keeps all state in process memory. ``redis_url`` is
    accepted and stored so callers can already pass it, but no Redis
    connection is opened.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        self.redis_url = redis_url
        # Jobs waiting to run, as dicts built by submit_job().
        self.queue = []
        # Jobs handed out by get_next_job() and not yet completed, keyed by
        # id, so get_status() can still report them.
        self.processing = {}
        # Finished jobs keyed by id.
        self.results = {}
        self.job_counter = 0

    def submit_job(self, workflow: dict, priority: int = 0) -> str:
        """Enqueue *workflow* and return the generated job id.

        Jobs with a higher *priority* are served first; jobs with equal
        priority are served in submission order.
        """
        self.job_counter += 1
        job_id = f"job-{self.job_counter:06d}"
        self.queue.append({
            "id": job_id,
            "workflow": workflow,
            "priority": priority,
            "status": "queued",
            "submitted_at": time.time(),
        })
        return job_id

    def get_next_job(self) -> Optional[dict]:
        """Remove and return the next job to run, or ``None`` if none waits."""
        if not self.queue:
            return None
        # list.sort is stable, so equal priorities keep submission order.
        self.queue.sort(key=lambda x: -x["priority"])
        job = self.queue.pop(0)
        job["status"] = "processing"
        # Remember the job so it does not turn into "not_found" while it runs.
        self.processing[job["id"]] = job
        return job

    def complete_job(self, job_id: str, result: dict):
        """Record *result* for *job_id* and mark the job as completed."""
        self.processing.pop(job_id, None)
        self.results[job_id] = {
            "status": "completed",
            "result": result,
            "completed_at": time.time(),
        }

    def get_status(self, job_id: str) -> dict:
        """Return the state of *job_id*.

        Returns:
            The stored result record for a completed job,
            ``{"status": "processing"}`` for a job that is running,
            ``{"status": "queued", "position": n}`` for a waiting job, where
            ``n`` is the number of jobs that will be served before it, or
            ``{"status": "not_found"}`` for an unknown id.
        """
        if job_id in self.results:
            return self.results[job_id]
        if job_id in self.processing:
            return {"status": self.processing[job_id]["status"]}
        # Rank by the same order get_next_job() uses, so the position reflects
        # priority and not just insertion order.
        ordered = sorted(self.queue, key=lambda x: -x["priority"])
        for position, job in enumerate(ordered):
            if job["id"] == job_id:
                return {"status": job["status"], "position": position}
        return {"status": "not_found"}
# Demo: submit two jobs with different priorities, show the status of the
# first one, then take the next job off the queue.
queue = ImageQueue()
job1, job2 = (
    queue.submit_job(workflow, priority=level)
    for workflow, level in (
        ({"prompt": "a cat", "steps": 20}, 1),
        ({"prompt": "a dog", "steps": 30}, 2),
    )
)
print(f"Submitted: {job1}, {job2}")
print(f"Status: {queue.get_status(job1)}")

next_job = queue.get_next_job()
print(f"Processing: {next_job['id']}")
Performance Optimization
# performance.py — ComfyUI Performance Tips
# Each entry maps an optimization name to its tip, expected impact and the
# command (or setting) that enables it.
optimizations = {
    "Model Loading": {
        "tip": "ใช้ --highvram หรือ --gpu-only ให้ Model อยู่ใน VRAM",
        "impact": "ลดเวลา Load 80%",
        "command": "python main.py --highvram --listen 0.0.0.0",
    },
    "xformers": {
        "tip": "ติดตั้ง xformers สำหรับ Memory Efficient Attention",
        "impact": "ลด VRAM 30% เร็วขึ้น 20%",
        "command": "pip install xformers",
    },
    "FP16/BF16": {
        "tip": "ใช้ Half Precision ลด VRAM และเพิ่มความเร็ว",
        "impact": "ลด VRAM 50% เร็วขึ้น 30%",
        "command": "python main.py --force-fp16",
    },
    "Batch Processing": {
        "tip": "รวม Requests เป็น Batch ประมวลผลพร้อมกัน",
        "impact": "เพิ่ม Throughput 40%",
        "command": "ตั้ง batch_size ใน Workflow",
    },
    "Model Caching": {
        "tip": "Cache Models ใน RAM/VRAM ไม่ต้อง Load จาก Disk",
        "impact": "ลดเวลา First Image จาก 30s เหลือ 3s",
        "command": "python main.py --highvram --disable-smart-memory",
    },
    "TensorRT": {
        "tip": "Compile Model เป็น TensorRT Engine",
        "impact": "เร็วขึ้น 40-60%",
        "command": "ใช้ ComfyUI-TensorRT Node",
    },
}

print("ComfyUI Performance Optimization:")
for opt, info in optimizations.items():
    print(f"\n [{opt}]")
    # Print the three fields in a fixed order, one labelled line each.
    for label, field in (("Tip", "tip"), ("Impact", "impact"), ("Command", "command")):
        print(f" {label}: {info[field]}")
# VRAM Requirements
# Minimum / recommended VRAM and approximate speed per model setup.
vram_req = {
    "SD 1.5 (512x512)": {"min": "4GB", "rec": "6GB", "speed": "~5 img/min"},
    "SD 1.5 (768x768)": {"min": "6GB", "rec": "8GB", "speed": "~3 img/min"},
    "SDXL (1024x1024)": {"min": "8GB", "rec": "12GB", "speed": "~2 img/min"},
    "SDXL + Refiner": {"min": "12GB", "rec": "16GB", "speed": "~1 img/min"},
    "SDXL + ControlNet": {"min": "12GB", "rec": "24GB", "speed": "~1.5 img/min"},
    "Flux.1 Dev": {"min": "16GB", "rec": "24GB", "speed": "~0.5 img/min"},
}

print("\n\nVRAM Requirements:")
for model, req in vram_req.items():
    # The keyword fields are filled straight from the per-model dict.
    print(" {}: Min {min} | Rec {rec} | {speed}".format(model, **req))
เคล็ดลับ
- GPU: RTX 4090 24GB ดีที่สุดสำหรับ Price/Performance
- Queue: ใช้ Redis Queue จัดคิว ป้องกัน GPU Overload
- Models: แชร์ Models ผ่าน NFS/PVC ไม่ต้อง Download ทุก Worker
- Monitoring: ใช้ Prometheus + Grafana Monitor GPU VRAM Queue
- Autoscale: Scale ตาม Queue Length ไม่ใช่ CPU Usage
ComfyUI คืออะไร
ComfyUI คือ Node-based GUI สำหรับ Stable Diffusion ที่สร้างภาพผ่าน Visual Workflow รองรับ SDXL, SD1.5, ControlNet, LoRA, IPAdapter และ Custom Node เขียนด้วย Python และใช้งานผ่าน Browser