CDK Construct Chaos Engineering
AWS CDK Construct Chaos Engineering FIS Fault Injection Simulator EC2 ECS EKS RDS Stop Condition CloudWatch IaC TypeScript Production
| FIS Action | Target | Effect | Duration |
|---|---|---|---|
| aws:ec2:stop-instances | EC2 Instance | หยุด Instance ทดสอบ HA | 5-30 นาที |
| aws:ecs:drain-container-instances | ECS Container | Drain Tasks ทดสอบ Scaling | 5-15 นาที |
| aws:fis:inject-api-internal-error | AWS API | จำลอง API Error 500 | 5-10 นาที |
| aws:network:disrupt-connectivity | VPC Subnet | ตัด Network Connectivity | 5-15 นาที |
| aws:ssm:send-command | EC2 via SSM | รัน Stress Test CPU/Memory | 5-30 นาที |
| aws:rds:failover-db-cluster | RDS Aurora | Failover Primary → Replica | 1-5 นาที |
CDK Chaos Construct
# === CDK Chaos Engineering Construct ===
# TypeScript CDK Construct
# import * as cdk from 'aws-cdk-lib';
# import * as fis from 'aws-cdk-lib/aws-fis';
# import * as iam from 'aws-cdk-lib/aws-iam';
# import { Construct } from 'constructs';
#
# export class ChaosExperimentConstruct extends Construct {
# constructor(scope: Construct, id: string, props: ChaosProps) {
# super(scope, id);
#
# // IAM Role for FIS
# const fisRole = new iam.Role(this, 'FISRole', {
# assumedBy: new iam.ServicePrincipal('fis.amazonaws.com'),
# managedPolicies: [
# iam.ManagedPolicy.fromAwsManagedPolicyName('PowerUserAccess'),
# ],
# });
#
# // FIS Experiment Template
# new fis.CfnExperimentTemplate(this, 'ChaosExperiment', {
# description: props.description,
# roleArn: fisRole.roleArn,
# stopConditions: [{
# source: 'aws:cloudwatch:alarm',
# value: props.stopAlarmArn,
# }],
# targets: {
# 'ec2-instances': {
# resourceType: 'aws:ec2:instance',
# selectionMode: 'COUNT(1)',
# resourceTags: { 'chaos': 'true' },
# },
# },
# actions: {
# 'stop-instance': {
# actionId: 'aws:ec2:stop-instances',
# parameters: { startInstancesAfterDuration: 'PT5M' },
# targets: { Instances: 'ec2-instances' },
# },
# },
# tags: { Environment: props.environment },
# });
# }
# }
from dataclasses import dataclass
@dataclass
class CDKConstruct:
    """Catalog entry describing one chaos-engineering CDK construct.

    A plain record: it carries no behavior beyond what ``@dataclass``
    generates (``__init__``, ``__repr__``, ``__eq__``).
    """

    construct: str  # construct class name, e.g. "ChaosEC2StopConstruct"
    level: str  # CDK construct level label, e.g. "L3 (Pattern)"
    resources: str  # free-text summary of the resources the construct bundles
    props: str  # comma-separated names of the props the construct accepts
    reusable: bool  # flag recorded for the entry; printed as-is in the report
# Catalog of chaos constructs; one CDKConstruct record per entry, with
# positional arguments in field order:
# (construct, level, resources, props, reusable).
constructs = [
    CDKConstruct(
        "ChaosEC2StopConstruct",
        "L3 (Pattern)",
        "FIS Template + IAM Role + CloudWatch Alarm",
        "targetTag, duration, stopAlarmArn, environment",
        True,
    ),
    CDKConstruct(
        "ChaosNetworkDisruptConstruct",
        "L3 (Pattern)",
        "FIS Template + IAM Role + VPC Subnet Target",
        "subnetIds, duration, stopAlarmArn",
        True,
    ),
    CDKConstruct(
        "ChaosECSConstruct",
        "L3 (Pattern)",
        "FIS Template + IAM Role + ECS Cluster Target",
        "clusterArn, duration, stopAlarmArn",
        True,
    ),
    CDKConstruct(
        "ChaosRDSFailoverConstruct",
        "L3 (Pattern)",
        "FIS Template + IAM Role + RDS Cluster Target",
        "dbClusterIdentifier, stopAlarmArn",
        True,
    ),
    CDKConstruct(
        "ChaosCPUStressConstruct",
        "L3 (Pattern)",
        "FIS Template + IAM Role + SSM Document",
        "targetTag, cpuPercent, duration, stopAlarmArn",
        True,
    ),
]

# Print a plain-text report: a header line, then four lines per entry.
print("=== CDK Chaos Constructs ===")
for c in constructs:
    print(f" [{c.construct}] Level: {c.level}")
    print(f" Resources: {c.resources}")
    print(f" Props: {c.props}")
    print(f" Reusable: {c.reusable}")
CI/CD Integration
# === Chaos in CI/CD Pipeline ===
# GitHub Actions
# name: Chaos Engineering Pipeline
# on:
# schedule:
# - cron: '0 10 * * 1' # Every Monday 10 AM
# workflow_dispatch:
# jobs:
# deploy-staging:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v4
# - run: npm ci && npx cdk deploy --app 'npx ts-node app.ts' ChaosStack
#
# run-chaos:
# needs: deploy-staging
# runs-on: ubuntu-latest
# steps:
# - name: Start FIS Experiment
# run: |
# EXPERIMENT_ID=$(aws fis start-experiment \
# --experiment-template-id $TEMPLATE_ID \
# --query 'experiment.id' --output text)
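# echo "Started: $EXPERIMENT_ID"
# # Persist the ID for later steps; each step runs in its own shell.
# echo "EXPERIMENT_ID=$EXPERIMENT_ID" >> "$GITHUB_ENV"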
# - name: Wait for Completion
# run: |
# aws fis get-experiment --id $EXPERIMENT_ID \
# --query 'experiment.state.status'
# - name: Verify Recovery
# run: |
# curl -f https://staging.example.com/health || exit 1
@dataclass
class PipelinePhase:
    """Record describing one phase of the chaos CI/CD pipeline.

    A plain record: it carries no behavior beyond what ``@dataclass``
    generates (``__init__``, ``__repr__``, ``__eq__``).
    """

    phase: str  # phase name, e.g. "Start Experiment"
    trigger: str  # what starts or drives the phase
    action: str  # what is done during the phase
    success_criteria: str  # condition under which the phase counts as passed
    failure_action: str  # follow-up to take when the criteria are not met
# Ordered pipeline phases; positional arguments follow the field order:
# (phase, trigger, action, success_criteria, failure_action).
phases = [
    PipelinePhase(
        "Pre-chaos Baseline",
        "Before Experiment",
        "บันทึก Metrics Baseline (Latency Error Rate Throughput)",
        "Baseline recorded ไม่มี Pre-existing Issues",
        "Fix Issues ก่อน Run Chaos",
    ),
    PipelinePhase(
        "Start Experiment",
        "aws fis start-experiment",
        "Run FIS Experiment ตาม Template",
        "Experiment Started สถานะ running",
        "ตรวจ IAM Permission Template Config",
    ),
    PipelinePhase(
        "Monitor During Chaos",
        "CloudWatch Dashboard",
        "ดู Metrics ระหว่าง Chaos ทำงาน",
        "Recovery ใน SLA Target (เช่น 2 นาที)",
        "Stop Condition Trigger หรือ Manual Stop",
    ),
    PipelinePhase(
        "Post-chaos Verify",
        "Health Check + Metrics",
        "ตรวจ Service ฟื้นตัวปกติ Metrics กลับ Baseline",
        "Health OK Metrics ปกติใน 5 นาที",
        "Investigate ทำไม Recovery ช้า/ล้มเหลว",
    ),
    PipelinePhase(
        "Report",
        "หลัง Experiment เสร็จ",
        "สร้าง Report สรุปผล Hypothesis vs Actual",
        "Report Generated Findings Documented",
        "สร้าง Action Items สำหรับ Improvement",
    ),
]

# Print a plain-text report: a header line, then four lines per phase.
print("=== CI/CD Chaos Pipeline ===")
for p in phases:
    print(f" [{p.phase}] Trigger: {p.trigger}")
    print(f" Action: {p.action}")
    print(f" Success: {p.success_criteria}")
    print(f" Failure: {p.failure_action}")
Best Practices
# === Chaos Best Practices ===
@dataclass
class BestPractice:
    """Record describing one chaos-engineering best practice.

    A plain record: it carries no behavior beyond what ``@dataclass``
    generates (``__init__``, ``__repr__``, ``__eq__``).
    """

    practice: str  # short name of the practice, e.g. "Start Small"
    why: str  # rationale for following the practice
    how: str  # how to apply the practice
    cdk_implementation: str  # CDK snippet (as text) illustrating the practice
# Best-practice catalog; positional arguments follow the field order:
# (practice, why, how, cdk_implementation).
practices = [
    BestPractice(
        "Always Set Stop Conditions",
        "ป้องกัน Blast Radius เกินที่คาดไว้",
        "CloudWatch Alarm ตรวจ Error Rate Latency",
        "stopConditions: [{ source: 'aws:cloudwatch:alarm', value: alarmArn }]",
    ),
    BestPractice(
        "Tag-based Targeting",
        "ควบคุม Target ชัดเจน ไม่กระทบ Resource อื่น",
        "ใช้ Tag chaos=true เฉพาะ Resource ที่ต้องการ",
        "resourceTags: { chaos: 'true', environment: 'staging' }",
    ),
    BestPractice(
        "Start Small",
        "ลด Risk เริ่มจาก 1 Instance/Container",
        "selectionMode COUNT(1) ก่อนเพิ่ม",
        "selectionMode: 'COUNT(1)' → 'PERCENT(25)'",
    ),
    BestPractice(
        "Staging First",
        "ตรวจ Experiment ทำงานถูกต้องก่อน Production",
        "Deploy Chaos Stack ไป Staging ก่อน",
        "new ChaosStack(app, 'ChaosStaging', { env: staging })",
    ),
    BestPractice(
        "Hypothesis-driven",
        "ต้องรู้ว่าคาดหวังอะไรก่อน Run",
        "เขียน Hypothesis เป็น Comment ใน CDK Code",
        "// Hypothesis: ASG launches new instance within 2 min",
    ),
]

# Print a plain-text report: a header line, then four lines per practice.
print("=== Best Practices ===")
for p in practices:
    print(f" [{p.practice}]")
    print(f" Why: {p.why}")
    print(f" How: {p.how}")
    print(f" CDK: {p.cdk_implementation}")
เคล็ดลับ
- Stop Condition: ทุก Experiment ต้องมี Stop Condition เสมอ
- Tag: ใช้ Tag chaos=true เลือก Target ชัดเจน
- Construct Library: สร้าง Reusable Construct เป็น npm Package
- Schedule: Run Chaos ทุกสัปดาห์ สร้าง Confidence
- Report: บันทึกผลทุก Experiment เป็น Knowledge Base
AWS CDK คืออะไร
IaC Framework TypeScript Python Java Construct L1 L2 L3 Stack App Synth Deploy Diff CloudFormation Type Safety IDE Testing
Chaos Engineering กับ CDK ทำอย่างไร
CDK สร้าง FIS Experiment Template IaC Custom Construct Reusable Target Tag IAM Stop Condition Git PR Review CI/CD Staging
FIS Experiment สร้างอย่างไร
CfnExperimentTemplate Actions stop-instances drain network-disrupt api-error Targets Tag selectionMode Stop CloudWatch Alarm Duration
Best Practices มีอะไร
Stop Condition เสมอ Tag Target Staging First Start Small Hypothesis Report Schedule Weekly Runbook Team Buy-in Documentation
สรุป
CDK Construct Chaos Engineering AWS FIS Custom Construct Reusable CI/CD Stop Condition Tag Target Staging Hypothesis Report Production
