Web Components Disaster Recovery Plan —
DR Plan
Web Components Disaster Recovery Plan Backup RTO RPO Failover Architecture Recovery Testing 3-2-1 Rule Replication Off-site Chaos Engineering Production
| Component | RTO | RPO | Backup Method | Recovery Method |
|---|---|---|---|---|
| Database | 15 min | 5 min | WAL + Streaming Replica | Promote Replica |
| Application | 5 min | 0 (stateless) | Container Image Registry | Redeploy from Registry |
| Static Files | 10 min | 1 hr | S3 Cross-region Replication | Switch CDN Origin |
| Config/Secrets | 5 min | 0 | Git + Vault | Reapply from Git/Vault |
| DNS | 5 min | N/A | Multiple NS providers | Update DNS records |
| SSL Certs | 5 min | N/A | Cert Manager + Vault | Reissue or restore |
Backup Strategy
# === Backup Configuration ===
# PostgreSQL WAL Archiving
# postgresql.conf:
# wal_level = replica
# archive_mode = on
# archive_command = 'aws s3 cp %p s3://db-backup/wal/%f'
# max_wal_senders = 5
# Streaming Replication Setup
# primary: pg_hba.conf
# host replication replicator 10.0.0.0/8 md5
#
# replica:
# pg_basebackup -h primary -D /var/lib/postgresql/data -U replicator -P -R
# Daily Full Backup
# #!/bin/bash
# DATE=$(date +%Y%m%d)
# pg_dump -Fc mydb > /backup/mydb_${DATE}.dump
# aws s3 cp /backup/mydb_${DATE}.dump s3://db-backup/daily/
# find /backup -name "*.dump" -mtime +30 -delete
# File Backup with rsync
# rsync -avz --delete /var/www/html/ backup-server:/backup/www/
# aws s3 sync /var/www/uploads s3://file-backup/uploads/ --delete
# Config Backup
# git add -A && git commit -m "config backup $(date)" && git push
from dataclasses import dataclass
@dataclass
class BackupJob:
    """One backup job listed in the disaster-recovery plan.

    Every field is a free-form display string; nothing is parsed or
    validated by this class.
    """

    component: str  # what is backed up, e.g. "PostgreSQL Full"
    method: str  # tool or mechanism, e.g. "pg_dump"
    frequency: str  # schedule, e.g. "Daily 02:00"
    retention: str  # how long copies are kept, e.g. "30 days"
    destination: str  # where copies are stored
    encryption: str  # encryption label, e.g. "AES-256"
    test_frequency: str  # how often the backup is tested, e.g. "Monthly"
# Backup jobs covered by the plan, one entry per protected component.
# Positional order follows the BackupJob fields: component, method,
# frequency, retention, destination, encryption, test_frequency.
jobs = [
    BackupJob(
        "PostgreSQL Full", "pg_dump", "Daily 02:00", "30 days",
        "S3 ap-southeast-1", "AES-256", "Monthly",
    ),
    BackupJob(
        "PostgreSQL WAL", "WAL Archiving", "Continuous", "7 days",
        "S3 ap-southeast-1", "AES-256", "Monthly",
    ),
    BackupJob(
        "PostgreSQL Replica", "Streaming", "Real-time", "N/A",
        "DR Site Tokyo", "TLS", "Quarterly",
    ),
    BackupJob(
        "Uploads/Media", "S3 Sync", "Hourly", "90 days",
        "S3 cross-region", "SSE-S3", "Monthly",
    ),
    BackupJob(
        "Config/IaC", "Git Push", "On change", "Unlimited",
        "GitHub Private", "SSH", "On deploy",
    ),
    BackupJob(
        "Secrets", "Vault Snapshot", "Daily", "30 days",
        "S3 encrypted", "AES-256", "Monthly",
    ),
]

# Print a four-line summary for every job.
print("=== Backup Jobs ===")
for j in jobs:
    print(f" [{j.component}] Method: {j.method}")
    print(f" Freq: {j.frequency} | Retention: {j.retention}")
    print(f" Dest: {j.destination} | Encrypt: {j.encryption}")
    print(f" Test: {j.test_frequency}")
Failover Architecture
# === Multi-region Failover ===
# DNS Failover (Route53 / Cloudflare)
# Primary: ap-southeast-1 (Singapore) — Active
# DR: ap-northeast-1 (Tokyo) — Standby
# Health Check: HTTP /health every 30s
# Failover: Auto switch DNS when primary unhealthy
# Kubernetes Multi-cluster
# kubectl config use-context primary-cluster
# kubectl get pods # Running workloads
#
# kubectl config use-context dr-cluster
# kubectl get pods # Standby workloads (scaled to 0 or warm)
#
# Failover script:
# kubectl --context dr-cluster scale deploy/web --replicas=3
# kubectl --context dr-cluster scale deploy/api --replicas=3
# # Update DNS to DR cluster IP
# Database Failover
# On DR site:
# pg_ctl promote -D /var/lib/postgresql/data
# # Replica becomes new primary
@dataclass
class FailoverStep:
    """One numbered step of the failover runbook.

    The text fields are display strings only; nothing is parsed or
    validated by this class.
    """

    step: int  # position of the step in the runbook
    action: str  # what has to be done
    responsible: str  # person, team or system that performs the step
    time_estimate: str  # free-form estimate, e.g. "1-2 min" or "Ongoing"
    automated: bool  # True when the step is marked as automated
    verification: str  # observable condition that confirms the step is done
# Ordered failover runbook. Positional order follows the FailoverStep
# fields: step, action, responsible, time_estimate, automated, verification.
steps = [
    FailoverStep(
        1, "Detect failure — monitoring alert", "Monitoring System",
        "1-2 min", True, "Alert received in Opsgenie",
    ),
    FailoverStep(
        2, "Assess scope — determine affected components", "On-call Engineer",
        "2-5 min", False, "Impact assessment complete",
    ),
    FailoverStep(
        3, "Activate DR — switch DNS to DR site", "On-call Engineer",
        "1-2 min", True, "DNS propagated",
    ),
    FailoverStep(
        4, "Promote DB replica", "DBA / Script",
        "2-5 min", True, "DB accepting writes",
    ),
    FailoverStep(
        5, "Scale DR workloads", "DevOps / Script",
        "2-3 min", True, "All pods running",
    ),
    FailoverStep(
        6, "Verify services — health checks pass", "On-call Engineer",
        "3-5 min", True, "All health checks green",
    ),
    FailoverStep(
        7, "Notify stakeholders", "Incident Commander",
        "5 min", True, "Status page updated",
    ),
    FailoverStep(
        8, "Monitor DR site", "Team",
        "Ongoing", False, "Metrics stable 30 min",
    ),
]

print("=== Failover Runbook ===")
# NOTE(review): total_time is initialised but never accumulated or read in
# the visible source (the estimates are free-form strings); kept so the
# module-level name still exists.
total_time = 0
for s in steps:
    # Render the automation flag as a short label.
    auto = "Auto" if s.automated else "Manual"
    print(f" Step {s.step}: {s.action}")
    print(f" Who: {s.responsible} | Time: {s.time_estimate} | {auto}")
    print(f" Verify: {s.verification}")
DR Testing
# === DR Test Plan ===
@dataclass
class DRTest:
    """One recurring exercise in the disaster-recovery test plan.

    Every field is a free-form display string; nothing is parsed or
    validated by this class.
    """

    test_type: str  # name of the exercise, e.g. "Tabletop Exercise"
    frequency: str  # how often it runs, e.g. "Monthly"
    scope: str  # what the exercise covers
    duration: str  # planned length, e.g. "1 hour"
    participants: str  # who takes part
    success_criteria: str  # condition under which the exercise passes
# Recurring DR exercises. Positional order follows the DRTest fields:
# test_type, frequency, scope, duration, participants, success_criteria.
tests = [
    DRTest(
        "Tabletop Exercise", "Monthly", "Discuss scenarios, review runbook", "1 hour",
        "All engineers + management", "Everyone knows their role",
    ),
    DRTest(
        "Backup Restore", "Monthly", "Restore DB from latest backup", "30 min",
        "DBA + DevOps", "DB restored, data verified",
    ),
    DRTest(
        "Partial Failover", "Quarterly", "Failover one component to DR", "2 hours",
        "DevOps + Backend", "Component running on DR, traffic flowing",
    ),
    DRTest(
        "Full Failover", "Bi-annually", "Complete switch to DR site", "4 hours",
        "All teams", "All services running on DR, RTO met",
    ),
    DRTest(
        "Chaos Engineering", "Monthly", "Kill random pods/nodes", "1 hour",
        "DevOps + SRE", "System self-heals, no customer impact",
    ),
]

# Print a four-line summary for every exercise.
print("DR Test Plan:")
for t in tests:
    print(f" [{t.test_type}] {t.frequency}")
    print(f" Scope: {t.scope}")
    print(f" Duration: {t.duration} | Team: {t.participants}")
    print(f" Pass: {t.success_criteria}")
# Snapshot of DR readiness figures, keyed by display label.
# Both keys and values are printed verbatim below.
dr_metrics = {
    "Last Full DR Test": "2024-10-15 — Passed (RTO: 18 min)",
    "Last Backup Restore": "2024-11-01 — Passed (12 min)",
    "Backup Success Rate": "99.8% (2 failures in 365 days)",
    "Actual RTO (last incident)": "22 minutes",
    "Actual RPO (last incident)": "3 minutes",
    "DR Site Readiness": "Warm standby, auto-scale ready",
    "Runbook Last Updated": "2024-11-15",
}

# Two leading newlines separate this section from the preceding output.
print("\n\nDR Metrics:")
for k, v in dr_metrics.items():
    print(f" [{k}]: {v}")
เคล็ดลับ
- 3-2-1: ใช้ 3-2-1 Rule สำหรับ Backup เสมอ
- Test: ทดสอบ DR Plan ทุก Quarter ไม่ใช่แค่เขียนแผน
- Automate: Automate Failover Steps ให้มากที่สุด
- Runbook: เขียน Runbook ชัดเจน ทุกคนทำตามได้
- Monitor: Monitor Backup Status ทุกวัน Alert เมื่อ Failed
Disaster Recovery Plan คืออะไร
แผนกู้คืนระบบ Server ล่ม ไฟไหม้ Ransomware RTO RPO ขั้นตอนกู้คืน ทดสอบ Quarter อัปเดตเมื่อเปลี่ยน Business Impact