Web Components Disaster Recovery Plan

DR Plan

Web Components Disaster Recovery Plan: Backup, RTO/RPO, Failover Architecture, Recovery Testing, the 3-2-1 Rule, Replication, Off-site copies, and Chaos Engineering for Production.

Component	RTO	RPO	Backup Method	Recovery Method
Database	15 min	5 min	WAL + Streaming Replica	Promote Replica
Application	5 min	0 (stateless)	Container Image Registry	Redeploy from Registry
Static Files	10 min	1 hr	S3 Cross-region Replication	Switch CDN Origin
Config/Secrets	5 min	0	Git + Vault	Reapply from Git/Vault
DNS	5 min	N/A	Multiple NS providers	Update DNS records
SSL Certs	5 min	N/A	Cert Manager + Vault	Reissue or restore

Backup Strategy

# === Backup Configuration ===

# PostgreSQL WAL Archiving
# postgresql.conf:
# wal_level = replica
# archive_mode = on
# archive_command = 'aws s3 cp %p s3://db-backup/wal/%f'
# max_wal_senders = 5

# Streaming Replication Setup
# primary: pg_hba.conf
# host replication replicator 10.0.0.0/8 md5
#
# replica:
# pg_basebackup -h primary -D /var/lib/postgresql/data -U replicator -P -R

# Daily Full Backup
# #!/bin/bash
# DATE=$(date +%Y%m%d)
# pg_dump -Fc mydb > /backup/mydb_${DATE}.dump
# aws s3 cp /backup/mydb_${DATE}.dump s3://db-backup/daily/
# find /backup -name "*.dump" -mtime +30 -delete

# File Backup with rsync
# rsync -avz --delete /var/www/html/ backup-server:/backup/www/
# aws s3 sync /var/www/uploads s3://file-backup/uploads/ --delete

# Config Backup
# git add -A && git commit -m "config backup $(date)" && git push

from dataclasses import dataclass

@dataclass
class BackupJob:
    """Describe one backup job of the disaster-recovery plan.

    All fields are free-form descriptive strings; nothing here parses or
    validates them. They are only printed in the backup report.
    """

    component: str  # what is being backed up, e.g. "PostgreSQL Full"
    method: str  # backup mechanism, e.g. "pg_dump" or "WAL Archiving"
    frequency: str  # schedule text, e.g. "Daily 02:00" or "Continuous"
    retention: str  # how long copies are kept, e.g. "30 days" or "N/A"
    destination: str  # where the copy is stored, e.g. "S3 ap-southeast-1"
    encryption: str  # protection applied, e.g. "AES-256" or "TLS"
    test_frequency: str  # how often the job is tested, e.g. "Monthly"

# Backup catalogue: one BackupJob per protected component. Keyword arguments
# are used so each value is labelled at the point of definition.
jobs = [
    BackupJob(
        component="PostgreSQL Full",
        method="pg_dump",
        frequency="Daily 02:00",
        retention="30 days",
        destination="S3 ap-southeast-1",
        encryption="AES-256",
        test_frequency="Monthly",
    ),
    BackupJob(
        component="PostgreSQL WAL",
        method="WAL Archiving",
        frequency="Continuous",
        retention="7 days",
        destination="S3 ap-southeast-1",
        encryption="AES-256",
        test_frequency="Monthly",
    ),
    BackupJob(
        component="PostgreSQL Replica",
        method="Streaming",
        frequency="Real-time",
        retention="N/A",
        destination="DR Site Tokyo",
        encryption="TLS",
        test_frequency="Quarterly",
    ),
    BackupJob(
        component="Uploads/Media",
        method="S3 Sync",
        frequency="Hourly",
        retention="90 days",
        destination="S3 cross-region",
        encryption="SSE-S3",
        test_frequency="Monthly",
    ),
    BackupJob(
        component="Config/IaC",
        method="Git Push",
        frequency="On change",
        retention="Unlimited",
        destination="GitHub Private",
        encryption="SSH",
        test_frequency="On deploy",
    ),
    BackupJob(
        component="Secrets",
        method="Vault Snapshot",
        frequency="Daily",
        retention="30 days",
        destination="S3 encrypted",
        encryption="AES-256",
        test_frequency="Monthly",
    ),
]

# Print a four-line summary for every job under a single heading.
print("=== Backup Jobs ===")
for job in jobs:
    summary = (
        f"  [{job.component}] Method: {job.method}",
        f"    Freq: {job.frequency} | Retention: {job.retention}",
        f"    Dest: {job.destination} | Encrypt: {job.encryption}",
        f"    Test: {job.test_frequency}",
    )
    print(*summary, sep="\n")

Failover Architecture

# === Multi-region Failover ===

# DNS Failover (Route53 / Cloudflare)
# Primary: ap-southeast-1 (Singapore) — Active
# DR: ap-northeast-1 (Tokyo) — Standby
# Health Check: HTTP /health every 30s
# Failover: Auto switch DNS when primary unhealthy

# Kubernetes Multi-cluster
# kubectl config use-context primary-cluster
# kubectl get pods  # Running workloads
#
# kubectl config use-context dr-cluster
# kubectl get pods  # Standby workloads (scaled to 0 or warm)
#
# Failover script:
# kubectl --context dr-cluster scale deploy/web --replicas=3
# kubectl --context dr-cluster scale deploy/api --replicas=3
# # Update DNS to DR cluster IP

# Database Failover
# On DR site:
# pg_ctl promote -D /var/lib/postgresql/data
# # Replica becomes new primary

@dataclass
class FailoverStep:
    """Describe one step of the failover runbook.

    The fields are descriptive only; the runbook report prints them and maps
    ``automated`` to the label "Auto" or "Manual".
    """

    step: int  # position of the step in the runbook (1, 2, 3, ...)
    action: str  # what has to be done, e.g. "Promote DB replica"
    responsible: str  # who or what performs it, e.g. "On-call Engineer"
    time_estimate: str  # free-form duration text, e.g. "1-2 min" or "Ongoing"
    automated: bool  # True when the step is automated, False when manual
    verification: str  # how completion is confirmed, e.g. "DNS propagated"

# Failover runbook, listed in execution order.
steps = [
    FailoverStep(
        step=1,
        action="Detect failure — monitoring alert",
        responsible="Monitoring System",
        time_estimate="1-2 min",
        automated=True,
        verification="Alert received in Opsgenie",
    ),
    FailoverStep(
        step=2,
        action="Assess scope — determine affected components",
        responsible="On-call Engineer",
        time_estimate="2-5 min",
        automated=False,
        verification="Impact assessment complete",
    ),
    FailoverStep(
        step=3,
        action="Activate DR — switch DNS to DR site",
        responsible="On-call Engineer",
        time_estimate="1-2 min",
        automated=True,
        verification="DNS propagated",
    ),
    FailoverStep(
        step=4,
        action="Promote DB replica",
        responsible="DBA / Script",
        time_estimate="2-5 min",
        automated=True,
        verification="DB accepting writes",
    ),
    FailoverStep(
        step=5,
        action="Scale DR workloads",
        responsible="DevOps / Script",
        time_estimate="2-3 min",
        automated=True,
        verification="All pods running",
    ),
    FailoverStep(
        step=6,
        action="Verify services — health checks pass",
        responsible="On-call Engineer",
        time_estimate="3-5 min",
        automated=True,
        verification="All health checks green",
    ),
    FailoverStep(
        step=7,
        action="Notify stakeholders",
        responsible="Incident Commander",
        time_estimate="5 min",
        automated=True,
        verification="Status page updated",
    ),
    FailoverStep(
        step=8,
        action="Monitor DR site",
        responsible="Team",
        time_estimate="Ongoing",
        automated=False,
        verification="Metrics stable 30 min",
    ),
]

# Print a three-line summary for every runbook step.
print("=== Failover Runbook ===")
# NOTE(review): total_time is initialised but never updated or read in this
# snippet; kept so the module-level name still exists.
total_time = 0
for entry in steps:
    mode = "Manual" if not entry.automated else "Auto"
    summary = (
        f"  Step {entry.step}: {entry.action}",
        f"    Who: {entry.responsible} | Time: {entry.time_estimate} | {mode}",
        f"    Verify: {entry.verification}",
    )
    print(*summary, sep="\n")

DR Testing

# === DR Test Plan ===

@dataclass
class DRTest:
    """Describe one recurring disaster-recovery test exercise.

    All fields are free-form descriptive strings that the test-plan report
    prints as-is.
    """

    test_type: str  # name of the exercise, e.g. "Tabletop Exercise"
    frequency: str  # how often it runs, e.g. "Monthly" or "Quarterly"
    scope: str  # what the exercise covers, e.g. "Restore DB from latest backup"
    duration: str  # planned length, e.g. "1 hour"
    participants: str  # who takes part, e.g. "DBA + DevOps"
    success_criteria: str  # pass condition, e.g. "DB restored, data verified"

# DR test catalogue: one DRTest per recurring exercise.
tests = [
    DRTest(
        test_type="Tabletop Exercise",
        frequency="Monthly",
        scope="Discuss scenarios, review runbook",
        duration="1 hour",
        participants="All engineers + management",
        success_criteria="Everyone knows their role",
    ),
    DRTest(
        test_type="Backup Restore",
        frequency="Monthly",
        scope="Restore DB from latest backup",
        duration="30 min",
        participants="DBA + DevOps",
        success_criteria="DB restored, data verified",
    ),
    DRTest(
        test_type="Partial Failover",
        frequency="Quarterly",
        scope="Failover one component to DR",
        duration="2 hours",
        participants="DevOps + Backend",
        success_criteria="Component running on DR, traffic flowing",
    ),
    DRTest(
        test_type="Full Failover",
        frequency="Bi-annually",
        scope="Complete switch to DR site",
        duration="4 hours",
        participants="All teams",
        success_criteria="All services running on DR, RTO met",
    ),
    DRTest(
        test_type="Chaos Engineering",
        frequency="Monthly",
        scope="Kill random pods/nodes",
        duration="1 hour",
        participants="DevOps + SRE",
        success_criteria="System self-heals, no customer impact",
    ),
]

# Print a four-line summary for every planned exercise.
print("DR Test Plan:")
for exercise in tests:
    summary = (
        f"  [{exercise.test_type}] {exercise.frequency}",
        f"    Scope: {exercise.scope}",
        f"    Duration: {exercise.duration} | Team: {exercise.participants}",
        f"    Pass: {exercise.success_criteria}",
    )
    print(*summary, sep="\n")

# Snapshot of DR health indicators, keyed by display label. Insertion order
# is the order in which the report prints them.
dr_metrics = {
    # Most recent exercises
    "Last Full DR Test": "2024-10-15 — Passed (RTO: 18 min)",
    "Last Backup Restore": "2024-11-01 — Passed (12 min)",
    # Backup reliability
    "Backup Success Rate": "99.8% (2 failures in 365 days)",
    # Figures observed during the last real incident
    "Actual RTO (last incident)": "22 minutes",
    "Actual RPO (last incident)": "3 minutes",
    # Readiness and documentation state
    "DR Site Readiness": "Warm standby, auto-scale ready",
    "Runbook Last Updated": "2024-11-15",
}

# Two leading newlines separate this section from any preceding output.
print("\n\nDR Metrics:")
for label, status in dr_metrics.items():
    print(f"  [{label}]: {status}")

เคล็ดลับ

3-2-1: ใช้ 3-2-1 Rule สำหรับ Backup เสมอ
Test: ทดสอบ DR Plan ทุก Quarter ไม่ใช่แค่เขียนแผน
Automate: Automate Failover Steps ให้มากที่สุด
Runbook: เขียน Runbook ให้ชัดเจน ทุกคนทำตามได้
Monitor: Monitor Backup Status ทุกวัน Alert เมื่อ Failed

Disaster Recovery Plan คืออะไร

Disaster Recovery Plan คือแผนกู้คืนระบบเมื่อเกิดเหตุ เช่น Server ล่ม ไฟไหม้ หรือ Ransomware โดยกำหนด RTO, RPO และขั้นตอนกู้คืน ควรทดสอบทุก Quarter และอัปเดตแผนเมื่อระบบหรือ Business Impact เปลี่ยน