Supabase Realtime Disaster Recovery Plan คืออะไร
Supabase เป็น open source Firebase alternative ที่สร้างบน PostgreSQL มี Realtime feature สำหรับ subscribe การเปลี่ยนแปลงข้อมูลแบบ real-time ผ่าน WebSocket ส่วน Disaster Recovery (DR) Plan คือแผนกู้คืนระบบเมื่อเกิดเหตุการณ์ร้ายแรง เช่น server down, data corruption หรือ region outage การรวมสองแนวคิดนี้ช่วยให้ Supabase Realtime applications มีความพร้อมใช้งานสูง กู้คืนได้เร็ว และไม่สูญเสียข้อมูลสำคัญ บทความนี้อธิบาย architecture, DR strategies, implementation และ testing สำหรับ Supabase Realtime systems
Supabase Realtime Architecture
# supabase_arch.py — Supabase Realtime architecture
import json
class SupabaseRealtimeArch:
    """Printable overview of the Supabase services and the Realtime data flow.

    The class only holds static reference data (``COMPONENTS`` and
    ``REALTIME_FLOW``) and two helpers that write that data to stdout.
    """

    # Service catalogue. Each entry carries a display name, a short
    # description, and the disaster-recovery concern for that service.
    COMPONENTS = dict(
        postgresql=dict(
            name="PostgreSQL Database",
            description="Core database — เก็บข้อมูลทั้งหมด, Row Level Security (RLS)",
            dr_concern="Data loss, corruption, replication lag",
        ),
        realtime=dict(
            name="Realtime Server",
            description="Listen to PostgreSQL WAL → broadcast changes via WebSocket",
            dr_concern="Connection loss, message loss, subscription state",
        ),
        auth=dict(
            name="GoTrue (Auth)",
            description="User authentication — JWT tokens, OAuth, Magic links",
            dr_concern="Token validation, session state",
        ),
        storage=dict(
            name="Storage (S3-compatible)",
            description="File storage — images, documents, uploads",
            dr_concern="File loss, access permissions",
        ),
        edge_functions=dict(
            name="Edge Functions (Deno)",
            description="Serverless functions — business logic, webhooks",
            dr_concern="Function code, environment variables, secrets",
        ),
        postgrest=dict(
            name="PostgREST (API)",
            description="Auto-generated REST API จาก PostgreSQL schema",
            dr_concern="API availability, schema changes",
        ),
    )

    # Step-by-step description of how a change travels from the database to
    # a subscribed client. The leading and trailing empty items reproduce the
    # newline at each end of the text.
    REALTIME_FLOW = "\n".join((
        "",
        "Realtime Data Flow:",
        "1. Client subscribes to channel via WebSocket",
        "2. PostgreSQL WAL (Write-Ahead Log) captures changes",
        "3. Realtime server reads WAL via logical replication",
        "4. Server broadcasts changes to subscribed clients",
        "5. Client receives real-time updates",
        "Key: WAL is the source of truth for Realtime",
        "",
    ))

    def show_components(self):
        """Print every component with its description and DR concern."""
        print("=== Supabase Components ===\n")
        for component in self.COMPONENTS.values():
            # One call per component; the empty final item yields the blank
            # separator line after each entry.
            print(
                f"[{component['name']}]",
                f" {component['description']}",
                f" DR: {component['dr_concern']}",
                "",
                sep="\n",
            )

    def show_flow(self):
        """Print the Realtime data-flow description under its heading."""
        print("=== Realtime Flow ===", self.REALTIME_FLOW, sep="\n")
# Demo driver: build the architecture overview and print the component
# catalogue followed by the Realtime data flow.
arch = SupabaseRealtimeArch()
arch.show_components()
arch.show_flow()
Disaster Recovery Strategies
# dr_strategies.py — DR strategies for Supabase
import json
class DRStrategies:
    """Catalogue of disaster-recovery strategies with RPO/RTO/cost trade-offs.

    ``STRATEGIES`` is static reference data; the methods only format it and
    write it to stdout.
    """

    # Strategy key -> display name, recovery point/time objectives, relative
    # cost, mechanism, and the kind of project it suits.
    STRATEGIES = {
        "backup_restore": {
            "name": "1. Backup & Restore",
            "rpo": "1-24 hours (ขึ้นกับ backup frequency)",
            "rto": "1-4 hours",
            "cost": "ต่ำ",
            "how": "pg_dump scheduled backups + point-in-time recovery (PITR)",
            "suitable": "Small-medium projects, acceptable downtime",
        },
        "warm_standby": {
            "name": "2. Warm Standby",
            "rpo": "< 5 minutes (streaming replication lag)",
            "rto": "15-30 minutes",
            "cost": "ปานกลาง",
            "how": "PostgreSQL streaming replication → standby ready to promote",
            "suitable": "Production applications requiring moderate availability",
        },
        "hot_standby": {
            "name": "3. Hot Standby (Active-Passive)",
            "rpo": "< 1 minute (synchronous replication)",
            "rto": "< 5 minutes",
            "cost": "สูง",
            "how": "Synchronous replication + auto-failover (Patroni/pg_auto_failover)",
            "suitable": "Mission-critical applications",
        },
        "multi_region": {
            "name": "4. Multi-Region Active-Active",
            "rpo": "Near-zero",
            "rto": "< 1 minute",
            "cost": "สูงมาก",
            "how": "Multiple Supabase instances + global load balancer + conflict resolution",
            "suitable": "Global applications requiring zero downtime",
        },
    }

    # (column header, STRATEGIES field) pairs in display order.
    _COMPARISON_COLUMNS = (
        ("Strategy", "name"),
        ("RPO", "rpo"),
        ("RTO", "rto"),
        ("Cost", "cost"),
    )

    def show_strategies(self):
        """Print each strategy with its RPO, RTO, cost and mechanism."""
        print("=== DR Strategies ===\n")
        for strat in self.STRATEGIES.values():
            print(f"[{strat['name']}]")
            print(f" RPO: {strat['rpo']} | RTO: {strat['rto']} | Cost: {strat['cost']}")
            print(f" How: {strat['how']}")
            print()

    def comparison(self):
        """Print an aligned Strategy/RPO/RTO/Cost table.

        Column widths are derived from the data rather than fixed, because
        several RPO values are far longer than a 15-character field and would
        otherwise push the following columns out of line. Names are printed
        in full instead of being cut to 25 characters.
        """
        headers = [title for title, _ in self._COMPARISON_COLUMNS]
        rows = [
            [strat[field] for _, field in self._COMPARISON_COLUMNS]
            for strat in self.STRATEGIES.values()
        ]
        # zip(headers, *rows) transposes the table so each item is one column
        # (header first); with no rows the widths fall back to the headers.
        widths = [
            max(self._display_width(cell) for cell in column)
            for column in zip(headers, *rows)
        ]

        print("=== Strategy Comparison ===")
        print(f" {self._format_row(headers, widths)}")
        print(f" {'-' * (sum(widths) + len(widths) - 1)}")
        for row in rows:
            print(f" {self._format_row(row, widths)}")

    @classmethod
    def _format_row(cls, cells, widths):
        """Join cells with single spaces, padding each to its column width."""
        padded = (
            cell + " " * (width - cls._display_width(cell))
            for cell, width in zip(cells, widths)
        )
        # Trailing padding on the last column carries no information.
        return " ".join(padded).rstrip()

    @staticmethod
    def _display_width(text):
        """Return the number of terminal cells *text* is expected to occupy.

        Combining marks (e.g. Thai tone marks and upper/lower vowels) and
        format characters are counted as zero width, so padding by this value
        keeps columns aligned where ``len()`` would over-count. East Asian
        wide characters are not special-cased.
        """
        import unicodedata

        zero_width = ("Mn", "Me", "Cf")
        return sum(1 for ch in text if unicodedata.category(ch) not in zero_width)
# Demo driver: print the detailed strategy list, then the comparison table.
dr = DRStrategies()
dr.show_strategies()
dr.comparison()
Implementation
# implementation.py — DR implementation for Supabase
import json
class DRImplementation:
    """Holds two sample DR scripts as text and prints a preview of each.

    ``BACKUP_SCRIPT`` and ``FAILOVER_SCRIPT`` are source listings, not code
    that this class executes. They are kept syntactically valid so they can
    be copied into their own files.
    """

    # Sample backup/restore script (depends on boto3, pg_dump and psql).
    BACKUP_SCRIPT = """
# backup.py — Automated Supabase backup
import gzip
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timezone

import boto3


class SupabaseBackup:
    def __init__(self, db_url, s3_bucket):
        self.db_url = db_url
        self.s3_bucket = s3_bucket
        self.s3 = boto3.client('s3')

    def pg_dump(self):
        '''Full database backup'''
        timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
        filename = f"supabase_backup_{timestamp}.sql.gz"
        # The temporary directory is removed even when dump or upload fails
        with tempfile.TemporaryDirectory() as workdir:
            local_path = os.path.join(workdir, filename)
            # Argument list instead of a shell pipeline: the exit status is
            # pg_dump's own (a `pg_dump | gzip` pipe reports gzip's), and the
            # connection string is never interpreted by a shell.
            result = subprocess.run(
                ['pg_dump', '--dbname', self.db_url,
                 '--compress', '9', '--file', local_path],
                capture_output=True,
            )
            if result.returncode != 0:
                raise RuntimeError(f"pg_dump failed: {result.stderr.decode()}")
            # Upload to S3
            self.s3.upload_file(
                local_path,
                self.s3_bucket,
                f"backups/daily/{filename}",
            )
        return filename

    def wal_archive(self):
        '''Enable WAL archiving for PITR'''
        config = '''
# postgresql.conf additions
archive_mode = on
archive_command = 'aws s3 cp %p s3://bucket/wal/%f'
wal_level = replica
max_wal_senders = 5
'''
        return config

    def restore(self, backup_file, target_time=None):
        '''Restore from backup'''
        if target_time is not None:
            # A pg_dump file is a logical snapshot; WAL cannot be replayed on
            # top of it. PITR needs a physical base backup + archived WAL.
            raise NotImplementedError(
                'point-in-time recovery requires a base backup and WAL '
                'archive, not a pg_dump file'
            )
        with tempfile.TemporaryDirectory() as workdir:
            gz_path = os.path.join(workdir, 'restore.sql.gz')
            sql_path = os.path.join(workdir, 'restore.sql')
            # Download from S3
            self.s3.download_file(self.s3_bucket, backup_file, gz_path)
            with gzip.open(gz_path, 'rb') as src, open(sql_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            # ON_ERROR_STOP makes psql exit non-zero on the first failed
            # statement instead of reporting success for a partial restore
            result = subprocess.run(
                ['psql', '--dbname', self.db_url,
                 '--set', 'ON_ERROR_STOP=1', '--file', sql_path],
                capture_output=True,
            )
        return result.returncode == 0


# Schedule: daily full backup + continuous WAL archiving
backup = SupabaseBackup(
    db_url="postgresql://user:pass@db:5432/supabase",
    s3_bucket="supabase-backups",
)
"""

    # Sample health-check / failover loop (depends on requests).
    FAILOVER_SCRIPT = """
# failover.py — Automated failover
import time

import requests


class FailoverManager:
    def __init__(self, primary_url, standby_url, health_check_interval=10):
        self.primary = primary_url
        self.standby = standby_url
        self.interval = health_check_interval
        self.failed_checks = 0
        self.threshold = 3

    def health_check(self, url):
        try:
            resp = requests.get(f"{url}/rest/v1/", timeout=5)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    def monitor(self):
        while True:
            if self.health_check(self.primary):
                self.failed_checks = 0
            else:
                self.failed_checks += 1
                if self.failed_checks >= self.threshold:
                    self.failover()
                    # Stop after one failover: the old primary is still down,
                    # so looping on would promote again every few checks.
                    # Restart monitoring once a new standby has been rebuilt.
                    return
            time.sleep(self.interval)

    def failover(self):
        print("FAILOVER: Promoting standby to primary")
        # 1. Promote standby PostgreSQL
        # 2. Update DNS to point to standby
        # 3. Restart Realtime server with new DB
        # 4. Notify team
        self.notify_team("Failover executed")

    def notify_team(self, message):
        # Placeholder: wire this to Slack / PagerDuty / email
        print(f"NOTIFY: {message}")
"""

    def show_backup(self):
        """Print a heading and the first 600 characters of the backup script."""
        print("=== Backup Script ===")
        print(self.BACKUP_SCRIPT[:600])

    def show_failover(self):
        """Print a heading and the first 500 characters of the failover script."""
        print("\n=== Failover Script ===")
        print(self.FAILOVER_SCRIPT[:500])
# Demo driver: print the backup script preview, then the failover preview.
impl = DRImplementation()
impl.show_backup()
impl.show_failover()
Realtime Recovery
# realtime_recovery.py — Realtime-specific recovery
import json
import random
class RealtimeRecovery:
    """Realtime-specific recovery challenges plus a sample JS client.

    ``CHALLENGES`` is static reference data and ``CLIENT_RECOVERY`` is a
    JavaScript listing held as text; the methods only print them.
    """

    # Challenge key -> display name, problem statement and suggested solution.
    CHALLENGES = {
        "subscription_state": {
            "name": "Subscription State Loss",
            "problem": "เมื่อ Realtime server restart → clients disconnect → subscription state หาย",
            "solution": "Client-side reconnection logic + re-subscribe + fetch missed events",
        },
        "message_gap": {
            "name": "Message Gap",
            "problem": "Events ที่เกิดระหว่าง failover อาจหายไป",
            "solution": "ใช้ timestamp-based catch-up query หลัง reconnect",
        },
        "wal_position": {
            "name": "WAL Position Tracking",
            "problem": "Realtime server ต้องรู้ว่าอ่าน WAL ถึงไหนแล้ว",
            "solution": "Store WAL position ใน persistent storage → resume from correct position",
        },
    }

    # Sample supabase-js client. The channel name interpolates the table name
    # so that clients for different tables do not share one channel topic.
    CLIENT_RECOVERY = """
// client_recovery.js — Supabase client reconnection
import { createClient } from '@supabase/supabase-js'
const supabase = createClient(SUPABASE_URL, SUPABASE_KEY)
class RealtimeRecoveryClient {
constructor(table, callback) {
this.table = table
this.callback = callback
this.lastEventTime = new Date().toISOString()
this.channel = null
}
subscribe() {
this.channel = supabase
.channel(`${this.table}-changes`)
.on('postgres_changes',
{ event: '*', schema: 'public', table: this.table },
(payload) => {
this.lastEventTime = new Date().toISOString()
this.callback(payload)
}
)
.subscribe((status) => {
if (status === 'SUBSCRIBED') {
// Catch up on missed events
this.catchUp()
}
})
}
async catchUp() {
// Fetch events that happened while disconnected
const { data } = await supabase
.from(this.table)
.select('*')
.gte('updated_at', this.lastEventTime)
.order('updated_at', { ascending: true })
if (data) {
data.forEach(row => this.callback({
eventType: 'CATCH_UP', new: row
}))
}
}
}
const recovery = new RealtimeRecoveryClient('messages', (payload) => {
console.log('Change:', payload)
})
recovery.subscribe()
"""

    def show_challenges(self):
        """Print each recovery challenge with its problem and solution."""
        print("=== Realtime Recovery Challenges ===\n")
        for challenge in self.CHALLENGES.values():
            print(f"[{challenge['name']}]")
            print(f" Problem: {challenge['problem']}")
            print(f" Solution: {challenge['solution']}")
            print()

    def show_client(self):
        """Print a heading and the first 500 characters of the JS client."""
        print("=== Client Recovery Code ===")
        print(self.CLIENT_RECOVERY[:500])
# Demo driver: print the recovery challenges, then the JS client preview.
recovery = RealtimeRecovery()
recovery.show_challenges()
recovery.show_client()
DR Testing & Runbooks
# dr_testing.py — DR testing framework
import json
import random
class DRTesting:
    """DR test scenarios, an incident runbook, and a sample metrics report.

    ``TEST_SCENARIOS`` and ``RUNBOOK`` are static reference data; the methods
    format them for stdout. ``dr_metrics`` prints randomly generated
    placeholder numbers, not measurements.
    """

    # Scenario key -> display name, how often to run it, and ordered steps.
    TEST_SCENARIOS = {
        "db_failover": {
            "name": "Database Failover Test",
            "frequency": "Monthly",
            "steps": [
                "1. Announce test to team (non-production first)",
                "2. Stop primary PostgreSQL",
                "3. Verify standby promotes automatically",
                "4. Verify Realtime reconnects to new primary",
                "5. Verify data consistency (row counts, checksums)",
                "6. Measure actual RTO and RPO",
                "7. Document results + issues",
            ],
        },
        "backup_restore": {
            "name": "Backup Restore Test",
            "frequency": "Weekly",
            "steps": [
                "1. Take latest backup",
                "2. Restore to test environment",
                "3. Verify data integrity (compare row counts)",
                "4. Verify application functionality",
                "5. Measure restore time",
            ],
        },
        "realtime_reconnect": {
            "name": "Realtime Reconnection Test",
            "frequency": "Weekly",
            "steps": [
                "1. Connect 100+ WebSocket clients",
                "2. Restart Realtime server",
                "3. Verify all clients reconnect within 30 seconds",
                "4. Verify no messages lost (catch-up query works)",
                "5. Verify subscription state restored",
            ],
        },
    }

    # Incident key -> display name, severity and ordered response steps.
    RUNBOOK = {
        "primary_down": {
            "name": "Primary Database Down",
            "severity": "P1",
            "steps": [
                "1. Verify primary is actually down (not network issue)",
                "2. Check if auto-failover triggered",
                "3. If not → manually promote standby: pg_ctl promote",
                "4. Update Supabase config to point to new primary",
                "5. Restart Realtime + PostgREST services",
                "6. Verify client connectivity + data integrity",
                "7. Investigate root cause",
                "8. Rebuild new standby from promoted primary",
            ],
        },
    }

    def show_tests(self, max_steps=4):
        """Print every test scenario with its frequency and steps.

        Args:
            max_steps: Maximum number of steps to print per scenario, or
                ``None`` to print all of them. When steps are left out, a
                final line states how many, so a shortened list cannot be
                mistaken for the complete procedure.
        """
        print("=== DR Test Scenarios ===\n")
        for scenario in self.TEST_SCENARIOS.values():
            print(f"[{scenario['name']}] Frequency: {scenario['frequency']}")
            self._print_steps(scenario["steps"], max_steps)
            print()

    def show_runbook(self, max_steps=5):
        """Print the "primary database down" runbook.

        Args:
            max_steps: Maximum number of steps to print, or ``None`` to print
                the whole runbook. When steps are left out, a final line
                states how many.
        """
        print("=== Runbook: Primary DB Down ===")
        runbook = self.RUNBOOK["primary_down"]
        print(f"Severity: {runbook['severity']}")
        self._print_steps(runbook["steps"], max_steps)

    def dr_metrics(self):
        """Print a sample metrics report.

        Every figure comes from ``random``; the report shows the layout only
        and its values change on each call.
        """
        print("\n=== DR Metrics ===")
        print(f" Last failover test: {random.randint(5, 30)} days ago")
        print(f" Last backup restore test: {random.randint(1, 7)} days ago")
        print(f" Measured RTO: {random.randint(2, 15)} minutes")
        print(f" Measured RPO: {random.randint(0, 5)} minutes")
        print(f" Backup success rate: {random.uniform(99, 100):.2f}%")

    @staticmethod
    def _print_steps(steps, max_steps):
        """Print up to *max_steps* steps and note how many were omitted."""
        # A negative limit is treated as zero rather than slicing from the end.
        shown = steps if max_steps is None else steps[:max(max_steps, 0)]
        for step in shown:
            print(f" {step}")
        hidden = len(steps) - len(shown)
        if hidden > 0:
            print(f" ... ({hidden} more step(s) not shown)")
# Demo driver: print the test scenarios, the runbook, and the sample metrics.
test = DRTesting()
test.show_tests()
test.show_runbook()
test.dr_metrics()
FAQ - คำถามที่พบบ่อย
Q: Supabase Cloud มี DR built-in ไหม?
A: Supabase Pro: daily backups, 7-day retention (point-in-time recovery เป็น add-on ที่ต้องเปิดใช้เพิ่ม); Supabase Enterprise: custom backup schedule, multi-region, dedicated support; Self-hosted: ต้อง setup DR เอง (backup, replication, failover) แนะนำ: Pro plan ขึ้นไปสำหรับ production — ได้ daily backups และเปิดใช้ PITR add-on ได้
Q: RPO และ RTO ควรเป็นเท่าไหร่?
A: RPO (Recovery Point Objective): ยอมเสียข้อมูลได้กี่นาที; RTO (Recovery Time Objective): ระบบ down ได้นานเท่าไหร่ — ทั่วไป: RPO < 1 hour, RTO < 4 hours; Critical: RPO < 5 min, RTO < 15 min; Mission-critical: RPO near-zero, RTO < 5 min ทั้งนี้ขึ้นกับ business impact — คำนวณจาก cost of downtime per hour
Q: Realtime reconnect อัตโนมัติไหม?
A: Supabase client library มี auto-reconnect built-in แต่: (1) subscription state อาจหาย → ต้อง re-subscribe (2) messages ระหว่าง disconnect หาย → ต้อง catch-up query; Best practice: implement catch-up logic ที่ client ด้วย timestamp-based query
Q: ต้อง test DR บ่อยแค่ไหน?
A: Backup restore: ทุกสัปดาห์ (automated); Failover test: ทุกเดือน (manual/automated); Full DR drill: ทุกไตรมาส (team exercise) อย่า trust backup ที่ไม่เคย test restore — "backup ที่ restore ไม่ได้ = ไม่มี backup"
