Ceph Storage Cluster Business Continuity คืออะไร
Ceph เป็น open-source distributed storage system ที่รองรับ Object Storage (S3-compatible), Block Storage (RBD) และ File System (CephFS) ในระบบเดียว ออกแบบมาให้ไม่มี single point of failure และ self-healing อัตโนมัติ Business Continuity Planning (BCP) สำหรับ Ceph Cluster คือการวางแผนให้ระบบ storage ทำงานต่อเนื่องได้แม้เกิดเหตุการณ์ไม่คาดคิด เช่น disk failure, node failure, datacenter outage หรือ disaster recovery scenarios บทความนี้อธิบาย Ceph architecture, BCP strategies และ Python automation tools
Ceph Architecture
# ceph_arch.py — Ceph storage architecture
import json
class CephArchitecture:
    """Reference tables describing Ceph daemons and the CRUSH placement map.

    Holds two class-level data tables (COMPONENTS, CRUSH_MAP) and
    pretty-prints them to stdout via show_components() / show_crush().
    """

    # Per-daemon summary: role, failure impact, and sizing recommendation.
    COMPONENTS = {
        "mon": {
            "name": "Monitor (MON)",
            "description": "เก็บ cluster map (OSD map, MON map, PG map) — ต้องมีจำนวนคี่ (3, 5, 7)",
            "failure": "ถ้า MON ล่ม > 50% → cluster ไม่ทำงาน",
            "recommendation": "3 MONs ขั้นต่ำ, 5 MONs สำหรับ production ใหญ่",
        },
        "osd": {
            "name": "OSD (Object Storage Daemon)",
            "description": "จัดเก็บข้อมูลจริง — 1 OSD ต่อ 1 disk, replicate data ระหว่าง OSDs",
            "failure": "ถ้า OSD ล่ม → Ceph re-replicate data อัตโนมัติ",
            "recommendation": "Minimum 3 OSDs สำหรับ replication factor 3",
        },
        "mgr": {
            "name": "Manager (MGR)",
            "description": "จัดการ cluster metrics, dashboard, modules — active/standby",
            "failure": "ถ้า active MGR ล่ม → standby takeover อัตโนมัติ",
            "recommendation": "2 MGRs (active + standby)",
        },
        "mds": {
            "name": "MDS (Metadata Server)",
            "description": "จัดการ metadata สำหรับ CephFS — ไม่จำเป็นถ้าใช้แค่ RBD/RGW",
            "failure": "ถ้า MDS ล่ม → standby MDS takeover",
            "recommendation": "2 MDS (active + standby) สำหรับ CephFS",
        },
        "rgw": {
            "name": "RGW (RADOS Gateway)",
            "description": "S3/Swift compatible object storage API",
            "failure": "Stateless — load balance ได้เลย",
            "recommendation": "2+ RGW instances behind load balancer",
        },
    }

    # CRUSH: deterministic placement algorithm — no central lookup table.
    CRUSH_MAP = {
        "what": "CRUSH (Controlled Replication Under Scalable Hashing) — algorithm กำหนดว่า data อยู่ที่ OSD ไหน",
        "hierarchy": "root → datacenter → rack → host → OSD",
        "failure_domain": "กำหนด failure domain: replicate ข้าม hosts, racks หรือ datacenters",
        "benefit": "ไม่ต้อง lookup table — คำนวณ placement ได้เลย",
    }

    def show_components(self):
        """Print every component's name, description, failure impact and
        sizing recommendation to stdout.
        """
        print("=== Ceph Components ===\n")
        # Dict keys are not needed for display — iterate values directly.
        for comp in self.COMPONENTS.values():
            print(f"[{comp['name']}]")
            print(f" {comp['description']}")
            print(f" Failure: {comp['failure']}")
            # Fix: the recommendation field was defined but never displayed.
            print(f" Recommendation: {comp['recommendation']}")
            print()

    def show_crush(self):
        """Print the CRUSH map facts as '[key] value' lines."""
        print("=== CRUSH Map ===")
        for key, val in self.CRUSH_MAP.items():
            print(f" [{key}] {val}")
# Demo: print the architecture overview when this script is executed.
arch = CephArchitecture()
arch.show_components()
arch.show_crush()
Business Continuity Strategies
# bcp.py — Business continuity strategies for Ceph
import json
class CephBCP:
    """Catalogue of Ceph business-continuity strategies and protection tiers.

    STRATEGIES maps a strategy id to its name, description, RPO/RTO and
    strategy-specific extras (config command, requirement, or use case —
    not every entry has every extra key). TIERS groups the strategies into
    three protection levels. Both are printed to stdout by the show_*
    methods.
    """

    STRATEGIES = {
        "replication": {
            "name": "Data Replication (size=3)",
            "description": "เก็บ data 3 copies ข้าม failure domains (hosts/racks)",
            "rpo": "0 (synchronous replication)",
            "rto": "seconds — automatic recovery",
            "overhead": "3x storage capacity",
            "config": "ceph osd pool set mypool size 3 min_size 2",
        },
        "erasure_coding": {
            "name": "Erasure Coding (k+m)",
            "description": "แบ่ง data เป็น k chunks + m parity chunks — ทนได้ m failures",
            "rpo": "0",
            "rto": "seconds to minutes (rebuild time)",
            "overhead": "1.5x storage (k=4, m=2) vs 3x for replication",
            "config": "ceph osd erasure-code-profile set myprofile k=4 m=2",
        },
        "stretch_cluster": {
            "name": "Stretch Cluster (2 Datacenters)",
            "description": "Ceph cluster ข้าม 2 datacenters + arbiter node — synchronous replication",
            "rpo": "0",
            "rto": "minutes (automatic failover)",
            "requirement": "Low latency between DCs (< 10ms RTT)",
        },
        "rbd_mirroring": {
            "name": "RBD Mirroring (Async DR)",
            "description": "Mirror block devices ไปยัง remote cluster — async replication",
            "rpo": "seconds to minutes (ขึ้นกับ sync interval)",
            "rto": "minutes (manual failover)",
            "use_case": "DR สำหรับ VMs, databases, Kubernetes PVs",
        },
        "rgw_multisite": {
            "name": "RGW Multi-site (Object Replication)",
            "description": "Replicate objects ข้าม clusters/regions — active-active หรือ active-passive",
            "rpo": "seconds (async) หรือ 0 (sync zones)",
            "rto": "DNS failover — seconds to minutes",
            "use_case": "S3-compatible object storage DR",
        },
    }

    # Protection tiers: component HA → datacenter DR → backup/archive.
    TIERS = {
        "tier1": {
            "name": "Tier 1: High Availability",
            "description": "ป้องกัน component failure ภายใน cluster",
            "methods": "Replication size=3, CRUSH failure domain=host/rack, MON quorum",
        },
        "tier2": {
            "name": "Tier 2: Disaster Recovery",
            "description": "ป้องกัน datacenter-level failure",
            "methods": "Stretch cluster, RBD mirroring, RGW multi-site",
        },
        "tier3": {
            "name": "Tier 3: Backup & Archive",
            "description": "ป้องกัน data corruption, ransomware, human error",
            "methods": "RBD snapshots, RGW versioning, off-site backups",
        },
    }

    def show_strategies(self):
        """Print each strategy's name, RPO/RTO pair and description."""
        print("=== BCP Strategies ===\n")
        # Keys are not used in the output — iterate values directly.
        for s in self.STRATEGIES.values():
            print(f"[{s['name']}]")
            print(f" RPO: {s['rpo']} | RTO: {s['rto']}")
            print(f" {s['description']}")
            print()

    def show_tiers(self):
        """Print each protection tier with its description and methods."""
        print("=== Protection Tiers ===")
        for tier in self.TIERS.values():
            print(f"\n[{tier['name']}]")
            print(f" {tier['description']}")
            print(f" Methods: {tier['methods']}")
# Demo: print the BCP strategy catalogue and protection tiers.
bcp = CephBCP()
bcp.show_strategies()
bcp.show_tiers()
Python Monitoring & Automation
# monitoring.py — Python Ceph monitoring tools
import json
class CephMonitoring:
    """Carrier for an example Ceph monitoring script.

    CODE holds the full source of a CephClusterMonitor helper as a plain
    string (it is printed, never executed here); show_code() prints a
    truncated preview of it.
    """

    # Embedded example script. It shells out to the `ceph` CLI with JSON
    # output (`-f json`) and summarizes health, OSDs, pools, PGs and an
    # overall BCP readiness score. NOTE(review): it uses the deprecated
    # datetime.utcnow() — confirm before reusing on Python 3.12+.
    CODE = """
# ceph_monitor.py — Monitor Ceph cluster health
import subprocess
import json
from datetime import datetime
class CephClusterMonitor:
    def __init__(self):
        self.history = []
    def cluster_health(self):
        '''Get cluster health status'''
        try:
            result = subprocess.run(
                ['ceph', 'health', 'detail', '-f', 'json'],
                capture_output=True, text=True, timeout=10,
            )
            health = json.loads(result.stdout)
            return {
                'status': health.get('status', 'unknown'),
                'checks': list(health.get('checks', {}).keys()),
                'timestamp': datetime.utcnow().isoformat(),
            }
        except Exception as e:
            return {'status': 'error', 'error': str(e)}
    def osd_status(self):
        '''Get OSD tree and status'''
        result = subprocess.run(
            ['ceph', 'osd', 'tree', '-f', 'json'],
            capture_output=True, text=True, timeout=10,
        )
        tree = json.loads(result.stdout)
        osds = [n for n in tree.get('nodes', []) if n.get('type') == 'osd']
        up = sum(1 for o in osds if o.get('status') == 'up')
        down = sum(1 for o in osds if o.get('status') != 'up')
        return {
            'total_osds': len(osds),
            'up': up,
            'down': down,
            'health': 'ok' if down == 0 else 'degraded',
            'down_osds': [o['name'] for o in osds if o.get('status') != 'up'],
        }
    def pool_status(self):
        '''Get pool statistics'''
        result = subprocess.run(
            ['ceph', 'df', '-f', 'json'],
            capture_output=True, text=True, timeout=10,
        )
        data = json.loads(result.stdout)
        pools = []
        for pool in data.get('pools', []):
            stats = pool.get('stats', {})
            pools.append({
                'name': pool['name'],
                'stored_gb': round(stats.get('stored', 0) / 1e9, 2),
                'objects': stats.get('objects', 0),
                'used_pct': round(stats.get('percent_used', 0) * 100, 1),
            })
        total = data.get('stats', {})
        return {
            'total_capacity_tb': round(total.get('total_bytes', 0) / 1e12, 2),
            'used_tb': round(total.get('total_used_bytes', 0) / 1e12, 2),
            'available_tb': round(total.get('total_avail_bytes', 0) / 1e12, 2),
            'pools': pools,
        }
    def replication_status(self):
        '''Check data replication health'''
        result = subprocess.run(
            ['ceph', 'pg', 'stat', '-f', 'json'],
            capture_output=True, text=True, timeout=10,
        )
        pg_stat = json.loads(result.stdout)
        return {
            'num_pgs': pg_stat.get('num_pgs', 0),
            'active_clean': pg_stat.get('num_pg_by_state', []),
            'degraded_objects': pg_stat.get('degraded_objects', 0),
            'misplaced_objects': pg_stat.get('misplaced_objects', 0),
            'recovering': pg_stat.get('recovering_objects', 0),
        }
    def bcp_readiness(self):
        '''Assess BCP readiness'''
        health = self.cluster_health()
        osds = self.osd_status()
        pools = self.pool_status()
        checks = {
            'cluster_healthy': health['status'] == 'HEALTH_OK',
            'all_osds_up': osds['down'] == 0,
            'capacity_ok': pools['used_tb'] / max(pools['total_capacity_tb'], 1) < 0.8,
            'min_3_osds': osds['total_osds'] >= 3,
        }
        score = sum(checks.values()) / len(checks) * 100
        return {
            'bcp_score': round(score),
            'checks': checks,
            'recommendation': 'Ready' if score == 100 else 'Action needed',
        }
# monitor = CephClusterMonitor()
# health = monitor.cluster_health()
# bcp = monitor.bcp_readiness()
"""

    def show_code(self):
        """Print the first 600 characters of the embedded monitor script."""
        print("=== Ceph Monitor ===")
        # Truncated on purpose — the full script is long.
        print(self.CODE[:600])
# Demo: print a preview of the embedded monitoring script.
monitor = CephMonitoring()
monitor.show_code()
DR Runbook
# runbook.py — Disaster Recovery runbook
import json
class DRRunbook:
    """Disaster-recovery runbook for common Ceph failure scenarios.

    SCENARIOS maps a scenario id to its name, severity, whether recovery is
    automatic, the ordered operator steps, and the expected RTO.
    show_scenarios() prints a summary (only the first three steps of each
    scenario) to stdout.
    """

    SCENARIOS = {
        "disk_failure": {
            "name": "Disk/OSD Failure",
            "severity": "Low",
            "auto_recovery": True,
            "steps": [
                "1. Ceph detects OSD down → marks out after 600s",
                "2. Automatic rebalancing starts → data re-replicated",
                "3. Monitor recovery: ceph -w",
                "4. Replace failed disk → add new OSD",
                "5. Verify: ceph health detail",
            ],
            "rto": "Automatic — minutes to hours (ขึ้นกับ data size)",
        },
        "node_failure": {
            "name": "Node/Host Failure",
            "severity": "Medium",
            "auto_recovery": True,
            "steps": [
                "1. Multiple OSDs down → Ceph marks out",
                "2. Check MON quorum: ceph quorum_status",
                "3. Automatic rebalancing if CRUSH failure domain = host",
                "4. Repair/replace node → rejoin cluster",
                "5. Verify data integrity: ceph pg repair",
            ],
            "rto": "Minutes (automatic) — hours for full recovery",
        },
        "datacenter_outage": {
            "name": "Datacenter Outage",
            "severity": "Critical",
            "auto_recovery": False,
            "steps": [
                "1. Assess: ดู MON quorum ว่ายังมีหรือไม่",
                "2. Stretch cluster: automatic failover ถ้า quorum ยังอยู่",
                "3. RBD mirroring: promote remote images (rbd mirror image promote)",
                "4. RGW multisite: switch DNS ไป secondary zone",
                "5. Verify applications connect to DR site",
                "6. When primary DC recovered: resync data back",
            ],
            "rto": "Minutes (stretch cluster) — hours (manual DR)",
        },
    }

    def show_scenarios(self):
        """Print each scenario's header, recovery mode, RTO and the first
        three runbook steps.
        """
        print("=== DR Scenarios ===\n")
        # Scenario ids are not shown — iterate values directly.
        for s in self.SCENARIOS.values():
            print(f"[{s['name']}] Severity: {s['severity']}")
            print(f" Auto Recovery: {s['auto_recovery']}")
            print(f" RTO: {s['rto']}")
            # Summary view: only the first 3 steps; see SCENARIOS for all.
            for step in s['steps'][:3]:
                print(f" {step}")
            print()
# Demo: print the DR scenario summaries.
runbook = DRRunbook()
runbook.show_scenarios()
Testing & Validation
# testing.py — BCP testing for Ceph
import json
class BCPTesting:
    """Catalogue of BCP validation tests for a Ceph cluster.

    TESTS maps a test id to its name, procedure, example command,
    verification criteria and recommended frequency. show_tests() prints
    a summary (name, frequency, procedure, verification) to stdout; the
    example commands are kept in the table but not printed.
    """

    TESTS = {
        "osd_failure": {
            "name": "OSD Failure Test",
            "procedure": "Stop 1 OSD → verify cluster recovers → restart OSD",
            "command": "systemctl stop ceph-osd@0",
            "verify": "ceph health detail → HEALTH_WARN → HEALTH_OK after recovery",
            "frequency": "Monthly",
        },
        "node_failure": {
            "name": "Node Failure Test",
            "procedure": "Shutdown 1 node → verify cluster still serves data",
            "command": "ssh node1 'shutdown -h now'",
            "verify": "ceph -s → degraded but serving, applications working",
            "frequency": "Quarterly",
        },
        "dr_failover": {
            "name": "DR Failover Test",
            "procedure": "Simulate DC failure → failover to DR → verify",
            "command": "rbd mirror image promote pool/image --force",
            "verify": "Applications connect to DR site, data accessible",
            "frequency": "Semi-annually",
        },
        "backup_restore": {
            "name": "Backup Restore Test",
            "procedure": "Restore from backup → verify data integrity",
            "command": "rbd snap rollback pool/image@snapshot",
            "verify": "Data matches original, checksums correct",
            "frequency": "Quarterly",
        },
    }

    def show_tests(self):
        """Print each test's name, frequency, procedure and verification."""
        print("=== BCP Tests ===\n")
        # Test ids are not shown — iterate values directly.
        for test in self.TESTS.values():
            print(f"[{test['name']}] ({test['frequency']})")
            print(f" Procedure: {test['procedure']}")
            print(f" Verify: {test['verify']}")
            print()
# Demo: print the BCP test catalogue.
testing = BCPTesting()
testing.show_tests()
FAQ - คำถามที่พบบ่อย
Q: Ceph เหมาะกับ production ไหม?
A: เหมาะมาก — ใช้โดย CERN, Bloomberg, Deutsche Telekom, OVHcloud, DigitalOcean ข้อดี: open-source, self-healing, scale-out, unified storage (block+object+file) ข้อควรระวัง: ต้องมีทีมที่เข้าใจ Ceph, hardware requirements สูง (10GbE+, SSD สำหรับ WAL/DB) Minimum: 3 nodes, 3 MONs, 9+ OSDs สำหรับ production
Q: Replication กับ Erasure Coding อันไหนดีกว่า?
A: Replication (size=3): เร็วกว่าทั้ง read/write, recovery เร็ว, แต่ใช้ capacity 3x Erasure Coding (k=4, m=2): ประหยัด capacity (1.5x), แต่ write ช้ากว่า, recovery ช้ากว่า เลือก Replication: hot data, databases, VMs (block storage) เลือก EC: cold data, archives, large objects (object storage) Best practice: Replication สำหรับ RBD pools, EC สำหรับ RGW data pools
Q: Ceph ต้องใช้กี่ nodes ขั้นต่ำ?
A: Development: 1 node (ไม่แนะนำ production) Minimum production: 3 nodes (MON quorum + replication size 3) Recommended: 5+ nodes (MON quorum 5, แยก failure domains) Large scale: 10-100+ nodes แนะนำ: แยก MON/MGR nodes จาก OSD nodes ใน production ใหญ่
Q: RPO=0 เป็นไปได้ไหมกับ Ceph?
A: ได้ — ภายใน cluster: synchronous replication (size=3) → RPO=0 ข้าม datacenters: stretch cluster → RPO=0 (ต้อง latency < 10ms) RBD mirroring: RPO = seconds (async) — ไม่ใช่ 0 สำคัญ: RPO=0 ข้าม DC ต้อง low latency — ถ้า latency สูง performance จะลดลงมาก tradeoff: RPO=0 = ช้ากว่า (sync write ทั้ง 2 DCs), RPO > 0 = เร็วกว่า (async)
