Ceph Storage Cluster Capacity Planning — วางแผน Storage ด้วย Ceph
Ceph Storage Cluster คืออะไร
Ceph เป็น open source distributed storage system ที่ให้บริการ object storage, block storage และ file storage บน single unified platform ออกแบบมาเพื่อ high availability, scalability และ self-healing ใช้ CRUSH algorithm สำหรับ data placement ไม่มี single point of failure
Components หลักของ Ceph ได้แก่ MON (Monitor) จัดการ cluster map และ consensus ต้องมีอย่างน้อย 3 nodes, OSD (Object Storage Daemon) เก็บข้อมูลจริง แต่ละ OSD ดูแล disk หนึ่งลูก, MDS (Metadata Server) จัดการ metadata สำหรับ CephFS, MGR (Manager) จัดการ monitoring, dashboard และ orchestration, RGW (RADOS Gateway) ให้บริการ S3/Swift compatible object storage
Capacity Planning สำหรับ Ceph สำคัญมากเพราะ replication factor กระทบ usable capacity โดยตรง (3x replication ใช้ raw space 3 เท่า), OSD placement groups ต้อง size ถูกต้องเพื่อ performance ที่ดี, network bandwidth ต้องเพียงพอสำหรับ replication traffic, recovery จาก disk failure ต้องมี spare capacity และ growth projection ต้องวางแผนล่วงหน้า 6-12 เดือน
ติดตั้ง Ceph Cluster
วิธีติดตั้ง Ceph ด้วย cephadm
# === Install a Ceph cluster with cephadm ===
# Prerequisites:
# - 3+ nodes with Ubuntu 22.04 or Rocky Linux 9
# - Each node: 4+ CPU cores, 8+ GB RAM, 1+ SSD/NVMe for OSD
# - Network: 10GbE recommended (minimum 1GbE)
# - Separate public and cluster networks recommended

# 1. Install cephadm on first node
curl --silent --remote-name --location https://download.ceph.com/rpm-reef/el9/noarch/cephadm
chmod +x cephadm
./cephadm add-repo --release reef
./cephadm install

# 2. Bootstrap cluster
# NOTE: MyDashP@ss is a placeholder; replace it before running.
cephadm bootstrap \
  --mon-ip 10.0.1.10 \
  --cluster-network 10.0.2.0/24 \
  --dashboard-password-noupdate \
  --initial-dashboard-password MyDashP@ss
# Dashboard: https://10.0.1.10:8443
# Default user: admin

# 3. Add hosts
# Install the CLUSTER's public SSH key (written by bootstrap) on the other
# nodes first; cephadm connects with this key, not with the operator's own.
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph-node2
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph-node3
ceph orch host add ceph-node2 10.0.1.11
ceph orch host add ceph-node3 10.0.1.12

# 4. Add OSDs (auto-detect available disks)
ceph orch apply osd --all-available-devices
# Or specific devices
ceph orch daemon add osd ceph-node1:/dev/sdb
ceph orch daemon add osd ceph-node2:/dev/sdb
ceph orch daemon add osd ceph-node3:/dev/sdb

# 5. Set MON count
ceph orch apply mon --placement="3 ceph-node1 ceph-node2 ceph-node3"

# 6. Enable features
ceph mgr module enable dashboard
ceph mgr module enable prometheus
ceph mgr module enable pg_autoscaler

# 7. Create pools
ceph osd pool create rbd-pool 128
ceph osd pool set rbd-pool size 3
ceph osd pool set rbd-pool min_size 2
ceph osd pool application enable rbd-pool rbd
# Create CephFS
ceph fs volume create cephfs
# Create RGW (S3)
ceph orch apply rgw myrgw --placement="2 ceph-node1 ceph-node2" --port=7480

# 8. Verify cluster
ceph -s
# cluster:
# id: abc123
# health: HEALTH_OK
# services:
# mon: 3 daemons
# mgr: 2 active
# osd: 9 osds: 9 up, 9 in
# rgw: 2 daemons active
ceph osd tree
ceph df
echo "Ceph cluster installed"
Capacity Planning และ Sizing
เครื่องมือคำนวณ capacity
#!/usr/bin/env python3
# ceph_capacity_planner.py — Ceph Cluster Capacity Planning
import json
import math
import logging
from typing import Dict, List
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
@dataclass
class DiskSpec:
    """Specification of one data disk; the planner counts each disk as one OSD."""

    size_tb: float  # disk capacity in TB (summed into the raw capacity)
    type: str  # hdd, ssd, nvme
    rpm: int = 0  # HDD only
    iops: int = 0
    throughput_mbps: int = 0
@dataclass
class NodeSpec:
    """Specification of one storage node and the disks it contributes."""

    hostname: str
    cpu_cores: int
    ram_gb: int
    disks: List[DiskSpec]  # every entry is counted as one OSD by the planner
    network_gbps: float = 10
class CephCapacityPlanner:
    """Estimate raw/usable capacity, PG counts and growth for a Ceph cluster.

    Nodes are registered with ``add_node``; every disk of every node is
    counted as one OSD.
    """

    # Disk classes that are always tracked, in reporting order. Any other
    # ``DiskSpec.type`` value is added on demand instead of raising KeyError.
    _KNOWN_DISK_TYPES = ("hdd", "ssd", "nvme")

    def __init__(self):
        self.nodes: List["NodeSpec"] = []

    def add_node(self, node: "NodeSpec") -> None:
        """Register one storage node (and its disks) with the planner."""
        self.nodes.append(node)

    def calculate_raw_capacity(self) -> dict:
        """Sum the raw disk capacity of all registered nodes.

        Returns:
            Dict with ``total_raw_tb``, ``by_type`` (only disk types that
            actually contribute capacity), ``osd_count`` and ``node_count``.
        """
        total_tb = 0
        by_type = dict.fromkeys(self._KNOWN_DISK_TYPES, 0)
        osd_count = 0
        for node in self.nodes:
            for disk in node.disks:
                total_tb += disk.size_tb
                # .get() keeps unexpected disk types from raising KeyError.
                by_type[disk.type] = by_type.get(disk.type, 0) + disk.size_tb
                osd_count += 1
        return {
            "total_raw_tb": round(total_tb, 1),
            "by_type": {k: round(v, 1) for k, v in by_type.items() if v > 0},
            "osd_count": osd_count,
            "node_count": len(self.nodes),
        }

    def calculate_usable_capacity(self, replication_factor=3,
                                  overhead_pct=5, max_fill_pct=85) -> dict:
        """Derive usable capacity from raw capacity.

        Args:
            replication_factor: Number of copies kept of every object.
            overhead_pct: Share reserved for overhead (BlueStore, metadata).
            max_fill_pct: Highest fill level the cluster should ever reach.

        Returns:
            Dict with every intermediate figure plus ``usable_tb`` and
            ``efficiency_pct`` (usable as a percentage of raw; 0.0 for an
            empty cluster).

        Raises:
            ValueError: If ``replication_factor`` is not positive.
        """
        if replication_factor <= 0:
            raise ValueError("replication_factor must be greater than 0")
        raw = self.calculate_raw_capacity()
        raw_tb = raw["total_raw_tb"]
        # After replication
        after_replication = raw_tb / replication_factor
        # Reserved for overhead (BlueStore, metadata)
        after_overhead = after_replication * (1 - overhead_pct / 100)
        # Max usable (never fill above max_fill_pct)
        usable = after_overhead * (max_fill_pct / 100)
        # An empty cluster has no meaningful efficiency; avoid dividing by 0.
        efficiency = usable / raw_tb * 100 if raw_tb else 0.0
        return {
            "raw_tb": round(raw_tb, 1),
            "replication_factor": replication_factor,
            "after_replication_tb": round(after_replication, 1),
            "overhead_pct": overhead_pct,
            "after_overhead_tb": round(after_overhead, 1),
            "max_fill_pct": max_fill_pct,
            "usable_tb": round(usable, 1),
            "efficiency_pct": round(efficiency, 1),
            "osd_count": raw["osd_count"],
        }

    def calculate_pg_count(self, pool_pct_of_data=100, osd_count=None,
                           replication_factor=3, target_pgs_per_osd=100) -> dict:
        """Recommend a placement-group count for one pool.

        Args:
            pool_pct_of_data: Share of the cluster's data held by this pool.
            osd_count: OSD count to size for; defaults to the registered disks.
            replication_factor: Number of copies kept by the pool.
            target_pgs_per_osd: Desired number of PGs per OSD.

        Returns:
            Dict with the inputs, ``ideal_pgs`` and ``recommended_pg_count``
            (``ideal_pgs`` rounded UP to a power of two, at least 1).

        Raises:
            ValueError: If ``replication_factor`` is not positive.
        """
        if replication_factor <= 0:
            raise ValueError("replication_factor must be greater than 0")
        if osd_count is None:
            osd_count = self.calculate_raw_capacity()["osd_count"]
        # Formula: (target_pgs_per_osd * osd_count * pool_pct / 100) / replication_factor
        ideal = (target_pgs_per_osd * osd_count * pool_pct_of_data / 100) / replication_factor
        # Round up to the next power of 2 using integer arithmetic.
        pg_count = 1 << (math.ceil(max(ideal, 1)) - 1).bit_length()
        return {
            "osd_count": osd_count,
            "pool_data_pct": pool_pct_of_data,
            "replication_factor": replication_factor,
            "ideal_pgs": round(ideal),
            "recommended_pg_count": pg_count,
        }

    def forecast_growth(self, current_usage_tb, monthly_growth_tb,
                        months=12) -> dict:
        """Project linear growth against the usable capacity.

        Utilization is measured against ``usable_tb`` from
        ``calculate_usable_capacity()`` with its default arguments, i.e. a
        figure that is already reduced by overhead and the max-fill limit.

        Args:
            current_usage_tb: Data stored today, in TB.
            monthly_growth_tb: Expected growth per month, in TB.
            months: Number of months to project.

        Returns:
            Dict with the inputs, one ``forecast`` entry per month and
            ``months_until_85pct`` (first month whose status is CRITICAL,
            or None if the threshold is not reached in the window).

        Raises:
            ValueError: If the cluster has no usable capacity.
        """
        capacity = self.calculate_usable_capacity()
        usable = capacity["usable_tb"]
        if usable <= 0:
            raise ValueError("cannot forecast growth: cluster has no usable capacity")
        forecast = []
        months_until_full = None
        for month in range(1, months + 1):
            projected = current_usage_tb + (monthly_growth_tb * month)
            utilization = projected / usable * 100
            if utilization >= 85:
                status = "CRITICAL"
                # Decide on the unrounded value so this always agrees with
                # the per-month status.
                if months_until_full is None:
                    months_until_full = month
            elif utilization >= 70:
                status = "WARNING"
            else:
                status = "OK"
            forecast.append({
                "month": month,
                "projected_tb": round(projected, 1),
                "utilization_pct": round(utilization, 1),
                "free_tb": round(usable - projected, 1),
                "status": status,
            })
        return {
            "current_usage_tb": current_usage_tb,
            "usable_capacity_tb": usable,
            "monthly_growth_tb": monthly_growth_tb,
            "months_until_85pct": months_until_full,
            "forecast": forecast,
        }

    def recommend_hardware(self, target_usable_tb, workload="mixed") -> dict:
        """Suggest a node/disk layout that reaches a target usable capacity.

        Args:
            target_usable_tb: Usable capacity to reach, in TB.
            workload: "archive", "mixed" or "performance"; any other value
                falls back to the "mixed" profile.

        Returns:
            Dict with the requested ``workload``, the target and a
            ``recommendation`` made of whole nodes of the chosen profile.
        """
        configs = {
            "archive": {
                "disk": DiskSpec(18, "hdd", rpm=7200, iops=100, throughput_mbps=200),
                "disks_per_node": 12, "nodes_min": 3, "replication": 3,
            },
            "mixed": {
                "disk": DiskSpec(4, "ssd", iops=50000, throughput_mbps=500),
                "disks_per_node": 6, "nodes_min": 3, "replication": 3,
            },
            "performance": {
                "disk": DiskSpec(2, "nvme", iops=200000, throughput_mbps=3000),
                "disks_per_node": 4, "nodes_min": 3, "replication": 3,
            },
        }
        config = configs.get(workload, configs["mixed"])
        # Sizing assumes 80% of the post-replication space is usable.
        fill_ratio = 0.80
        raw_needed = target_usable_tb * config["replication"] / fill_ratio
        disks_needed = math.ceil(raw_needed / config["disk"].size_tb)
        nodes_needed = max(
            math.ceil(disks_needed / config["disks_per_node"]),
            config["nodes_min"],
        )
        # Nodes are always fully populated, so actual capacity may exceed the target.
        actual_disks = nodes_needed * config["disks_per_node"]
        actual_raw = actual_disks * config["disk"].size_tb
        actual_usable = actual_raw / config["replication"] * fill_ratio
        return {
            "workload": workload,
            "target_usable_tb": target_usable_tb,
            "recommendation": {
                "nodes": nodes_needed,
                "disks_per_node": config["disks_per_node"],
                "total_disks": actual_disks,
                "disk_type": config["disk"].type,
                "disk_size_tb": config["disk"].size_tb,
                "raw_capacity_tb": round(actual_raw, 1),
                "usable_capacity_tb": round(actual_usable, 1),
                "replication": config["replication"],
            },
        }
# Demo: three identical nodes, each with six 4 TB SSDs on a 25 Gbps network.
planner = CephCapacityPlanner()
for node_number in range(1, 4):
    ssd_disks = [DiskSpec(4, "ssd", iops=50000) for _ in range(6)]
    demo_node = NodeSpec(
        hostname=f"ceph-node{node_number}",
        cpu_cores=16,
        ram_gb=64,
        disks=ssd_disks,
        network_gbps=25,
    )
    planner.add_node(demo_node)

# Print each report as indented JSON, one after another.
raw_report = planner.calculate_raw_capacity()
print("Raw:", json.dumps(raw_report, indent=2))

usable_report = planner.calculate_usable_capacity()
print("Usable:", json.dumps(usable_report, indent=2))

pg_report = planner.calculate_pg_count(pool_pct_of_data=80)
print("PGs:", json.dumps(pg_report, indent=2))

growth_report = planner.forecast_growth(10, 2, months=12)
print("Forecast:", json.dumps(growth_report, indent=2))

hardware_report = planner.recommend_hardware(50, "mixed")
print("Recommend:", json.dumps(hardware_report, indent=2))
Monitoring Cluster Health
Monitor Ceph cluster health และ performance
#!/usr/bin/env python3
# ceph_monitor.py — Ceph Cluster Health Monitoring
import subprocess
import json
import logging
from datetime import datetime
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ceph_mon")
class CephMonitor:
    """Collect cluster, OSD and pool information through the ``ceph`` CLI."""

    def __init__(self):
        self.history = []

    def _run_ceph(self, *args):
        """Run ``ceph --format json <args>`` and return the parsed output.

        Best effort: any failure (binary missing, non-zero exit status,
        output that is not JSON) is logged and an empty dict is returned.
        """
        cmd = ["ceph", "--format", "json", *args]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:
            # e.g. the ceph binary is not installed on this host
            logger.error(f"ceph command could not be started: {exc}")
            return {}
        if result.returncode != 0:
            logger.error(f"ceph command failed: {result.stderr}")
            return {}
        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError as exc:
            logger.error(f"ceph command returned invalid JSON: {exc}")
            return {}

    def get_cluster_status(self):
        """Summarise ``ceph status``: health, OSD counts, PGs, storage and IO."""
        status = self._run_ceph("status")
        health = status.get("health", {})
        # Older releases nest the counters as osdmap.osdmap; newer ones put
        # them directly under osdmap. Accept both layouts.
        osd_section = status.get("osdmap", {})
        osd_map = osd_section.get("osdmap", osd_section)
        pg_map = status.get("pgmap", {})

        # "pgs_by_state" is the key in `ceph status` JSON; the key the code
        # used previously is kept as a fallback.
        pg_states = pg_map.get("pgs_by_state") or pg_map.get("num_pgs_by_state") or []
        # Count every state containing both flags (e.g. active+clean+scrubbing),
        # not merely whichever state happens to be listed first.
        active_clean = sum(
            state.get("count", 0)
            for state in pg_states
            if {"active", "clean"} <= set(state.get("state_name", "").split("+"))
        )

        bytes_total = pg_map.get("bytes_total", 0)
        bytes_used = pg_map.get("bytes_used", 0)
        return {
            "timestamp": datetime.utcnow().isoformat(),
            "health": health.get("status", "UNKNOWN"),
            "health_checks": list(health.get("checks", {}).keys()),
            "osds": {
                "total": osd_map.get("num_osds", 0),
                "up": osd_map.get("num_up_osds", 0),
                "in": osd_map.get("num_in_osds", 0),
            },
            "pgs": {
                "total": pg_map.get("num_pgs", 0),
                "active_clean": active_clean,
            },
            "storage": {
                "total_bytes": bytes_total,
                "used_bytes": bytes_used,
                "available_bytes": pg_map.get("bytes_avail", 0),
                # max(..., 1) guards against division by zero
                "used_pct": round(bytes_used / max(bytes_total, 1) * 100, 1),
            },
            "io": {
                "read_bytes_sec": pg_map.get("read_bytes_sec", 0),
                "write_bytes_sec": pg_map.get("write_bytes_sec", 0),
                "read_op_sec": pg_map.get("read_op_per_sec", 0),
                "write_op_sec": pg_map.get("write_op_per_sec", 0),
            },
        }

    def get_osd_status(self):
        """Return per-OSD usage from ``ceph osd df``, fullest OSD first.

        An OSD is "critical" above 85% utilization, "warning" above 75%.
        """
        osds = self._run_ceph("osd", "df")
        osd_list = []
        for osd in osds.get("nodes", []):
            utilization = osd.get("utilization", 0)
            if utilization > 85:
                status = "critical"
            elif utilization > 75:
                status = "warning"
            else:
                status = "ok"
            osd_list.append({
                "id": osd.get("id"),
                "name": osd.get("name"),
                "total_kb": osd.get("kb"),
                "used_kb": osd.get("kb_used"),
                "available_kb": osd.get("kb_avail"),
                "utilization_pct": round(utilization, 1),
                "status": status,
            })
        return sorted(osd_list, key=lambda x: x["utilization_pct"], reverse=True)

    def get_pool_stats(self):
        """Return per-pool statistics from ``ceph osd pool stats``.

        Returns an empty list when the command fails or does not yield a list.
        """
        pools = self._run_ceph("osd", "pool", "stats")
        if not isinstance(pools, list):
            return []
        pool_list = []
        for pool in pools:
            # Client throughput is reported under "client_io_rate"; the flat
            # keys read previously are kept as a fallback.
            io_rate = pool.get("client_io_rate") or {}
            pool_list.append({
                "name": pool.get("pool_name"),
                "id": pool.get("pool_id"),
                # NOTE(review): "stored"/"objects" look like `ceph df detail`
                # fields and may always be 0 here; confirm on a live cluster.
                "stored_bytes": pool.get("stored", 0),
                "objects": pool.get("objects", 0),
                "read_bytes_sec": io_rate.get("read_bytes_sec", pool.get("rd_bytes", 0)),
                "write_bytes_sec": io_rate.get("write_bytes_sec", pool.get("wr_bytes", 0)),
            })
        return pool_list

    def health_check(self):
        """Combine cluster and OSD status into a list of human-readable issues.

        Returns:
            Dict with ``timestamp``, ``healthy`` (True when no issue was
            found), ``issues`` and the full cluster ``summary``.
        """
        status = self.get_cluster_status()
        osds = self.get_osd_status()
        issues = []
        if status["health"] != "HEALTH_OK":
            issues.append(f"Cluster health: {status['health']}")
            for check in status["health_checks"]:
                issues.append(f" - {check}")
        down_osds = status["osds"]["total"] - status["osds"]["up"]
        if down_osds > 0:
            issues.append(f"{down_osds} OSDs are down")
        if status["storage"]["used_pct"] > 80:
            issues.append(f"Storage usage: {status['storage']['used_pct']}%")
        for osd in osds:
            if osd["status"] == "critical":
                issues.append(f"OSD {osd['name']} at {osd['utilization_pct']}%")
        return {
            "timestamp": status["timestamp"],
            "healthy": len(issues) == 0,
            "issues": issues,
            "summary": status,
        }
# monitor = CephMonitor()
# health = monitor.health_check()
# print(json.dumps(health, indent=2))
Performance Tuning และ Optimization
ปรับแต่ง Ceph performance
# === Ceph Performance Tuning ===

# 1. BlueStore Tuning
# Dedicated WAL/DB on NVMe for HDD OSDs
ceph osd set-full-ratio 0.90
ceph osd set-nearfull-ratio 0.80
ceph osd set-backfillfull-ratio 0.85

# Per-OSD settings in ceph.conf (configuration fragment, not shell)
# [osd]
# bluestore_cache_size = 4294967296 # 4GB cache per OSD
# bluestore_cache_autotune = true
# bluestore_cache_meta_ratio = 0.4
# bluestore_cache_kv_ratio = 0.4
# bluestore_cache_data_ratio = 0.2
# # Async IO
# bdev_async_discard = true
# bdev_enable_discard = true
# # Recovery tuning (balance between recovery speed and client IO)
# osd_recovery_max_active = 3
# osd_recovery_max_active_hdd = 1
# osd_recovery_max_active_ssd = 10
# osd_max_backfills = 1
# osd_recovery_sleep = 0
# osd_recovery_priority = 5

# 2. Network Tuning
# Separate public and cluster networks (ceph.conf fragment, not shell)
# [global]
# public_network = 10.0.1.0/24
# cluster_network = 10.0.2.0/24
# ms_type = async+posix
# ms_async_op_threads = 5

# System network tuning
sudo sysctl -w net.core.rmem_max=67108864
sudo sysctl -w net.core.wmem_max=67108864
sudo sysctl -w net.core.rmem_default=33554432
sudo sysctl -w net.core.wmem_default=33554432
sudo sysctl -w net.ipv4.tcp_rmem="4096 87380 33554432"
sudo sysctl -w net.ipv4.tcp_wmem="4096 65536 33554432"

# 3. PG Autoscaler
ceph mgr module enable pg_autoscaler
ceph osd pool set rbd-pool pg_autoscale_mode on
# Check PG status
ceph osd pool autoscale-status

# 4. Pool Optimization
# Set compression for suitable pools
ceph osd pool set archive-pool compression_algorithm snappy
ceph osd pool set archive-pool compression_mode aggressive
ceph osd pool set archive-pool compression_required_ratio 0.875

# 5. Benchmark
# RADOS bench (sequential write); --no-cleanup keeps the objects for the read tests
rados bench -p rbd-pool 60 write --no-cleanup
# RADOS bench (sequential read)
rados bench -p rbd-pool 60 seq
# RADOS bench (random read)
rados bench -p rbd-pool 60 rand
# Remove the benchmark objects left behind by --no-cleanup
rados -p rbd-pool cleanup

# RBD bench
rbd create test-image --size 10G --pool rbd-pool
rbd bench test-image --io-type write --io-size 4K --io-threads 16 --pool rbd-pool
rbd bench test-image --io-type read --io-size 4K --io-threads 16 --pool rbd-pool

# fio with RBD (fio job file, not shell)
# [global]
# ioengine=rbd
# clientname=admin
# pool=rbd-pool
# rbdname=test-image
# rw=randread
# bs=4k
# numjobs=4
# iodepth=32
# runtime=60

echo "Ceph tuning complete"
Scaling และ Maintenance
ขยาย cluster และ maintenance
# === Ceph Cluster Scaling ===

# 1. Add New OSD Host
# Add host to cluster
ceph orch host add ceph-node4 10.0.1.13
# Add OSDs on new host
ceph orch daemon add osd ceph-node4:/dev/sdb
ceph orch daemon add osd ceph-node4:/dev/sdc
ceph orch daemon add osd ceph-node4:/dev/sdd
# Monitor rebalancing (runs until interrupted with Ctrl-C)
ceph -w
# Watch PG states: active+remapped, backfilling, recovering
# Wait until all PGs are active+clean

# 2. Remove OSD (Graceful)
# Mark OSD out (starts rebalancing)
ceph osd out osd.5
# Wait for rebalancing
while ! ceph osd safe-to-destroy osd.5; do
  echo "Waiting for safe-to-destroy..."
  sleep 30
done
# Stop and remove (the orchestrator refuses to remove an OSD daemon without --force)
ceph orch daemon rm osd.5 --force
ceph osd purge osd.5 --yes-i-really-mean-it

# 3. Replace Failed Disk
# Identify failed OSD
ceph osd tree | grep down
ceph health detail
# Remove failed OSD
ceph osd out osd.3
ceph osd purge osd.3 --yes-i-really-mean-it
# Replace disk physically, then add new OSD
ceph orch daemon add osd ceph-node2:/dev/sdc

# 4. Maintenance Window
# Set noout flag (prevent rebalancing during maintenance)
ceph osd set noout
# Perform maintenance...
# Restart services, update firmware, etc.
# cephadm-managed daemons are restarted through the orchestrator; their
# systemd units are not named ceph-osd@<id>.
ceph orch daemon restart osd.3
# Unset flag after maintenance
ceph osd unset noout

# 5. Upgrade Ceph
# Check current version
ceph versions
# Set container image for upgrade
ceph orch upgrade start --image quay.io/ceph/ceph:v18.2.2
# Monitor upgrade progress
ceph orch upgrade status

# 6. Backup Strategies
# RBD snapshots
rbd snap create rbd-pool/myimage@snap1
rbd snap ls rbd-pool/myimage
# Export snapshot
rbd export rbd-pool/myimage@snap1 /backup/myimage-snap1.img
# Incremental export
rbd export-diff rbd-pool/myimage@snap1 --from-snap snap0 /backup/myimage-diff.img
# CephFS snapshots
mkdir /mnt/cephfs/.snap/daily-$(date +%Y%m%d)

# 7. Monitoring Alerts
# Prometheus alerting rules for Ceph (YAML rule file, not shell)
# groups:
#   - name: ceph
#     rules:
#       - alert: CephHealthWarning
#         expr: ceph_health_status == 1
#         for: 5m
#       - alert: CephHealthError
#         expr: ceph_health_status == 2
#         for: 1m
#       - alert: CephOSDDown
#         expr: count(ceph_osd_up == 0) > 0
#         for: 5m
#       - alert: CephStorageFull
#         expr: ceph_cluster_total_used_bytes / ceph_cluster_total_bytes > 0.80
#         for: 10m

echo "Ceph scaling and maintenance documented"
FAQ คำถามที่พบบ่อย
Q: Ceph ต้องการ hardware อะไรบ้าง?
A: ขั้นต่ำสำหรับ production 3 nodes แต่ละ node ต้องมี CPU 4+ cores (แนะนำ 1 core ต่อ OSD), RAM 4GB base + 5GB ต่อ OSD (เช่น 6 OSDs ต้อง 34GB), 1 SSD/NVMe ต่อ OSD สำหรับ data (หรือ HDD กับ NVMe WAL/DB), network 10GbE ขั้นต่ำ (25GbE สำหรับ NVMe clusters) SSD สำหรับ MON และ MGR (ไม่ต้องใหญ่ 50-100GB พอ) สำหรับ production จริงจัง แยก public network และ cluster network
Q: Replication 3 กับ Erasure Coding ต่างกันอย่างไร?
A: Replication 3 เก็บ data 3 copies ใช้ raw space 3 เท่า แต่ recovery เร็ว read performance ดี (อ่านจาก copy ไหนก็ได้) เหมาะสำหรับ hot data ที่ต้องการ low latency Erasure Coding (เช่น k=4, m=2) ใช้ raw space 1.5 เท่า ประหยัดกว่ามาก แต่ write latency สูงกว่า, recovery ช้ากว่า, ใช้ CPU มากกว่า เหมาะสำหรับ cold/archive data ที่ต้องการ capacity efficiency
Q: PG count ตั้งเท่าไหร่ดี?
A: ใช้สูตร (target_pgs_per_osd * total_osds * pool_pct) / replication_factor แล้ว round up เป็น power of 2 เช่น 18 OSDs, replication 3, pool ใช้ 80% ของ data = (100 * 18 * 0.8) / 3 = 480 ปัดเป็น 512 PGs ต่ำกว่า 100 PGs per OSD จะเสีย performance สูงกว่า 200 PGs per OSD จะใช้ memory มาก Ceph ตั้งแต่ Nautilus เป็นต้นมา (รวมถึง Reef) มี pg_autoscaler ที่ปรับ PG count ให้อัตโนมัติ แนะนำเปิดใช้
Q: Ceph เหมาะกับ workload ประเภทไหน?
A: เหมาะมากสำหรับ object storage (S3 compatible) ใช้แทน MinIO สำหรับ large scale, block storage สำหรับ VMs (Proxmox, OpenStack), Kubernetes persistent volumes (Rook-Ceph), backup storage และ data lake ไม่เหมาะสำหรับ workloads ที่ต้องการ ultra-low latency (ใช้ local NVMe แทน), small clusters น้อยกว่า 3 nodes, single-tenant database ที่ต้องการ consistent latency