Ceph Storage Cluster คืออะไร
Ceph เป็น open source distributed storage system ที่ให้บริการ object storage, block storage และ file storage บน single unified platform ออกแบบมาเพื่อ high availability, scalability และ self-healing ใช้ CRUSH algorithm สำหรับ data placement ไม่มี single point of failure
Components หลักของ Ceph ได้แก่ MON (Monitor) จัดการ cluster map และ consensus ต้องมีอย่างน้อย 3 nodes, OSD (Object Storage Daemon) เก็บข้อมูลจริง แต่ละ OSD ดูแล disk หนึ่งลูก, MDS (Metadata Server) จัดการ metadata สำหรับ CephFS, MGR (Manager) จัดการ monitoring, dashboard และ orchestration, RGW (RADOS Gateway) ให้บริการ S3/Swift compatible object storage
Capacity Planning สำหรับ Ceph สำคัญมากเพราะ replication factor กระทบ usable capacity โดยตรง (3x replication ใช้ raw space 3 เท่า), OSD placement groups ต้อง size ถูกต้องเพื่อ performance ที่ดี, network bandwidth ต้องเพียงพอสำหรับ replication traffic, recovery จาก disk failure ต้องมี spare capacity และ growth projection ต้องวางแผนล่วงหน้า 6-12 เดือน
ติดตั้ง Ceph Cluster
วิธีติดตั้ง Ceph ด้วย cephadm
# === Install Ceph Cluster with cephadm ===
# Prerequisites:
# - 3+ nodes with Ubuntu 22.04 or Rocky Linux 9
# - Each node: 4+ CPU cores, 8+ GB RAM, 1+ SSD/NVMe for OSD
# - Network: 10GbE recommended (minimum 1GbE)
# - Separate public and cluster networks recommended
# 1. Install cephadm on first node
# NOTE(review): cephadm is a noarch script, so this rpm-repo URL works on
# both RPM- and DEB-based hosts; add-repo configures the matching repo below.
curl --silent --remote-name --location https://download.ceph.com/rpm-reef/el9/noarch/cephadm
chmod +x cephadm
./cephadm add-repo --release reef
./cephadm install
# 2. Bootstrap cluster
# --mon-ip is the first monitor's address on the public network;
# --cluster-network carries replication/recovery traffic only.
cephadm bootstrap \
--mon-ip 10.0.1.10 \
--cluster-network 10.0.2.0/24 \
--dashboard-password-noupdate \
--initial-dashboard-password MyDashP@ss
# Dashboard: https://10.0.1.10:8443
# Default user: admin
# 3. Add hosts
# Copy SSH key to other nodes first (cephadm manages hosts over SSH)
ssh-copy-id root@ceph-node2
ssh-copy-id root@ceph-node3
ceph orch host add ceph-node2 10.0.1.11
ceph orch host add ceph-node3 10.0.1.12
# 4. Add OSDs (auto-detect available disks)
ceph orch apply osd --all-available-devices
# Or specific devices
ceph orch daemon add osd ceph-node1:/dev/sdb
ceph orch daemon add osd ceph-node2:/dev/sdb
ceph orch daemon add osd ceph-node3:/dev/sdb
# 5. Set MON count (3 monitors: quorum survives a single MON failure)
ceph orch apply mon --placement="3 ceph-node1 ceph-node2 ceph-node3"
# 6. Enable features
ceph mgr module enable dashboard
ceph mgr module enable prometheus
ceph mgr module enable pg_autoscaler
# 7. Create pools
# 128 PGs initial; size 3 / min_size 2 = 3 replicas, writes need 2 replicas up
ceph osd pool create rbd-pool 128
ceph osd pool set rbd-pool size 3
ceph osd pool set rbd-pool min_size 2
ceph osd pool application enable rbd-pool rbd
# Create CephFS
ceph fs volume create cephfs
# Create RGW (S3)
ceph orch apply rgw myrgw --placement="2 ceph-node1 ceph-node2" --port=7480
# 8. Verify cluster
ceph -s
# cluster:
# id: abc123
# health: HEALTH_OK
# services:
# mon: 3 daemons
# mgr: 2 active
# osd: 9 osds: 9 up, 9 in
# rgw: 2 daemons active
ceph osd tree
ceph df
echo "Ceph cluster installed"
Capacity Planning และ Sizing
เครื่องมือคำนวณ capacity
#!/usr/bin/env python3
# ceph_capacity_planner.py — Ceph Cluster Capacity Planning
import json
import math
import logging
from typing import Dict, List
from dataclasses import dataclass
# Module-level logger; INFO so planner output is visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")
@dataclass
class DiskSpec:
    """Specification of a single physical disk backing one OSD."""

    size_tb: float
    type: str  # one of: "hdd", "ssd", "nvme"
    rpm: int = 0  # spindle speed — meaningful for HDD only
    iops: int = 0
    throughput_mbps: int = 0
@dataclass
class NodeSpec:
    """Hardware description of one Ceph storage host."""

    hostname: str
    cpu_cores: int
    ram_gb: int
    disks: List[DiskSpec]  # one OSD is assumed per disk
    network_gbps: float = 10
class CephCapacityPlanner:
    """Capacity-planning helper for a Ceph cluster.

    Hosts are registered with add_node(); the calculate_* methods derive
    raw/usable capacity, PG counts, growth forecasts and hardware
    recommendations from the registered inventory.
    """

    def __init__(self):
        self.nodes: List[NodeSpec] = []

    def add_node(self, node: NodeSpec):
        """Register one storage host in the inventory."""
        self.nodes.append(node)

    def calculate_raw_capacity(self):
        """Sum raw disk capacity across all registered nodes.

        Returns a dict with total TB, per-disk-type TB, OSD count
        (one OSD per disk) and node count.
        """
        total_tb = 0.0
        by_type: Dict[str, float] = {}
        osd_count = 0
        for node in self.nodes:
            for disk in node.disks:
                total_tb += disk.size_tb
                # dict.get() instead of fixed hdd/ssd/nvme keys so an
                # unexpected disk type is counted rather than raising KeyError.
                by_type[disk.type] = by_type.get(disk.type, 0) + disk.size_tb
                osd_count += 1
        return {
            "total_raw_tb": round(total_tb, 1),
            "by_type": {k: round(v, 1) for k, v in by_type.items() if v > 0},
            "osd_count": osd_count,
            "node_count": len(self.nodes),
        }

    def calculate_usable_capacity(self, replication_factor=3,
                                  overhead_pct=5, max_fill_pct=85):
        """Derive usable capacity from raw capacity.

        Raw space is divided by the replication factor, reduced by
        BlueStore/metadata overhead, then capped at max_fill_pct
        (clusters should never be planned to run near-full).
        """
        raw = self.calculate_raw_capacity()
        raw_tb = raw["total_raw_tb"]
        # After replication
        after_replication = raw_tb / replication_factor
        # Reserved for overhead (BlueStore, metadata)
        after_overhead = after_replication * (1 - overhead_pct / 100)
        # Max usable (never plan to fill above max_fill_pct)
        usable = after_overhead * (max_fill_pct / 100)
        return {
            "raw_tb": round(raw_tb, 1),
            "replication_factor": replication_factor,
            "after_replication_tb": round(after_replication, 1),
            "overhead_pct": overhead_pct,
            "after_overhead_tb": round(after_overhead, 1),
            "max_fill_pct": max_fill_pct,
            "usable_tb": round(usable, 1),
            # Guard: an empty inventory previously raised ZeroDivisionError.
            "efficiency_pct": round(usable / raw_tb * 100, 1) if raw_tb else 0.0,
            "osd_count": raw["osd_count"],
        }

    def calculate_pg_count(self, pool_pct_of_data=100, osd_count=None,
                           replication_factor=3, target_pgs_per_osd=100):
        """Recommend a PG count for one pool, rounded up to a power of two.

        Standard formula:
        (target_pgs_per_osd * osd_count * pool share) / replication_factor.
        """
        if osd_count is None:
            osd_count = self.calculate_raw_capacity()["osd_count"]
        ideal = (target_pgs_per_osd * osd_count * pool_pct_of_data / 100) / replication_factor
        # Round up to the nearest power of 2 (Ceph convention); floor at 1.
        pg_count = 2 ** math.ceil(math.log2(max(ideal, 1)))
        return {
            "osd_count": osd_count,
            "pool_data_pct": pool_pct_of_data,
            "replication_factor": replication_factor,
            "ideal_pgs": round(ideal),
            "recommended_pg_count": pg_count,
        }

    def forecast_growth(self, current_usage_tb, monthly_growth_tb,
                        months=12):
        """Project utilization month-by-month against current usable capacity.

        Status thresholds: >=85% CRITICAL (Ceph near-full territory),
        >=70% WARNING. Also reports the first month crossing 85%.
        """
        capacity = self.calculate_usable_capacity()
        usable = capacity["usable_tb"]
        forecast = []
        months_until_full = None
        for month in range(1, months + 1):
            projected = current_usage_tb + (monthly_growth_tb * month)
            # With no capacity at all, any usage counts as full.
            utilization = projected / usable * 100 if usable else 100.0
            util_rounded = round(utilization, 1)
            if utilization >= 85:
                status = "CRITICAL"
            elif utilization >= 70:
                status = "WARNING"
            else:
                status = "OK"
            # Track the first month the *reported* (rounded) value hits 85%,
            # matching what callers see in the forecast entries.
            if months_until_full is None and util_rounded >= 85:
                months_until_full = month
            forecast.append({
                "month": month,
                "projected_tb": round(projected, 1),
                "utilization_pct": util_rounded,
                "free_tb": round(usable - projected, 1),
                "status": status,
            })
        return {
            "current_usage_tb": current_usage_tb,
            "usable_capacity_tb": usable,
            "monthly_growth_tb": monthly_growth_tb,
            "months_until_85pct": months_until_full,
            "forecast": forecast,
        }

    def recommend_hardware(self, target_usable_tb, workload="mixed"):
        """Recommend node/disk counts for a target usable capacity.

        Profiles: "archive" (large HDD), "mixed" (SSD, default),
        "performance" (NVMe). Unknown workload names fall back to "mixed".
        Sizing assumes replica pools and an 80% max-fill margin.
        """
        configs = {
            "archive": {
                "disk": DiskSpec(18, "hdd", rpm=7200, iops=100, throughput_mbps=200),
                "disks_per_node": 12, "nodes_min": 3, "replication": 3,
            },
            "mixed": {
                "disk": DiskSpec(4, "ssd", iops=50000, throughput_mbps=500),
                "disks_per_node": 6, "nodes_min": 3, "replication": 3,
            },
            "performance": {
                "disk": DiskSpec(2, "nvme", iops=200000, throughput_mbps=3000),
                "disks_per_node": 4, "nodes_min": 3, "replication": 3,
            },
        }
        config = configs.get(workload, configs["mixed"])
        # raw = usable * replicas, plus 20% headroom (never plan past 80% fill).
        # Multiplying by the replication factor avoids the float error of the
        # equivalent `target / (1 / replication)` form.
        raw_needed = target_usable_tb * config["replication"] / 0.80
        disks_needed = math.ceil(raw_needed / config["disk"].size_tb)
        nodes_needed = max(
            math.ceil(disks_needed / config["disks_per_node"]),
            config["nodes_min"],
        )
        actual_disks = nodes_needed * config["disks_per_node"]
        actual_raw = actual_disks * config["disk"].size_tb
        actual_usable = actual_raw / config["replication"] * 0.80
        return {
            "workload": workload,
            "target_usable_tb": target_usable_tb,
            "recommendation": {
                "nodes": nodes_needed,
                "disks_per_node": config["disks_per_node"],
                "total_disks": actual_disks,
                "disk_type": config["disk"].type,
                "disk_size_tb": config["disk"].size_tb,
                "raw_capacity_tb": round(actual_raw, 1),
                "usable_capacity_tb": round(actual_usable, 1),
                "replication": config["replication"],
            },
        }
# Demo: plan a 3-node all-SSD cluster (6 x 4 TB OSDs per node, 25 GbE).
planner = CephCapacityPlanner()
for idx in (1, 2, 3):
    node = NodeSpec(
        hostname=f"ceph-node{idx}",
        cpu_cores=16,
        ram_gb=64,
        disks=[DiskSpec(4, "ssd", iops=50000) for _ in range(6)],
        network_gbps=25,
    )
    planner.add_node(node)
print("Raw:", json.dumps(planner.calculate_raw_capacity(), indent=2))
print("Usable:", json.dumps(planner.calculate_usable_capacity(), indent=2))
print("PGs:", json.dumps(planner.calculate_pg_count(pool_pct_of_data=80), indent=2))
print("Forecast:", json.dumps(planner.forecast_growth(10, 2, months=12), indent=2))
print("Recommend:", json.dumps(planner.recommend_hardware(50, "mixed"), indent=2))
Monitoring Cluster Health
Monitor Ceph cluster health และ performance
#!/usr/bin/env python3
# ceph_monitor.py — Ceph Cluster Health Monitoring
import subprocess
import json
import logging
from datetime import datetime
from typing import Dict, List
# Module-level logger; INFO so monitoring output is visible when run directly.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ceph_mon")
class CephMonitor:
    """Health/performance monitor that shells out to the `ceph` CLI.

    All queries go through _run_ceph(), which requests JSON output and
    degrades to an empty dict on any failure, so the accessor methods can
    use .get() safely even when the cluster is unreachable.
    """

    def __init__(self):
        # Reserved for callers that want to accumulate samples over time.
        self.history = []

    def _run_ceph(self, *args):
        """Run `ceph --format json <args>` and return the parsed output.

        Returns {} when the command exits non-zero or emits unparseable
        output, so callers never see an exception from the CLI layer.
        """
        cmd = ["ceph", "--format", "json"] + list(args)
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logger.error(f"ceph command failed: {result.stderr}")
            return {}
        try:
            return json.loads(result.stdout)
        except ValueError:
            # e.g. empty stdout or non-JSON noise; previously this crashed.
            logger.error("ceph returned invalid JSON")
            return {}

    def get_cluster_status(self):
        """Snapshot of cluster health, OSD/PG counts, capacity and IO rates."""
        status = self._run_ceph("status")
        health = status.get("health", {})
        osd_map = status.get("osdmap", {}).get("osdmap", {})
        pg_map = status.get("pgmap", {})
        # Sum only PGs actually in active+clean. The previous code took the
        # count of the *first* state in num_pgs_by_state regardless of name.
        active_clean = sum(
            s.get("count", 0)
            for s in pg_map.get("num_pgs_by_state", [])
            if s.get("state_name") == "active+clean"
        )
        return {
            # NOTE(review): naive UTC timestamp (no tzinfo); kept for
            # backward-compatible formatting.
            "timestamp": datetime.utcnow().isoformat(),
            "health": health.get("status", "UNKNOWN"),
            "health_checks": list(health.get("checks", {}).keys()),
            "osds": {
                "total": osd_map.get("num_osds", 0),
                "up": osd_map.get("num_up_osds", 0),
                "in": osd_map.get("num_in_osds", 0),
            },
            "pgs": {
                "total": pg_map.get("num_pgs", 0),
                "active_clean": active_clean,
            },
            "storage": {
                "total_bytes": pg_map.get("bytes_total", 0),
                "used_bytes": pg_map.get("bytes_used", 0),
                "available_bytes": pg_map.get("bytes_avail", 0),
                # max(..., 1) avoids division by zero on an empty report.
                "used_pct": round(
                    pg_map.get("bytes_used", 0) / max(pg_map.get("bytes_total", 1), 1) * 100, 1
                ),
            },
            "io": {
                "read_bytes_sec": pg_map.get("read_bytes_sec", 0),
                "write_bytes_sec": pg_map.get("write_bytes_sec", 0),
                "read_op_sec": pg_map.get("read_op_per_sec", 0),
                "write_op_sec": pg_map.get("write_op_per_sec", 0),
            },
        }

    def get_osd_status(self):
        """Per-OSD utilization from `ceph osd df`, sorted most-full first."""
        osds = self._run_ceph("osd", "df")
        osd_list = []
        for osd in osds.get("nodes", []):
            utilization = osd.get("utilization", 0)
            status = "ok"
            if utilization > 85:
                status = "critical"
            elif utilization > 75:
                status = "warning"
            osd_list.append({
                "id": osd.get("id"),
                "name": osd.get("name"),
                "total_kb": osd.get("kb"),
                "used_kb": osd.get("kb_used"),
                "available_kb": osd.get("kb_avail"),
                "utilization_pct": round(utilization, 1),
                "status": status,
            })
        return sorted(osd_list, key=lambda x: x["utilization_pct"], reverse=True)

    def get_pool_stats(self):
        """Per-pool stored bytes, object counts and IO from `ceph osd pool stats`."""
        pools = self._run_ceph("osd", "pool", "stats")
        pool_list = []
        for pool in pools:
            pool_list.append({
                "name": pool.get("pool_name"),
                "id": pool.get("pool_id"),
                "stored_bytes": pool.get("stored", 0),
                "objects": pool.get("objects", 0),
                "read_bytes_sec": pool.get("rd_bytes", 0),
                "write_bytes_sec": pool.get("wr_bytes", 0),
            })
        return pool_list

    def health_check(self):
        """Aggregate cluster + OSD status into a pass/fail report with issues."""
        status = self.get_cluster_status()
        osds = self.get_osd_status()
        issues = []
        if status["health"] != "HEALTH_OK":
            issues.append(f"Cluster health: {status['health']}")
            for check in status["health_checks"]:
                issues.append(f" - {check}")
        down_osds = status["osds"]["total"] - status["osds"]["up"]
        if down_osds > 0:
            issues.append(f"{down_osds} OSDs are down")
        if status["storage"]["used_pct"] > 80:
            issues.append(f"Storage usage: {status['storage']['used_pct']}%")
        for osd in osds:
            if osd["status"] == "critical":
                issues.append(f"OSD {osd['name']} at {osd['utilization_pct']}%")
        return {
            "timestamp": status["timestamp"],
            "healthy": len(issues) == 0,
            "issues": issues,
            "summary": status,
        }
# monitor = CephMonitor()
# health = monitor.health_check()
# print(json.dumps(health, indent=2))
Performance Tuning และ Optimization
ปรับแต่ง Ceph performance
# === Ceph Performance Tuning ===
# 1. BlueStore Tuning
# ===================================
# Dedicated WAL/DB on NVMe for HDD OSDs
# Ratio ordering must hold: nearfull (0.80) < backfillfull (0.85) < full (0.90)
ceph osd set-full-ratio 0.90
ceph osd set-nearfull-ratio 0.80
ceph osd set-backfillfull-ratio 0.85
# Per-OSD settings in ceph.conf
# [osd]
# bluestore_cache_size = 4294967296 # 4GB cache per OSD
# bluestore_cache_autotune = true
# bluestore_cache_meta_ratio = 0.4
# bluestore_cache_kv_ratio = 0.4
# bluestore_cache_data_ratio = 0.2
#
# # Async IO
# bdev_async_discard = true
# bdev_enable_discard = true
#
# # Recovery tuning (balance between recovery speed and client IO)
# osd_recovery_max_active = 3
# osd_recovery_max_active_hdd = 1
# osd_recovery_max_active_ssd = 10
# osd_max_backfills = 1
# osd_recovery_sleep = 0
# osd_recovery_priority = 5
# 2. Network Tuning
# ===================================
# Separate public and cluster networks
# [global]
# public_network = 10.0.1.0/24
# cluster_network = 10.0.2.0/24
# ms_type = async+posix
# ms_async_op_threads = 5
# System network tuning — enlarge socket buffers for high-throughput links.
# NOTE(review): sysctl -w is not persistent; add to /etc/sysctl.d/ to survive reboot.
sudo sysctl -w net.core.rmem_max=67108864
sudo sysctl -w net.core.wmem_max=67108864
sudo sysctl -w net.core.rmem_default=33554432
sudo sysctl -w net.core.wmem_default=33554432
sudo sysctl -w net.ipv4.tcp_rmem="4096 87380 33554432"
sudo sysctl -w net.ipv4.tcp_wmem="4096 65536 33554432"
# 3. PG Autoscaler
# ===================================
ceph mgr module enable pg_autoscaler
ceph osd pool set rbd-pool pg_autoscale_mode on
# Check PG status
ceph osd pool autoscale-status
# 4. Pool Optimization
# ===================================
# Set compression for suitable pools
ceph osd pool set archive-pool compression_algorithm snappy
ceph osd pool set archive-pool compression_mode aggressive
ceph osd pool set archive-pool compression_required_ratio 0.875
# 5. Benchmark
# ===================================
# RADOS bench (sequential write); --no-cleanup keeps objects for the read tests
rados bench -p rbd-pool 60 write --no-cleanup
# RADOS bench (sequential read)
rados bench -p rbd-pool 60 seq
# RADOS bench (random read)
rados bench -p rbd-pool 60 rand
# RBD bench
rbd create test-image --size 10G --pool rbd-pool
rbd bench test-image --io-type write --io-size 4K --io-threads 16 --pool rbd-pool
rbd bench test-image --io-type read --io-size 4K --io-threads 16 --pool rbd-pool
# fio with RBD
# [global]
# ioengine=rbd
# clientname=admin
# pool=rbd-pool
# rbdname=test-image
# rw=randread
# bs=4k
# numjobs=4
# iodepth=32
# runtime=60
echo "Ceph tuning complete"
Scaling และ Maintenance
ขยาย cluster และ maintenance
# === Ceph Cluster Scaling ===
# 1. Add New OSD Host
# ===================================
# Add host to cluster
ceph orch host add ceph-node4 10.0.1.13
# Add OSDs on new host
ceph orch daemon add osd ceph-node4:/dev/sdb
ceph orch daemon add osd ceph-node4:/dev/sdc
ceph orch daemon add osd ceph-node4:/dev/sdd
# Monitor rebalancing (streams cluster events)
ceph -w
# Watch PG states: active+remapped, backfilling, recovering
# Wait until all PGs are active+clean
# 2. Remove OSD (Graceful)
# ===================================
# Mark OSD out (starts rebalancing)
ceph osd out osd.5
# Wait for rebalancing — safe-to-destroy exits non-zero while data still
# depends on this OSD, so the loop blocks until removal is safe.
while ! ceph osd safe-to-destroy osd.5; do
echo "Waiting for safe-to-destroy..."
sleep 30
done
# Stop and remove
ceph orch daemon rm osd.5
ceph osd purge osd.5 --yes-i-really-mean-it
# 3. Replace Failed Disk
# ===================================
# Identify failed OSD
ceph osd tree | grep down
ceph health detail
# Remove failed OSD
ceph osd out osd.3
ceph osd purge osd.3 --yes-i-really-mean-it
# Replace disk physically, then add new OSD
ceph orch daemon add osd ceph-node2:/dev/sdc
# 4. Maintenance Window
# ===================================
# Set noout flag (prevent rebalancing during maintenance)
ceph osd set noout
# Perform maintenance...
# Restart services, update firmware, etc.
# NOTE(review): on cephadm-managed hosts the systemd unit is usually
# ceph-<fsid>@osd.3 — confirm the unit name on your hosts.
sudo systemctl restart ceph-osd@3
# Unset flag after maintenance
ceph osd unset noout
# 5. Upgrade Ceph
# ===================================
# Check current version
ceph versions
# Set container image for upgrade
ceph orch upgrade start --image quay.io/ceph/ceph:v18.2.2
# Monitor upgrade progress
ceph orch upgrade status
# 6. Backup Strategies
# ===================================
# RBD snapshots
rbd snap create rbd-pool/myimage@snap1
rbd snap ls rbd-pool/myimage
# Export snapshot
rbd export rbd-pool/myimage@snap1 /backup/myimage-snap1.img
# Incremental export (delta between snap0 and snap1)
rbd export-diff rbd-pool/myimage@snap1 --from-snap snap0 /backup/myimage-diff.img
# CephFS snapshots (creating a dir under .snap takes the snapshot)
mkdir /mnt/cephfs/.snap/daily-$(date +%Y%m%d)
# 7. Monitoring Alerts
# ===================================
# Prometheus alerting rules for Ceph
# groups:
# - name: ceph
# rules:
# - alert: CephHealthWarning
# expr: ceph_health_status == 1
# for: 5m
# - alert: CephHealthError
# expr: ceph_health_status == 2
# for: 1m
# - alert: CephOSDDown
# expr: count(ceph_osd_up == 0) > 0
# for: 5m
# - alert: CephStorageFull
# expr: ceph_cluster_total_used_bytes / ceph_cluster_total_bytes > 0.80
# for: 10m
echo "Ceph scaling and maintenance documented"
FAQ คำถามที่พบบ่อย
Q: Ceph ต้องการ hardware อะไรบ้าง?
A: ขั้นต่ำสำหรับ production 3 nodes แต่ละ node ต้องมี CPU 4+ cores (แนะนำ 1 core ต่อ OSD), RAM 4GB base + 5GB ต่อ OSD (เช่น 6 OSDs ต้อง 34GB), 1 SSD/NVMe ต่อ OSD สำหรับ data (หรือ HDD กับ NVMe WAL/DB), network 10GbE ขั้นต่ำ (25GbE สำหรับ NVMe clusters) SSD สำหรับ MON และ MGR (ไม่ต้องใหญ่ 50-100GB พอ) สำหรับ production จริงจัง แยก public network และ cluster network
Q: Replication 3 กับ Erasure Coding ต่างกันอย่างไร?
A: Replication 3 เก็บ data 3 copies ใช้ raw space 3 เท่า แต่ recovery เร็ว read performance ดี (อ่านจาก copy ไหนก็ได้) เหมาะสำหรับ hot data ที่ต้องการ low latency Erasure Coding (เช่น k=4, m=2) ใช้ raw space 1.5 เท่า ประหยัดกว่ามาก แต่ write latency สูงกว่า, recovery ช้ากว่า, ใช้ CPU มากกว่า เหมาะสำหรับ cold/archive data ที่ต้องการ capacity efficiency
Q: PG count ตั้งเท่าไหร่ดี?
A: ใช้สูตร (target_pgs_per_osd * total_osds * pool_pct) / replication_factor แล้ว round up เป็น power of 2 เช่น 18 OSDs, replication 3, pool ใช้ 80% ของ data = (100 * 18 * 0.8) / 3 = 480 ปัดเป็น 512 PGs ต่ำกว่า 100 PGs per OSD จะเสีย performance สูงกว่า 200 PGs per OSD จะใช้ memory มาก Ceph Reef+ มี pg_autoscaler ที่ adjust อัตโนมัติ แนะนำเปิดใช้
Q: Ceph เหมาะกับ workload ประเภทไหน?
A: เหมาะมากสำหรับ object storage (S3 compatible) ใช้แทน MinIO สำหรับ large scale, block storage สำหรับ VMs (Proxmox, OpenStack), Kubernetes persistent volumes (Rook-Ceph), backup storage และ data lake ไม่เหมาะสำหรับ workloads ที่ต้องการ ultra-low latency (ใช้ local NVMe แทน), small clusters น้อยกว่า 3 nodes, single-tenant database ที่ต้องการ consistent latency
