What is a Ceph Storage Cluster?

Ceph is an open source distributed storage system that provides object storage, block storage, and file storage on a single unified platform. It is designed for high availability, scalability, and self-healing, uses the CRUSH algorithm for data placement, and has no single point of failure.

Ceph's core components are:
- MON (Monitor): maintains the cluster map and quorum consensus; at least 3 are required
- OSD (Object Storage Daemon): stores the actual data; each OSD manages one disk
- MDS (Metadata Server): manages metadata for CephFS
- MGR (Manager): handles monitoring, the dashboard, and orchestration
- RGW (RADOS Gateway): provides S3/Swift-compatible object storage
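To see which of these daemons are running on a live cluster, a short sketch like the one below can tally them by type. It assumes cephadm's ceph orch ps --format json output, where each entry carries a daemon_type field; verify against your release.

#!/usr/bin/env python3
# daemon_summary.py — count running Ceph daemons by component (sketch).
import json
import subprocess
from collections import Counter

def daemon_summary():
    out = subprocess.run(
        ["ceph", "orch", "ps", "--format", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    daemons = json.loads(out)
    # Tally e.g. {"mon": 3, "mgr": 2, "osd": 9, "rgw": 2}
    return Counter(d.get("daemon_type", "unknown") for d in daemons)

if __name__ == "__main__":
    for dtype, count in sorted(daemon_summary().items()):
        print(f"{dtype}: {count}")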

Capacity planning is critical for Ceph because the replication factor directly affects usable capacity (3x replication consumes 3x raw space), placement groups must be sized correctly for good performance, network bandwidth must be sufficient for replication traffic, spare capacity must be held back for recovery from disk failures, and growth should be projected 6-12 months ahead. A quick worked example of the replication math follows.
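A minimal sketch of the replication math; the 5% BlueStore/metadata overhead and 85% fill ceiling are planning assumptions (reused by the planner tool later in this section), not fixed Ceph constants.

# capacity_example.py — back-of-the-envelope usable capacity
raw_tb = 72.0                              # e.g. 3 nodes x 6 OSDs x 4 TB
replication = 3
after_replication = raw_tb / replication   # 24.0 TB
after_overhead = after_replication * 0.95  # ~22.8 TB after 5% overhead
usable = after_overhead * 0.85             # ~19.4 TB at the 85% fill ceiling
print(f"{raw_tb} TB raw -> ~{usable:.1f} TB usable at {replication}x replication")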

Installing a Ceph Cluster

How to install Ceph with cephadm

# === Installing a Ceph Cluster with cephadm ===

# Prerequisites:
# - 3+ nodes with Ubuntu 22.04 or Rocky Linux 9
# - Each node: 4+ CPU cores, 8+ GB RAM, 1+ SSD/NVMe for OSD
# - Network: 10GbE recommended (minimum 1GbE)
# - Separate public and cluster networks recommended

# 1. Install cephadm on first node
curl --silent --remote-name --location https://download.ceph.com/rpm-reef/el9/noarch/cephadm
chmod +x cephadm
./cephadm add-repo --release reef
./cephadm install

# 2. Bootstrap cluster
cephadm bootstrap \
 --mon-ip 10.0.1.10 \
 --cluster-network 10.0.2.0/24 \
 --dashboard-password-noupdate \
 --initial-dashboard-password MyDashP@ss

# Dashboard: https://10.0.1.10:8443
# Default user: admin

# 3. Add hosts
# Install the cluster's public SSH key on the other nodes first
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph-node2
ssh-copy-id -f -i /etc/ceph/ceph.pub root@ceph-node3

ceph orch host add ceph-node2 10.0.1.11
ceph orch host add ceph-node3 10.0.1.12

# 4. Add OSDs (auto-detect available disks)
ceph orch apply osd --all-available-devices

# Or specific devices
ceph orch daemon add osd ceph-node1:/dev/sdb
ceph orch daemon add osd ceph-node2:/dev/sdb
ceph orch daemon add osd ceph-node3:/dev/sdb

# 5. Set MON count
ceph orch apply mon --placement="3 ceph-node1 ceph-node2 ceph-node3"

# 6. Enable features
ceph mgr module enable dashboard
ceph mgr module enable prometheus
ceph mgr module enable pg_autoscaler

# 7. Create pools
ceph osd pool create rbd-pool 128
ceph osd pool set rbd-pool size 3
ceph osd pool set rbd-pool min_size 2
ceph osd pool application enable rbd-pool rbd

# Create CephFS
ceph fs volume create cephfs

# Create RGW (S3)
ceph orch apply rgw myrgw --placement="2 ceph-node1 ceph-node2" --port=7480

# 8. Verify cluster
ceph -s
# cluster:
# id: abc123
# health: HEALTH_OK
# services:
# mon: 3 daemons
# mgr: 2 active
# osd: 9 osds: 9 up, 9 in
# rgw: 2 daemons active

ceph osd tree
ceph df

echo "Ceph cluster installed"

Capacity Planning and Sizing

A capacity planning calculator

#!/usr/bin/env python3
# ceph_capacity_planner.py — Ceph Cluster Capacity Planning
import json
import math
import logging
from typing import List
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capacity")

@dataclass
class DiskSpec:
    size_tb: float
    type: str             # hdd, ssd, nvme
    rpm: int = 0          # HDD only
    iops: int = 0
    throughput_mbps: int = 0

@dataclass
class NodeSpec:
    hostname: str
    cpu_cores: int
    ram_gb: int
    disks: List[DiskSpec]
    network_gbps: float = 10

class CephCapacityPlanner:
    def __init__(self):
        self.nodes: List[NodeSpec] = []

    def add_node(self, node: NodeSpec):
        self.nodes.append(node)

    def calculate_raw_capacity(self):
        total_tb = 0.0
        by_type = {"hdd": 0.0, "ssd": 0.0, "nvme": 0.0}
        osd_count = 0

        for node in self.nodes:
            for disk in node.disks:
                total_tb += disk.size_tb
                by_type[disk.type] += disk.size_tb
                osd_count += 1

        return {
            "total_raw_tb": round(total_tb, 1),
            "by_type": {k: round(v, 1) for k, v in by_type.items() if v > 0},
            "osd_count": osd_count,
            "node_count": len(self.nodes),
        }

    def calculate_usable_capacity(self, replication_factor=3,
                                  overhead_pct=5, max_fill_pct=85):
        raw = self.calculate_raw_capacity()
        raw_tb = raw["total_raw_tb"]

        # After replication
        after_replication = raw_tb / replication_factor

        # Reserved for overhead (BlueStore, metadata)
        after_overhead = after_replication * (1 - overhead_pct / 100)

        # Max usable (never fill above max_fill_pct)
        usable = after_overhead * (max_fill_pct / 100)

        return {
            "raw_tb": round(raw_tb, 1),
            "replication_factor": replication_factor,
            "after_replication_tb": round(after_replication, 1),
            "overhead_pct": overhead_pct,
            "after_overhead_tb": round(after_overhead, 1),
            "max_fill_pct": max_fill_pct,
            "usable_tb": round(usable, 1),
            "efficiency_pct": round(usable / raw_tb * 100, 1),
            "osd_count": raw["osd_count"],
        }

    def calculate_pg_count(self, pool_pct_of_data=100, osd_count=None,
                           replication_factor=3, target_pgs_per_osd=100):
        if osd_count is None:
            osd_count = self.calculate_raw_capacity()["osd_count"]

        # Formula: (target_pgs_per_osd * osd_count * pool_pct / 100) / replication_factor
        ideal = (target_pgs_per_osd * osd_count * pool_pct_of_data / 100) / replication_factor

        # Round up to the nearest power of 2
        pg_count = 2 ** math.ceil(math.log2(max(ideal, 1)))

        return {
            "osd_count": osd_count,
            "pool_data_pct": pool_pct_of_data,
            "replication_factor": replication_factor,
            "ideal_pgs": round(ideal),
            "recommended_pg_count": pg_count,
        }

    def forecast_growth(self, current_usage_tb, monthly_growth_tb,
                        months=12):
        capacity = self.calculate_usable_capacity()
        usable = capacity["usable_tb"]

        forecast = []
        for month in range(1, months + 1):
            projected = current_usage_tb + (monthly_growth_tb * month)
            utilization = projected / usable * 100

            if utilization >= 85:
                status = "CRITICAL"
            elif utilization >= 70:
                status = "WARNING"
            else:
                status = "OK"

            forecast.append({
                "month": month,
                "projected_tb": round(projected, 1),
                "utilization_pct": round(utilization, 1),
                "free_tb": round(usable - projected, 1),
                "status": status,
            })

        # Find when capacity runs out
        months_until_full = None
        for f in forecast:
            if f["utilization_pct"] >= 85:
                months_until_full = f["month"]
                break

        return {
            "current_usage_tb": current_usage_tb,
            "usable_capacity_tb": usable,
            "monthly_growth_tb": monthly_growth_tb,
            "months_until_85pct": months_until_full,
            "forecast": forecast,
        }

    def recommend_hardware(self, target_usable_tb, workload="mixed"):
        configs = {
            "archive": {
                "disk": DiskSpec(18, "hdd", rpm=7200, iops=100, throughput_mbps=200),
                "disks_per_node": 12, "nodes_min": 3, "replication": 3,
            },
            "mixed": {
                "disk": DiskSpec(4, "ssd", iops=50000, throughput_mbps=500),
                "disks_per_node": 6, "nodes_min": 3, "replication": 3,
            },
            "performance": {
                "disk": DiskSpec(2, "nvme", iops=200000, throughput_mbps=3000),
                "disks_per_node": 4, "nodes_min": 3, "replication": 3,
            },
        }

        config = configs.get(workload, configs["mixed"])

        # 0.80 approximates the combined overhead + max-fill headroom above
        raw_needed = target_usable_tb * config["replication"] / 0.80
        disks_needed = math.ceil(raw_needed / config["disk"].size_tb)
        nodes_needed = max(
            math.ceil(disks_needed / config["disks_per_node"]),
            config["nodes_min"],
        )

        actual_disks = nodes_needed * config["disks_per_node"]
        actual_raw = actual_disks * config["disk"].size_tb
        actual_usable = actual_raw / config["replication"] * 0.80

        return {
            "workload": workload,
            "target_usable_tb": target_usable_tb,
            "recommendation": {
                "nodes": nodes_needed,
                "disks_per_node": config["disks_per_node"],
                "total_disks": actual_disks,
                "disk_type": config["disk"].type,
                "disk_size_tb": config["disk"].size_tb,
                "raw_capacity_tb": round(actual_raw, 1),
                "usable_capacity_tb": round(actual_usable, 1),
                "replication": config["replication"],
            },
        }

planner = CephCapacityPlanner()

for i in range(3):
    planner.add_node(NodeSpec(
        hostname=f"ceph-node{i+1}",
        cpu_cores=16, ram_gb=64,
        disks=[DiskSpec(4, "ssd", iops=50000) for _ in range(6)],
        network_gbps=25,
    ))

print("Raw:", json.dumps(planner.calculate_raw_capacity(), indent=2))
print("Usable:", json.dumps(planner.calculate_usable_capacity(), indent=2))
print("PGs:", json.dumps(planner.calculate_pg_count(pool_pct_of_data=80), indent=2))
print("Forecast:", json.dumps(planner.forecast_growth(10, 2, months=12), indent=2))
print("Recommend:", json.dumps(planner.recommend_hardware(50, "mixed"), indent=2))

Monitoring Cluster Health

Monitoring Ceph cluster health and performance

#!/usr/bin/env python3
# ceph_monitor.py — Ceph Cluster Health Monitoring
import subprocess
import json
import logging
from datetime import datetime, timezone

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ceph_mon")

class CephMonitor:
    def __init__(self):
        self.history = []

    def _run_ceph(self, *args):
        cmd = ["ceph", "--format", "json"] + list(args)
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logger.error(f"ceph command failed: {result.stderr}")
            return {}
        return json.loads(result.stdout)

    def get_cluster_status(self):
        status = self._run_ceph("status")

        health = status.get("health", {})
        # Octopus+ flattened the osdmap section; fall back for older releases
        osd_map = status.get("osdmap", {})
        osd_map = osd_map.get("osdmap", osd_map)
        pg_map = status.get("pgmap", {})

        # pgs_by_state is a list of {"state_name": ..., "count": ...}
        pgs_by_state = {
            s.get("state_name"): s.get("count", 0)
            for s in pg_map.get("pgs_by_state", [])
        }

        return {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "health": health.get("status", "UNKNOWN"),
            "health_checks": list(health.get("checks", {}).keys()),
            "osds": {
                "total": osd_map.get("num_osds", 0),
                "up": osd_map.get("num_up_osds", 0),
                "in": osd_map.get("num_in_osds", 0),
            },
            "pgs": {
                "total": pg_map.get("num_pgs", 0),
                "active_clean": pgs_by_state.get("active+clean", 0),
            },
            "storage": {
                "total_bytes": pg_map.get("bytes_total", 0),
                "used_bytes": pg_map.get("bytes_used", 0),
                "available_bytes": pg_map.get("bytes_avail", 0),
                "used_pct": round(
                    pg_map.get("bytes_used", 0) / max(pg_map.get("bytes_total", 1), 1) * 100, 1
                ),
            },
            "io": {
                "read_bytes_sec": pg_map.get("read_bytes_sec", 0),
                "write_bytes_sec": pg_map.get("write_bytes_sec", 0),
                "read_op_sec": pg_map.get("read_op_per_sec", 0),
                "write_op_sec": pg_map.get("write_op_per_sec", 0),
            },
        }

    def get_osd_status(self):
        osds = self._run_ceph("osd", "df")

        osd_list = []
        for osd in osds.get("nodes", []):
            utilization = osd.get("utilization", 0)

            status = "ok"
            if utilization > 85:
                status = "critical"
            elif utilization > 75:
                status = "warning"

            osd_list.append({
                "id": osd.get("id"),
                "name": osd.get("name"),
                "total_kb": osd.get("kb"),
                "used_kb": osd.get("kb_used"),
                "available_kb": osd.get("kb_avail"),
                "utilization_pct": round(utilization, 1),
                "status": status,
            })

        return sorted(osd_list, key=lambda x: x["utilization_pct"], reverse=True)

    def get_pool_stats(self):
        # "ceph df" carries per-pool stored bytes and object counts
        # ("ceph osd pool stats" only reports recovery/client IO rates)
        df = self._run_ceph("df")

        pool_list = []
        for pool in df.get("pools", []):
            stats = pool.get("stats", {})
            pool_list.append({
                "name": pool.get("name"),
                "id": pool.get("id"),
                "stored_bytes": stats.get("stored", 0),
                "objects": stats.get("objects", 0),
                "used_pct": round(stats.get("percent_used", 0) * 100, 1),
                "max_avail_bytes": stats.get("max_avail", 0),
            })

        return pool_list

    def health_check(self):
        status = self.get_cluster_status()
        osds = self.get_osd_status()

        issues = []

        if status["health"] != "HEALTH_OK":
            issues.append(f"Cluster health: {status['health']}")
            for check in status["health_checks"]:
                issues.append(f" - {check}")

        down_osds = status["osds"]["total"] - status["osds"]["up"]
        if down_osds > 0:
            issues.append(f"{down_osds} OSDs are down")

        if status["storage"]["used_pct"] > 80:
            issues.append(f"Storage usage: {status['storage']['used_pct']}%")

        for osd in osds:
            if osd["status"] == "critical":
                issues.append(f"OSD {osd['name']} at {osd['utilization_pct']}%")

        return {
            "timestamp": status["timestamp"],
            "healthy": len(issues) == 0,
            "issues": issues,
            "summary": status,
        }

# monitor = CephMonitor()
# health = monitor.health_check()
# print(json.dumps(health, indent=2))
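To run these checks on an interval, a minimal polling wrapper might look like the sketch below. The 60-second cadence and print-only handling are placeholders for a real alerting hook, and it assumes the class above is saved as ceph_monitor.py.

# poll_ceph.py — periodically run the health check above (sketch)
import json
import time

from ceph_monitor import CephMonitor  # the class defined above

monitor = CephMonitor()
while True:
    report = monitor.health_check()
    if not report["healthy"]:
        print(json.dumps(report["issues"], indent=2))  # alerting hook goes here
    time.sleep(60)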

Performance Tuning and Optimization

Tuning Ceph for performance

# === Ceph Performance Tuning ===

# 1. BlueStore Tuning
# ===================================
# Cluster fill safety margins (defaults: full 0.95, nearfull 0.85, backfillfull 0.90)
ceph osd set-full-ratio 0.90
ceph osd set-nearfull-ratio 0.80
ceph osd set-backfillfull-ratio 0.85

# Per-OSD settings in ceph.conf
# (for HDD OSDs, put the WAL/DB on a dedicated SSD/NVMe device)
# [osd]
# bluestore_cache_size = 4294967296 # 4GB cache per OSD
# bluestore_cache_autotune = true
# bluestore_cache_meta_ratio = 0.4
# bluestore_cache_kv_ratio = 0.4
# bluestore_cache_data_ratio = 0.2
# 
# # Async IO
# bdev_async_discard = true
# bdev_enable_discard = true
# 
# # Recovery tuning (balance between recovery speed and client IO)
# osd_recovery_max_active = 3
# osd_recovery_max_active_hdd = 1
# osd_recovery_max_active_ssd = 10
# osd_max_backfills = 1
# osd_recovery_sleep = 0
# osd_recovery_priority = 5

# 2. Network Tuning
# ===================================
# Separate public and cluster networks
# [global]
# public_network = 10.0.1.0/24
# cluster_network = 10.0.2.0/24
# ms_type = async+posix
# ms_async_op_threads = 5

# System network tuning
sudo sysctl -w net.core.rmem_max=67108864
sudo sysctl -w net.core.wmem_max=67108864
sudo sysctl -w net.core.rmem_default=33554432
sudo sysctl -w net.core.wmem_default=33554432
sudo sysctl -w net.ipv4.tcp_rmem="4096 87380 33554432"
sudo sysctl -w net.ipv4.tcp_wmem="4096 65536 33554432"

# 3. PG Autoscaler
# ===================================
ceph mgr module enable pg_autoscaler
ceph osd pool set rbd-pool pg_autoscale_mode on

# Check PG status
ceph osd pool autoscale-status

# 4. Pool Optimization
# ===================================
# Set compression for suitable pools
ceph osd pool set archive-pool compression_algorithm snappy
ceph osd pool set archive-pool compression_mode aggressive
ceph osd pool set archive-pool compression_required_ratio 0.875

# 5. Benchmark
# ===================================
# RADOS bench (sequential write)
rados bench -p rbd-pool 60 write --no-cleanup
# RADOS bench (sequential read)
rados bench -p rbd-pool 60 seq
# RADOS bench (random read)
rados bench -p rbd-pool 60 rand

# RBD bench
rbd create test-image --size 10G --pool rbd-pool
rbd bench test-image --io-type write --io-size 4K --io-threads 16 --pool rbd-pool
rbd bench test-image --io-type read --io-size 4K --io-threads 16 --pool rbd-pool

# fio with RBD (requires fio built with rbd engine support)
# [global]
# ioengine=rbd
# clientname=admin
# pool=rbd-pool
# rbdname=test-image
# rw=randread
# bs=4k
# numjobs=4
# iodepth=32
# runtime=60
#
# [rbd-randread]   # fio needs at least one job section; options above apply globally

echo "Ceph tuning complete"

Scaling and Maintenance

Expanding the cluster and performing maintenance

# === Ceph Cluster Scaling ===

# 1. Add New OSD Host
# ===================================
# Add host to cluster
ceph orch host add ceph-node4 10.0.1.13

# Add OSDs on new host
ceph orch daemon add osd ceph-node4:/dev/sdb
ceph orch daemon add osd ceph-node4:/dev/sdc
ceph orch daemon add osd ceph-node4:/dev/sdd

# Monitor rebalancing
ceph -w
# Watch PG states: active+remapped, backfilling, recovering
# Wait until all PGs are active+clean

# 2. Remove OSD (Graceful)
# ===================================
# Mark OSD out (starts rebalancing)
ceph osd out osd.5

# Wait for rebalancing
while ! ceph osd safe-to-destroy osd.5; do
 echo "Waiting for safe-to-destroy..."
 sleep 30
done

# Stop and remove (cephadm refuses to remove OSD daemons without --force)
ceph orch daemon rm osd.5 --force
ceph osd purge osd.5 --yes-i-really-mean-it

# 3. Replace Failed Disk
# ===================================
# Identify failed OSD
ceph osd tree | grep down
ceph health detail

# Remove failed OSD
ceph osd out osd.3
ceph osd purge osd.3 --yes-i-really-mean-it

# Replace disk physically, then add new OSD
ceph orch daemon add osd ceph-node2:/dev/sdc

# 4. Maintenance Window
# ===================================
# Set noout flag (prevent rebalancing during maintenance)
ceph osd set noout

# Perform maintenance...
# Restart services, update firmware, etc.
sudo systemctl restart ceph-osd@3
# (on cephadm deployments the unit is named ceph-<fsid>@osd.3 instead)

# Unset flag after maintenance
ceph osd unset noout

# 5. Upgrade Ceph
# ===================================
# Check current version
ceph versions

# Set container image for upgrade
ceph orch upgrade start --image quay.io/ceph/ceph:v18.2.2

# Monitor upgrade progress
ceph orch upgrade status

# 6. Backup Strategies
# ===================================
# RBD snapshots
rbd snap create rbd-pool/myimage@snap1
rbd snap ls rbd-pool/myimage

# Export snapshot
rbd export rbd-pool/myimage@snap1 /backup/myimage-snap1.img

# Incremental export
rbd export-diff rbd-pool/myimage@snap1 --from-snap snap0 /backup/myimage-diff.img

# CephFS snapshots
mkdir /mnt/cephfs/.snap/daily-$(date +%Y%m%d)

# 7. Monitoring Alerts
# ===================================
# Prometheus alerting rules for Ceph
# groups:
#   - name: ceph
#     rules:
#       - alert: CephHealthWarning
#         expr: ceph_health_status == 1
#         for: 5m
#       - alert: CephHealthError
#         expr: ceph_health_status == 2
#         for: 1m
#       - alert: CephOSDDown
#         expr: count(ceph_osd_up == 0) > 0
#         for: 5m
#       - alert: CephStorageFull
#         expr: ceph_cluster_total_used_bytes / ceph_cluster_total_bytes > 0.80
#         for: 10m

echo "Ceph scaling and maintenance documented"

FAQ: Frequently Asked Questions

Q: What hardware does Ceph require?

A: The production minimum is 3 nodes. Each node needs 4+ CPU cores (roughly 1 core per OSD is recommended), 4 GB of base RAM plus 5 GB per OSD (e.g. 6 OSDs need 34 GB), one SSD/NVMe per OSD for data (or HDDs with an NVMe WAL/DB device), and at least a 10GbE network (25GbE for NVMe clusters). MON and MGR need an SSD, but not a large one (50-100 GB is enough). For serious production use, separate the public and cluster networks. The RAM rule of thumb is sketched below.
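A minimal sketch of that RAM sizing rule; the 4 GB base + 5 GB per OSD figures are the guideline cited above and should be treated as a floor, not a precise requirement.

def node_ram_gb(osd_count: int, base_gb: int = 4, per_osd_gb: int = 5) -> int:
    # RAM guideline: base allowance plus a per-OSD allowance
    return base_gb + per_osd_gb * osd_count

print(node_ram_gb(6))  # 34 — matches the 6-OSD example above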

Q: How do 3x replication and erasure coding differ?

A: 3x replication stores three copies of the data and consumes 3x raw space, but recovery is fast and read performance is good (reads can be served from any copy); it suits hot data that needs low latency. Erasure coding (e.g. k=4, m=2) consumes only 1.5x raw space, which is far more economical, but it has higher write latency, slower recovery, and higher CPU usage; it suits cold/archive data where capacity efficiency matters. The space math is shown below.
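A small sketch of the space math; k=4/m=2 is the example profile from the answer above, not a recommendation.

def replication_efficiency(copies: int) -> float:
    return 1 / copies            # 3 copies -> ~0.33 of raw is usable

def ec_efficiency(k: int, m: int) -> float:
    return k / (k + m)           # k=4, m=2 -> ~0.67 of raw (1.5x overhead)

print(f"3x replication: {replication_efficiency(3):.0%} of raw is usable")
print(f"EC k=4, m=2:    {ec_efficiency(4, 2):.0%} of raw is usable")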

Q: What should the PG count be?

A: Use the formula (target_pgs_per_osd * total_osds * pool_pct) / replication_factor, then round up to a power of 2. For example, with 18 OSDs, replication 3, and a pool holding 80% of the data: (100 * 18 * 0.8) / 3 = 480, rounded up to 512 PGs. Below about 100 PGs per OSD performance suffers; above about 200 PGs per OSD memory usage climbs. Modern Ceph ships a pg_autoscaler (on by default for new pools since Pacific) that adjusts this automatically; keeping it enabled is recommended.
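A direct transcription of that formula, mirroring calculate_pg_count() in the planner above:

import math

def pg_count(osds: int, replication: int, pool_pct: float,
             target_per_osd: int = 100) -> int:
    ideal = target_per_osd * osds * (pool_pct / 100) / replication
    return 2 ** math.ceil(math.log2(max(ideal, 1)))  # round up to a power of 2

print(pg_count(18, 3, 80))  # ideal 480 -> recommended 512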

Q: What workloads is Ceph a good fit for?

A: Ceph is a strong fit for object storage (S3-compatible, a large-scale alternative to MinIO), block storage for VMs (Proxmox, OpenStack), Kubernetes persistent volumes (Rook-Ceph), backup storage, and data lakes. It is a poor fit for workloads that need ultra-low latency (use local NVMe instead), small clusters of fewer than 3 nodes, and single-tenant databases that need consistent latency.