Ceph Storage Cluster Troubleshooting — คู่มือวินิจฉัยและแก้ปัญหา
Ceph เป็น distributed storage system ที่ซับซ้อน การ troubleshoot ปัญหาต้องเข้าใจ architecture และรู้จักเครื่องมือที่ใช้วินิจฉัย บทความนี้รวบรวมปัญหาที่พบบ่อยในการดูแล Ceph cluster ทั้ง OSD failures, slow requests, PG issues, network problems และ performance degradation พร้อมวิธีแก้ไขแบบ step-by-step และ Python scripts สำหรับ automated health checks เหมาะสำหรับ system administrators และ DevOps engineers ที่ดูแล Ceph clusters
Diagnostic Commands
# diagnostics.py — Ceph diagnostic commands
import json
class CephDiagnostics:
    """Cheat sheet of essential Ceph diagnostic CLI commands plus a quick health-check shell script."""

    # command key -> {command, description (Thai), output_example}
    ESSENTIAL_COMMANDS = {
        "health": {
            "command": "ceph health detail",
            "description": "แสดง health status และ warnings ทั้งหมด",
            "output_example": "HEALTH_WARN 1 osds down; Degraded data redundancy",
        },
        "status": {
            "command": "ceph status (ceph -s)",
            "description": "ภาพรวม cluster: MONs, OSDs, PGs, usage",
            "output_example": "cluster health, mon/osd count, pg stats, io stats",
        },
        "osd_tree": {
            "command": "ceph osd tree",
            "description": "แสดง OSD hierarchy และ status (up/down, in/out)",
            "output_example": "tree view: root → rack → host → osd (up/down)",
        },
        "pg_stat": {
            "command": "ceph pg stat",
            "description": "สรุปสถานะ PGs ทั้งหมด",
            "output_example": "256 pgs: 250 active+clean, 6 active+undersized",
        },
        "osd_perf": {
            "command": "ceph osd perf",
            "description": "แสดง commit/apply latency ของทุก OSD",
            "output_example": "osd.0 commit_latency(ms) 5 apply_latency(ms) 12",
        },
        "df": {
            "command": "ceph df",
            "description": "แสดง disk usage ทั้ง cluster และแต่ละ pool",
            "output_example": "TOTAL: 48TB, USED: 28TB (58%), AVAIL: 20TB",
        },
        "log": {
            "command": "ceph log last 50",
            "description": "แสดง 50 log entries ล่าสุด",
            "output_example": "recent cluster events and warnings",
        },
    }

    def show_commands(self):
        """Print every essential diagnostic command followed by its description."""
        print("=== Essential Diagnostic Commands ===\n")
        # The keys are never printed, so iterate the values directly.
        for entry in self.ESSENTIAL_COMMANDS.values():
            print(f" $ {entry['command']}")
            print(f" {entry['description']}")
            print()

    def quick_check(self):
        """Print (at most the first 400 chars of) a one-shot bash health-check script."""
        print("=== Quick Health Check Script ===")
        shell_script = """
#!/bin/bash
echo "=== Ceph Quick Health Check ==="
echo "--- Health ---"
ceph health detail
echo "--- OSD Status ---"
ceph osd stat
echo "--- PG Status ---"
ceph pg stat
echo "--- Disk Usage ---"
ceph df
echo "--- Slow OSDs ---"
ceph osd perf | sort -k3 -n -r | head -5
"""
        print(shell_script[:400])
# Demo: print the command reference, then the quick-check script.
diagnostics = CephDiagnostics()
diagnostics.show_commands()
diagnostics.quick_check()
OSD Problems
# osd_problems.py — OSD troubleshooting
import json
import random
class OSDProblems:
    """Catalogue of common OSD failure modes (check command, causes, fixes) plus a mock health view."""

    # issue key -> {symptom, check command, likely causes, step-by-step fix}
    ISSUES = {
        "osd_down": {
            "symptom": "OSD marked down",
            "check": "ceph osd tree | grep down",
            "causes": [
                "Disk failure (smartctl -a /dev/sdX)",
                "OSD daemon crashed (systemctl status ceph-osd@N)",
                "Network connectivity lost",
                "Out of memory (dmesg | grep oom)",
            ],
            "fix": [
                "1. ตรวจ logs: journalctl -u ceph-osd@N -n 100",
                "2. ตรวจ disk: smartctl -a /dev/sdX",
                "3. Restart OSD: systemctl restart ceph-osd@N",
                "4. ถ้า disk fail: replace disk + redeploy OSD",
            ],
        },
        "slow_ops": {
            "symptom": "Slow OSD requests (> 30s)",
            "check": "ceph daemon osd.N perf dump | grep slow",
            "causes": [
                "Disk I/O saturation (iostat -x 1)",
                "Network congestion",
                "Undersized cluster (too few OSDs)",
                "Large object writes",
            ],
            "fix": [
                "1. ตรวจ I/O: iostat -x 1 5",
                "2. ตรวจ network: iperf3 ระหว่าง nodes",
                "3. ตรวจ OSD journal/WAL: ใช้ SSD สำหรับ WAL+DB",
                "4. ปรับ osd_op_thread_timeout และ osd_op_complaint_time",
            ],
        },
        "nearfull": {
            "symptom": "OSD nearfull (> 85%) หรือ full (> 95%)",
            "check": "ceph osd df | sort -k7 -n -r",
            "causes": [
                "Disk space หมด",
                "Data ไม่ balanced ระหว่าง OSDs",
                "Pool quota exceeded",
            ],
            "fix": [
                "1. เพิ่ม OSDs ใหม่",
                "2. Rebalance: ceph osd reweight-by-utilization",
                "3. ลบ data ที่ไม่ต้องการ",
                "4. ปรับ nearfull ratio: ceph osd set-nearfull-ratio 0.90",
            ],
        },
    }

    def show_issues(self):
        """Print each issue's symptom, check command, first cause, and first fix step."""
        print("=== OSD Problems ===\n")
        # Keys are not used in the output — iterate values only.
        for entry in self.ISSUES.values():
            print(f"[{entry['symptom']}]")
            print(f" Check: {entry['check']}")
            print(f" Causes: {entry['causes'][0]}")
            print(f" Fix: {entry['fix'][0]}")
            print()

    def osd_health(self):
        """Print a randomly generated status line for six mock OSDs."""
        print("=== OSD Health Status ===")
        for idx in range(6):
            # 1-in-5 chance the OSD is "down"; latency/usage are drawn only when up.
            state = random.choice(["up", "up", "up", "up", "down"])
            lat_ms = random.randint(2, 50) if state == "up" else 0
            pct_used = random.randint(40, 95) if state == "up" else 0
            if state != "up":
                marker = "DOWN"
            elif lat_ms < 20:
                marker = "OK"
            else:
                marker = "WARN"
            print(f" [{marker:>4}] osd.{idx}: {state} | latency={lat_ms}ms | usage={pct_used}%")
# Demo: list the OSD issue catalogue and a simulated health snapshot.
osd_ref = OSDProblems()
osd_ref.show_issues()
osd_ref.osd_health()
PG (Placement Group) Issues
# pg_issues.py — PG troubleshooting
import json
class PGIssues:
    """Placement Group (PG) troubleshooting reference: states with operator actions, plus CLI commands.

    Fix: the <pg-id> placeholder was stripped from several commands (leaving
    e.g. "ceph pg repair " / "ceph pg query", which are incomplete invocations);
    it is restored here. <pg-id> is the PG identifier, e.g. "2.1f".
    """

    # PG state -> {description (Thai), recommended operator action}
    STATES = {
        "active+clean": {"description": "ปกติ — PG ทำงานได้และ replicas ครบ", "action": "ไม่ต้องทำอะไร"},
        "active+undersized": {"description": "ทำงานได้แต่ replicas ไม่ครบ", "action": "ตรวจ OSD ที่ down → แก้ไข OSD"},
        "active+degraded": {"description": "ทำงานได้แต่ข้อมูลบางส่วนสูญเสีย replica", "action": "รอ recovery หรือตรวจ OSD"},
        "peering": {"description": "PGs กำลัง sync กัน", "action": "รอ — ปกติหลัง OSD restart/add"},
        "recovering": {"description": "กำลัง recover data", "action": "รอ — อาจใช้เวลานานถ้า data เยอะ"},
        "backfilling": {"description": "กำลัง backfill data ไป OSD ใหม่", "action": "รอ — ปรับ osd_max_backfills ถ้าช้า"},
        "stale": {"description": "PG ไม่ได้ report status", "action": "ตรวจ OSDs ที่รับผิดชอบ PG นี้"},
        "inconsistent": {"description": "Data ไม่ตรงกันระหว่าง replicas", "action": "ceph pg repair <pg-id>"},
        "incomplete": {"description": "ข้อมูลไม่ครบ ไม่สามารถ serve ได้", "action": "Critical — ต้องแก้ OSD ที่มี data"},
    }

    # Command cheat sheet; substitute <pg-id> with the actual placement-group id.
    COMMANDS = {
        "list_unhealthy": "ceph pg ls | grep -v active+clean",
        "pg_detail": "ceph pg <pg-id> query",
        "repair": "ceph pg repair <pg-id>",
        "deep_scrub": "ceph pg deep-scrub <pg-id>",
        "dump_stuck": "ceph pg dump_stuck unclean",
        "recovery_speed": "ceph tell 'osd.*' config set osd_max_backfills 2",
    }

    def show_states(self):
        """Print each PG state with a severity icon and the recommended action."""
        print("=== PG States ===\n")
        for state, info in self.STATES.items():
            # active+clean is healthy; other active* states are warnings; the rest are errors.
            icon = "OK" if state == "active+clean" else "WARN" if "active" in state else "ERR"
            print(f" [{icon:>4}] {state:<25} → {info['action']}")

    def show_commands(self):
        """Print the PG command cheat sheet."""
        print("\n=== PG Commands ===")
        for name, cmd in self.COMMANDS.items():
            print(f" [{name}] $ {cmd}")
# Demo: dump the PG state table and the command cheat sheet.
pg_ref = PGIssues()
pg_ref.show_states()
pg_ref.show_commands()
Automated Health Check Script
# health_check.py — Automated Ceph health check
import json
import random
import subprocess
class CephHealthCheck:
    """Carries a production health-check script as text and prints a simulated sample run.

    NOTE(review): SOURCE indentation was flattened by extraction; the embedded
    script below has been re-indented conventionally — confirm against the original.
    """

    # Full source of the standalone checker, shown (truncated) by show_script().
    PYTHON_SCRIPT = """
# ceph_health_check.py — Production health check
import subprocess
import json
import sys
class CephChecker:
    def __init__(self):
        self.issues = []
    def run_cmd(self, cmd):
        result = subprocess.run(cmd.split(), capture_output=True, text=True)
        return result.stdout.strip()
    def check_health(self):
        health = self.run_cmd("ceph health -f json")
        data = json.loads(health)
        status = data.get("status", "UNKNOWN")
        if status != "HEALTH_OK":
            checks = data.get("checks", {})
            for check_name, check_data in checks.items():
                severity = check_data.get("severity", "UNKNOWN")
                summary = check_data["summary"]["message"]
                self.issues.append(f"[{severity}] {check_name}: {summary}")
        return status
    def check_osds(self):
        osd_stat = self.run_cmd("ceph osd stat -f json")
        data = json.loads(osd_stat)
        total = data.get("num_osds", 0)
        up = data.get("num_up_osds", 0)
        in_count = data.get("num_in_osds", 0)
        if up < total:
            self.issues.append(f"[WARN] OSDs down: {total - up}/{total}")
        return {"total": total, "up": up, "in": in_count}
    def check_usage(self):
        df = self.run_cmd("ceph df -f json")
        data = json.loads(df)
        stats = data.get("stats", {})
        total = stats.get("total_bytes", 1)
        used = stats.get("total_used_raw_bytes", 0)
        pct = (used / total) * 100
        if pct > 80:
            self.issues.append(f"[WARN] Cluster usage: {pct:.1f}%")
        return round(pct, 1)
    def report(self):
        health = self.check_health()
        osds = self.check_osds()
        usage = self.check_usage()
        print(f"Health: {health}")
        print(f"OSDs: {osds['up']}/{osds['total']} up")
        print(f"Usage: {usage}%")
        if self.issues:
            print(f"\\nIssues ({len(self.issues)}):")
            for issue in self.issues:
                print(f" {issue}")
            return 1
        return 0
checker = CephChecker()
sys.exit(checker.report())
"""

    def show_script(self):
        """Print a header plus the first 700 characters of the embedded checker script."""
        print("=== Automated Health Check ===")
        print(self.PYTHON_SCRIPT[:700])

    def simulate(self):
        """Print one randomized sample of what the checker's output would look like."""
        print("\n=== Simulated Output ===")
        # Draws happen in the same order as before: health, osd delta, usage.
        overall = random.choice(["HEALTH_OK", "HEALTH_OK", "HEALTH_WARN"])
        total_osds = 12
        up_osds = total_osds - random.choice([0, 0, 0, 1])
        pct_used = random.uniform(40, 85)
        print(f" Health: {overall}")
        print(f" OSDs: {up_osds}/{total_osds} up")
        print(f" Usage: {pct_used:.1f}%")
        if overall != "HEALTH_OK":
            print(" Issues: [WARN] 1 OSD down, recovery in progress")
# Demo: show the embedded checker script, then a simulated run.
health_demo = CephHealthCheck()
health_demo.show_script()
health_demo.simulate()
Performance Tuning
# performance.py — Ceph performance tuning
import json
import random
class PerformanceTuning:
    """Ceph performance-tuning recommendations and benchmark command reference.

    Fix: the pool-name argument was stripped from the pg_count command
    ("ceph osd pool set pg_num 256" is not a valid invocation — the pool
    name is required); the <pool> placeholder is restored here.
    """

    # tuning key -> {name, description (Thai), expected impact, example command}
    TUNING = {
        "osd_journal": {
            "name": "OSD Journal/WAL on SSD",
            "description": "ใช้ SSD สำหรับ BlueStore WAL+DB แยกจาก HDD data",
            "impact": "Write latency ลดลง 50-80%",
            "command": "ceph-volume lvm create --data /dev/sdb --block.wal /dev/nvme0n1p1 --block.db /dev/nvme0n1p2",
        },
        "recovery_speed": {
            "name": "Recovery/Backfill Speed",
            "description": "ปรับ speed ของ recovery ให้เหมาะสม (ไม่กระทบ client I/O)",
            "impact": "Balance recovery speed vs client performance",
            "command": "ceph tell 'osd.*' config set osd_recovery_max_active 3",
        },
        "pg_count": {
            "name": "PG Count Optimization",
            "description": "ตั้ง PG count ให้เหมาะ: (OSDs × 100) / replicas → round up to power of 2",
            "impact": "กระจาย data ได้ดีขึ้น ลด hotspots",
            # <pool> is the target pool name, e.g. "rbd".
            "command": "ceph osd pool set <pool> pg_num 256",
        },
        "network": {
            "name": "Network Optimization",
            "description": "แยก public network (client) กับ cluster network (replication)",
            "impact": "Replication ไม่กระทบ client traffic",
            "command": "ceph config set global cluster_network 10.0.1.0/24",
        },
        "cache_tier": {
            "name": "Cache Tier (SSD ↔ HDD)",
            "description": "ใช้ SSD pool เป็น cache หน้า HDD pool",
            "impact": "Read performance ดีขึ้น 5-10x สำหรับ hot data",
            "command": "ceph osd tier add hdd-pool ssd-pool && ceph osd tier cache-mode ssd-pool writeback",
        },
    }

    def show_tuning(self):
        """Print each tuning item's name, description, and expected impact."""
        print("=== Performance Tuning ===\n")
        # Keys are not printed — iterate the values directly.
        for tune in self.TUNING.values():
            print(f"[{tune['name']}]")
            print(f" {tune['description']}")
            print(f" Impact: {tune['impact']}")
            print()

    def benchmark(self):
        """Print common rados/rbd/fio benchmark invocations against a test pool."""
        print("=== Benchmark Commands ===")
        cmds = [
            "rados bench 60 write -p testpool --no-cleanup",
            "rados bench 60 seq -p testpool",
            "rados bench 60 rand -p testpool",
            "rbd bench --io-type write --io-size 4K --io-total 1G testpool/testimg",
            "fio --name=ceph-test --ioengine=rbd --pool=testpool --rbdname=testimg --iodepth=32 --rw=randwrite --bs=4k --runtime=60",
        ]
        for cmd in cmds:
            print(f" $ {cmd}")
# Demo: print the tuning recommendations and benchmark commands.
perf_ref = PerformanceTuning()
perf_ref.show_tuning()
perf_ref.benchmark()
FAQ - คำถามที่พบบ่อย
Q: HEALTH_WARN แต่ทุกอย่างทำงานปกติ ต้องแก้ไหม?
A: ควรแก้ HEALTH_WARN ทั่วไป: clock skew (sync NTP), nearfull OSD (เพิ่ม storage), noout flag set (ลืมถอด flag) บาง WARN ไม่เร่งด่วน: too few PGs (ปรับเมื่อ maintenance) แต่ WARN อาจกลายเป็น ERROR ถ้าปล่อยนาน ดู ceph health detail เสมอ
Q: OSD down แล้วจะสูญเสียข้อมูลไหม?
A: ไม่ ถ้าใช้ replication 3x (default) ต้อง OSD down 3 ตัวที่เก็บ data เดียวกันพร้อมกันถึงจะสูญเสีย Ceph จะ auto-recover เมื่อ OSD กลับมา ถ้า OSD down นาน (> 10 นาที default): Ceph เริ่ม recovery ไป OSD อื่น สำคัญ: อย่าให้ usage > 80% (ต้องมี space สำหรับ recovery)
Q: Recovery ช้ามาก ทำอย่างไร?
A: ปรับ recovery speed: ceph tell 'osd.*' config set osd_recovery_max_active 5 เพิ่ม backfill: ceph tell 'osd.*' config set osd_max_backfills 3 ลด priority: ceph tell 'osd.*' config set osd_recovery_sleep 0.1 (ช้าลงแต่ไม่กระทบ client) ข้อควรระวัง: recovery เร็ว = client I/O ช้า ต้อง balance ระหว่างสองอย่างนี้
Q: ควร deep-scrub บ่อยแค่ไหน?
A: Default: ทุก 7 วัน (osd_deep_scrub_interval) แนะนำ: ไม่เปลี่ยนจาก default Deep-scrub ตรวจ data integrity (checksum verify) ใช้ I/O มาก — ควร schedule ช่วง low-traffic Manual: ceph pg deep-scrub <pg-id>
