Ceph Testing
Ceph Storage Cluster Testing Strategy QA Health Check Performance Benchmark Failure Injection Recovery Verification CRUSH OSD MON RBD RGW CephFS Production
| Storage Type | Interface | Use Case | Test Tool | Key Metric |
|---|---|---|---|---|
| RBD (Block) | Block Device | VM Disk K8s PV | fio rbd bench | IOPS Latency |
| RGW (Object) | S3 / Swift API | Backup Media | cosbench s3cmd | Throughput |
| CephFS (File) | POSIX Mount | Shared Files | fio iozone | IOPS Throughput |
| RADOS (Raw) | librados | Internal | rados bench | Throughput Latency |
Health Check
# === Ceph Health Check Commands ===
# Basic Status
# ceph health
# ceph status
# ceph -s
# Detailed Component Status
# ceph osd tree # OSD hierarchy and status
# ceph osd df # OSD disk usage
# ceph mon stat # Monitor quorum
# ceph mgr stat # Manager active/standby
# ceph df # Cluster storage usage
# ceph pg stat # Placement Group status
# ceph osd pool ls detail # Pool configuration
# Health Warnings
# ceph health detail # Detailed health warnings
# ceph crash ls # Recent daemon crashes
# ceph log last 50 # Recent cluster logs
# Automated Health Check Script
# #!/bin/bash
# echo "=== Ceph Cluster Health ==="
# HEALTH=$(ceph health -f json | jq -r '.status')
# echo "Status: $HEALTH"
#
# echo "=== OSD Status ==="
# TOTAL_OSD=$(ceph osd stat -f json | jq '.num_osds')
# UP_OSD=$(ceph osd stat -f json | jq '.num_up_osds')
# IN_OSD=$(ceph osd stat -f json | jq '.num_in_osds')
# echo "Total: $TOTAL_OSD | Up: $UP_OSD | In: $IN_OSD"
#
# echo "=== Storage Usage ==="
# ceph df -f json | jq '.stats | {total: .total_bytes, used: .total_used_raw_bytes, avail: .total_avail_bytes}'
#
# echo "=== PG Status ==="
# ceph pg stat
#
# if [ "$HEALTH" != "HEALTH_OK" ]; then
# echo "WARNING: Cluster not healthy!"
# ceph health detail
# exit 1
# fi
from dataclasses import dataclass
@dataclass
class HealthCheck:
    """A single recurring health probe for one Ceph cluster component.

    Attributes:
        component: Human-readable name of the subsystem being checked.
        check_command: Ceph CLI command that performs the probe.
        expected: Description of the healthy result.
        alert_on: Condition that should raise an alert.
        frequency: How often the probe should run.
    """

    component: str
    check_command: str
    expected: str
    alert_on: str
    frequency: str


# Monitoring matrix: command to poll, what "healthy" looks like,
# what to alert on, and the polling interval.
checks = [
    HealthCheck("Overall Health", "ceph health", "HEALTH_OK",
                "HEALTH_WARN or HEALTH_ERR", "30s"),
    HealthCheck("OSD Status", "ceph osd stat", "All up and in",
                "Any OSD down", "30s"),
    HealthCheck("MON Quorum", "ceph mon stat", "Quorum maintained",
                "Lost quorum", "30s"),
    HealthCheck("PG Status", "ceph pg stat", "All active+clean",
                "degraded/undersized", "1min"),
    HealthCheck("Disk Usage", "ceph osd df", "< 75% per OSD",
                "> 80% any OSD", "5min"),
    HealthCheck("Cluster Usage", "ceph df", "< 70% total",
                "> 80% total", "5min"),
    HealthCheck("Scrub Errors", "ceph health detail", "No scrub errors",
                "Any inconsistency", "1hr"),
]

print("=== Health Checks ===")
for check in checks:
    print(f" [{check.component}] {check.check_command}")
    print(f" Expected: {check.expected} | Alert: {check.alert_on} | Freq: {check.frequency}")
Performance Testing
# === Ceph Performance Benchmarks ===
# RADOS Bench — Object Storage
# rados bench -p testpool 60 write --no-cleanup # 60s write test
# rados bench -p testpool 60 seq # Sequential read
# rados bench -p testpool 60 rand # Random read
# rados -p testpool cleanup # Cleanup test data
# RBD Bench — Block Storage
# rbd create testimage --size 10G --pool rbd
# rbd bench testimage --io-type write --io-size 4K --io-threads 16 --io-total 1G
# rbd bench testimage --io-type read --io-size 4K --io-threads 16 --io-total 1G
# rbd rm testimage --pool rbd
# FIO — Detailed Block Testing
# fio --name=randwrite --ioengine=rbd --pool=rbd --rbdname=testimage \
# --iodepth=32 --rw=randwrite --bs=4k --size=1G --numjobs=4 \
# --runtime=60 --time_based --group_reporting
# S3 Bench — Object Gateway
# s3cmd put testfile.bin s3://testbucket/ --multipart-chunk-size-mb=64
# cosbench submit workload.xml
@dataclass
class BenchmarkResult:
    """Recorded outcome of one storage benchmark run.

    All figures are kept as display strings exactly as measured/reported.

    Attributes:
        test: Benchmark name.
        tool: Tool used to drive the workload.
        io_pattern: Access pattern (sequential/random/object size).
        throughput: Observed throughput.
        iops: Observed IOPS (or "N/A" where not applicable).
        latency_avg: Average latency.
        latency_p99: 99th-percentile latency.
    """

    test: str
    tool: str
    io_pattern: str
    throughput: str
    iops: str
    latency_avg: str
    latency_p99: str


# Reference numbers captured on a 12-OSD NVMe cluster.
results = [
    BenchmarkResult("RADOS Write", "rados bench", "Sequential",
                    "1.2 GB/s", "N/A", "8ms", "25ms"),
    BenchmarkResult("RADOS Read", "rados bench", "Sequential",
                    "2.1 GB/s", "N/A", "4ms", "12ms"),
    BenchmarkResult("RBD 4K Write", "fio", "Random",
                    "120 MB/s", "30,720", "0.5ms", "2.1ms"),
    BenchmarkResult("RBD 4K Read", "fio", "Random",
                    "280 MB/s", "71,680", "0.2ms", "0.8ms"),
    BenchmarkResult("RBD 64K Write", "fio", "Sequential",
                    "800 MB/s", "12,800", "2.5ms", "8ms"),
    BenchmarkResult("S3 PUT", "cosbench", "Object 1MB",
                    "500 MB/s", "500 ops/s", "15ms", "50ms"),
]

print("\n=== Benchmark Results (12 OSD NVMe) ===")
for res in results:
    print(f" [{res.test}] Tool: {res.tool} | Pattern: {res.io_pattern}")
    print(f" Throughput: {res.throughput} | IOPS: {res.iops}")
    print(f" Latency avg: {res.latency_avg} | p99: {res.latency_p99}")
Failure Testing
# === Failure Injection Tests ===
# OSD Failure
# systemctl stop ceph-osd@5
# watch ceph -s # Watch recovery
# ceph osd tree | grep osd.5 # Verify marked down
# # Wait for recovery complete
# systemctl start ceph-osd@5
# # Verify HEALTH_OK
# Node Failure (simulate)
# ssh node3 "sudo systemctl stop ceph-osd.target"
# ceph osd tree # Check multiple OSDs down
# # Verify data still accessible
# rados -p testpool get testobj /tmp/verify.bin
# md5sum /tmp/verify.bin # Compare with original
# Network Partition (iptables)
# iptables -A INPUT -s node3 -j DROP
# iptables -A OUTPUT -d node3 -j DROP
# # Monitor cluster behavior
# # Cleanup: iptables -F
@dataclass
class FailureTest:
    """One planned failure-injection scenario with its acceptance criteria.

    Attributes:
        scenario: Short name of the failure being simulated.
        inject_method: How the failure is introduced.
        expected_behavior: What the cluster should do while degraded.
        recovery_time: Expected time back to a healthy state.
        verify: Command/observation that confirms recovery.
        pass_criteria: Condition that marks the test as passed.
    """

    scenario: str
    inject_method: str
    expected_behavior: str
    recovery_time: str
    verify: str
    pass_criteria: str


tests = [
    FailureTest(
        scenario="Single OSD Failure",
        inject_method="systemctl stop ceph-osd@N",
        expected_behavior="Cluster degrades, starts recovery",
        recovery_time="5-15 min (depends on data)",
        verify="ceph -s shows active+clean",
        pass_criteria="All PGs clean, no data loss",
    ),
    FailureTest(
        scenario="Full Node Failure",
        inject_method="Power off node",
        expected_behavior="Multiple OSDs down, recovery starts",
        recovery_time="15-60 min",
        verify="All PGs active+clean after recovery",
        pass_criteria="No data loss, HEALTH_OK",
    ),
    FailureTest(
        scenario="Network Partition",
        inject_method="iptables block",
        expected_behavior="Affected OSDs marked down",
        recovery_time="After network restore: 5-10 min",
        verify="No split-brain, data consistent",
        pass_criteria="Scrub shows no errors",
    ),
    FailureTest(
        scenario="MON Failure (1 of 3)",
        inject_method="Stop monitor daemon",
        expected_behavior="Quorum maintained with 2/3",
        recovery_time="Immediate (no recovery needed)",
        verify="ceph mon stat shows quorum",
        pass_criteria="Reads/writes unaffected",
    ),
    FailureTest(
        scenario="Disk Full (>85%)",
        inject_method="Fill OSD to near full",
        expected_behavior="OSD marked nearfull, writes may block",
        recovery_time="After cleanup: immediate",
        verify="ceph health shows no nearfull",
        pass_criteria="No data loss",
    ),
    FailureTest(
        scenario="Rolling Upgrade",
        inject_method="Upgrade one node at a time",
        expected_behavior="Brief degradation per node",
        recovery_time="2-5 min per node",
        verify="HEALTH_OK after all nodes upgraded",
        pass_criteria="Version consistent",
    ),
]

print("Failure Test Plan:")
for plan in tests:
    print(f" [{plan.scenario}]")
    print(f" Inject: {plan.inject_method}")
    print(f" Expected: {plan.expected_behavior}")
    print(f" Recovery: {plan.recovery_time}")
    print(f" Verify: {plan.verify}")
    print(f" Pass: {plan.pass_criteria}")
# Final QA sign-off: one entry per test area with its recorded outcome.
qa_summary = {
    "Health Checks": "12/12 passed",
    "Performance Benchmarks": "All within baseline ±10%",
    "OSD Failure Test": "Passed — recovery 8 min",
    "Node Failure Test": "Passed — recovery 25 min",
    "Network Partition": "Passed — no split-brain",
    "Data Integrity (scrub)": "Passed — 0 errors",
    "Cluster Full Test": "Passed — graceful handling",
}

# f-prefix removed: the literal has no placeholders (ruff F541);
# printed output is byte-identical.
print("\n\nQA Summary:")
for area, outcome in qa_summary.items():
    print(f" [{area}]: {outcome}")
เคล็ดลับ
- Baseline: วัด Performance Baseline ก่อนทุก Change
- Scrub: Deep Scrub ตรวจ Data Integrity หลังทุก Failure Test
- Staging: ทดสอบ Failure ใน Staging ก่อน Production เสมอ
- Monitor: ดู ceph -s ตลอดระหว่างทดสอบ
- Runbook: สร้าง Runbook สำหรับทุก Failure Scenario
Ceph Storage คืออะไร
Open Source Distributed Storage Block RBD Object RGW S3 File CephFS CRUSH Replicate Self-healing Cloud OpenStack Kubernetes
ทดสอบ Ceph Cluster อย่างไร
Health Check ceph status OSD MON MGR Performance rados bench fio Failure Test ถอด OSD ปิด Node Recovery Scrub Data Integrity
Performance Benchmark ทำอย่างไร
rados bench Object fio RBD Block rbd bench cosbench S3 RGW Write Read Sequential Random Throughput IOPS Latency Baseline Compare
Failure Testing ทำอย่างไร
OSD Failure stop ceph-osd Node Failure Power off Network Partition iptables Disk Full MON Failure Recovery Time Data Integrity Scrub Runbook
สรุป
Ceph Storage Cluster Testing Strategy QA Health Check Performance Benchmark Failure Injection Recovery OSD MON CRUSH Scrub Data Integrity Production
