Ceph Storage Cluster Testing Strategy QA

Ceph Testing

Ceph Storage Cluster Testing Strategy for QA: Health Check, Performance Benchmark, Failure Injection, and Recovery Verification, covering CRUSH, OSD, MON, RBD, RGW, and CephFS in Production.

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง Vector Database Pinecone RBAC ABAC Policy

Storage Type	Interface	Use Case	Test Tool	Key Metric
RBD (Block)	Block Device	VM Disk K8s PV	fio rbd bench	IOPS Latency
RGW (Object)	S3 / Swift API	Backup Media	cosbench s3cmd	Throughput
CephFS (File)	POSIX Mount	Shared Files	fio iozone	IOPS Throughput
RADOS (Raw)	librados	Internal	rados bench	Throughput Latency

Health Check

# === Ceph Health Check Commands ===



# Basic Status

# ceph health

# ceph status

# ceph -s



# Detailed Component Status

# ceph osd tree              # OSD hierarchy and status

# ceph osd df                # OSD disk usage

# ceph mon stat              # Monitor quorum

# ceph mgr stat              # Manager active/standby

# ceph df                    # Cluster storage usage

# ceph pg stat               # Placement Group status

# ceph osd pool ls detail    # Pool configuration



# Health Warnings

# ceph health detail         # Detailed health warnings

# ceph crash ls              # Recent daemon crashes

# ceph log last 50           # Recent cluster logs



# Automated Health Check Script

# #!/bin/bash

# echo "=== Ceph Cluster Health ==="

# HEALTH=$(ceph health -f json | jq -r '.status')

# echo "Status: $HEALTH"

#

# echo "=== OSD Status ==="

# TOTAL_OSD=$(ceph osd stat -f json | jq '.num_osds')

# UP_OSD=$(ceph osd stat -f json | jq '.num_up_osds')

# IN_OSD=$(ceph osd stat -f json | jq '.num_in_osds')

# echo "Total: $TOTAL_OSD | Up: $UP_OSD | In: $IN_OSD"

#

# echo "=== Storage Usage ==="

# ceph df -f json | jq '.stats | {total: .total_bytes, used: .total_used_raw_bytes, avail: .total_avail_bytes}'

#

# echo "=== PG Status ==="

# ceph pg stat

#

# if [ "$HEALTH" != "HEALTH_OK" ]; then

#   echo "WARNING: Cluster not healthy!"

#   ceph health detail

#   exit 1

# fi



from dataclasses import dataclass



@dataclass
class HealthCheck:
    """One entry of the cluster health-check plan.

    Every field is a display string; the report loop below prints them
    verbatim and nothing here executes the command.
    """

    component: str      # label printed in square brackets
    check_command: str  # command text printed next to the label
    expected: str       # printed after "Expected:"
    alert_on: str       # printed after "Alert:"
    frequency: str      # printed after "Freq:"



# Health-check plan: one HealthCheck per monitored aspect of the cluster.
checks = [
    HealthCheck(
        component="Overall Health",
        check_command="ceph health",
        expected="HEALTH_OK",
        alert_on="HEALTH_WARN or HEALTH_ERR",
        frequency="30s",
    ),
    HealthCheck(
        component="OSD Status",
        check_command="ceph osd stat",
        expected="All up and in",
        alert_on="Any OSD down",
        frequency="30s",
    ),
    HealthCheck(
        component="MON Quorum",
        check_command="ceph mon stat",
        expected="Quorum maintained",
        alert_on="Lost quorum",
        frequency="30s",
    ),
    HealthCheck(
        component="PG Status",
        check_command="ceph pg stat",
        expected="All active+clean",
        alert_on="degraded/undersized",
        frequency="1min",
    ),
    HealthCheck(
        component="Disk Usage",
        check_command="ceph osd df",
        expected="< 75% per OSD",
        alert_on="> 80% any OSD",
        frequency="5min",
    ),
    HealthCheck(
        component="Cluster Usage",
        check_command="ceph df",
        expected="< 70% total",
        alert_on="> 80% total",
        frequency="5min",
    ),
    HealthCheck(
        component="Scrub Errors",
        check_command="ceph health detail",
        expected="No scrub errors",
        alert_on="Any inconsistency",
        frequency="1hr",
    ),
]

# Report: a banner, then two lines per check (header + indented details).
print("=== Health Checks ===")
for check in checks:
    print(
        f"  [{check.component}] {check.check_command}",
        f"    Expected: {check.expected} | Alert: {check.alert_on} | Freq: {check.frequency}",
        sep="\n",
    )

Performance Testing

# === Ceph Performance Benchmarks ===



# RADOS Bench — Object Storage

# rados bench -p testpool 60 write --no-cleanup    # 60s write test

# rados bench -p testpool 60 seq                    # Sequential read

# rados bench -p testpool 60 rand                   # Random read

# rados -p testpool cleanup                         # Cleanup test data



# RBD Bench — Block Storage

# rbd create testimage --size 10G --pool rbd

# rbd bench testimage --io-type write --io-size 4K --io-threads 16 --io-total 1G

# rbd bench testimage --io-type read --io-size 4K --io-threads 16 --io-total 1G

# rbd rm testimage --pool rbd



# FIO — Detailed Block Testing

# fio --name=randwrite --ioengine=rbd --pool=rbd --rbdname=testimage \

#   --iodepth=32 --rw=randwrite --bs=4k --size=1G --numjobs=4 \

#   --runtime=60 --time_based --group_reporting



# S3 Bench — Object Gateway

# s3cmd put testfile.bin s3://testbucket/ --multipart-chunk-size-mb=64

# cosbench submit workload.xml



@dataclass
class BenchmarkResult:
    """One recorded benchmark outcome.

    All values, including the numeric-looking ones, are stored as
    pre-formatted display strings and printed verbatim by the report loop.
    """

    test: str         # label printed in square brackets
    tool: str         # printed after "Tool:"
    io_pattern: str   # printed after "Pattern:"
    throughput: str   # printed after "Throughput:"
    iops: str         # printed after "IOPS:"
    latency_avg: str  # printed after "Latency avg:"
    latency_p99: str  # printed after "p99:"



# Recorded benchmark figures, one BenchmarkResult per test run.
results = [
    BenchmarkResult(
        test="RADOS Write",
        tool="rados bench",
        io_pattern="Sequential",
        throughput="1.2 GB/s",
        iops="N/A",
        latency_avg="8ms",
        latency_p99="25ms",
    ),
    BenchmarkResult(
        test="RADOS Read",
        tool="rados bench",
        io_pattern="Sequential",
        throughput="2.1 GB/s",
        iops="N/A",
        latency_avg="4ms",
        latency_p99="12ms",
    ),
    BenchmarkResult(
        test="RBD 4K Write",
        tool="fio",
        io_pattern="Random",
        throughput="120 MB/s",
        iops="30,720",
        latency_avg="0.5ms",
        latency_p99="2.1ms",
    ),
    BenchmarkResult(
        test="RBD 4K Read",
        tool="fio",
        io_pattern="Random",
        throughput="280 MB/s",
        iops="71,680",
        latency_avg="0.2ms",
        latency_p99="0.8ms",
    ),
    BenchmarkResult(
        test="RBD 64K Write",
        tool="fio",
        io_pattern="Sequential",
        throughput="800 MB/s",
        iops="12,800",
        latency_avg="2.5ms",
        latency_p99="8ms",
    ),
    BenchmarkResult(
        test="S3 PUT",
        tool="cosbench",
        io_pattern="Object 1MB",
        throughput="500 MB/s",
        iops="500 ops/s",
        latency_avg="15ms",
        latency_p99="50ms",
    ),
]

# Report: a banner preceded by a blank line, then three lines per result.
print("\n=== Benchmark Results (12 OSD NVMe) ===")
for result in results:
    print(
        f"  [{result.test}] Tool: {result.tool} | Pattern: {result.io_pattern}",
        f"    Throughput: {result.throughput} | IOPS: {result.iops}",
        f"    Latency avg: {result.latency_avg} | p99: {result.latency_p99}",
        sep="\n",
    )

Failure Testing

# === Failure Injection Tests ===



# OSD Failure

# systemctl stop ceph-osd@5

# watch ceph -s                    # Watch recovery

# ceph osd tree | grep osd.5      # Verify marked down

# # Wait for recovery complete

# systemctl start ceph-osd@5

# # Verify HEALTH_OK



# Node Failure (simulate)

# ssh node3 "sudo systemctl stop ceph-osd.target"

# ceph osd tree                    # Check multiple OSDs down

# # Verify data still accessible

# rados -p testpool get testobj /tmp/verify.bin

# md5sum /tmp/verify.bin           # Compare with original



# Network Partition (iptables)

# iptables -A INPUT -s node3 -j DROP

# iptables -A OUTPUT -d node3 -j DROP

# # Monitor cluster behavior

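# # Cleanup: delete only the two rules added above.

# # (A bare "iptables -F" would flush every rule in the filter table, including unrelated ones.)

# iptables -D INPUT -s node3 -j DROP

# iptables -D OUTPUT -d node3 -j DROP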



@dataclass
class FailureTest:
    """One scenario in the failure-injection test plan.

    Every field is descriptive text that the report loop prints verbatim;
    nothing here performs the injection itself.
    """

    scenario: str           # label printed in square brackets
    inject_method: str      # printed after "Inject:"
    expected_behavior: str  # printed after "Expected:"
    recovery_time: str      # printed after "Recovery:"
    verify: str             # printed after "Verify:"
    pass_criteria: str      # printed after "Pass:"



# Failure-injection plan, one FailureTest per scenario.
tests = [
    FailureTest(
        scenario="Single OSD Failure",
        inject_method="systemctl stop ceph-osd@N",
        expected_behavior="Cluster degrades, starts recovery",
        recovery_time="5-15 min (depends on data)",
        verify="ceph -s shows active+clean",
        pass_criteria="All PGs clean, no data loss",
    ),
    FailureTest(
        scenario="Full Node Failure",
        inject_method="Power off node",
        expected_behavior="Multiple OSDs down, recovery starts",
        recovery_time="15-60 min",
        verify="All PGs active+clean after recovery",
        pass_criteria="No data loss, HEALTH_OK",
    ),
    FailureTest(
        scenario="Network Partition",
        inject_method="iptables block",
        expected_behavior="Affected OSDs marked down",
        recovery_time="After network restore: 5-10 min",
        verify="No split-brain, data consistent",
        pass_criteria="Scrub shows no errors",
    ),
    FailureTest(
        scenario="MON Failure (1 of 3)",
        inject_method="Stop monitor daemon",
        expected_behavior="Quorum maintained with 2/3",
        recovery_time="Immediate (no recovery needed)",
        verify="ceph mon stat shows quorum",
        pass_criteria="Reads/writes unaffected",
    ),
    FailureTest(
        scenario="Disk Full (>85%)",
        inject_method="Fill OSD to near full",
        expected_behavior="OSD marked nearfull, writes may block",
        recovery_time="After cleanup: immediate",
        verify="ceph health shows no nearfull",
        pass_criteria="No data loss",
    ),
    FailureTest(
        scenario="Rolling Upgrade",
        inject_method="Upgrade one node at a time",
        expected_behavior="Brief degradation per node",
        recovery_time="2-5 min per node",
        verify="HEALTH_OK after all nodes upgraded",
        pass_criteria="Version consistent",
    ),
]

# Report: a heading, then six lines per scenario.
print("Failure Test Plan:")
for case in tests:
    print(
        f"  [{case.scenario}]",
        f"    Inject: {case.inject_method}",
        f"    Expected: {case.expected_behavior}",
        f"    Recovery: {case.recovery_time}",
        f"    Verify: {case.verify}",
        f"    Pass: {case.pass_criteria}",
        sep="\n",
    )



# Final QA outcome per test area; insertion order is the print order.
qa_summary = {
    "Health Checks": "12/12 passed",
    "Performance Benchmarks": "All within baseline ±10%",
    "OSD Failure Test": "Passed — recovery 8 min",
    "Node Failure Test": "Passed — recovery 25 min",
    "Network Partition": "Passed — no split-brain",
    "Data Integrity (scrub)": "Passed — 0 errors",
    "Cluster Full Test": "Passed — graceful handling",
}

# Two leading newlines separate the summary from the preceding report.
print("\n\nQA Summary:")
for area, outcome in qa_summary.items():
    print(f"  [{area}]: {outcome}")

เคล็ดลับ

Baseline: วัด Performance Baseline ก่อนทุก Change
Scrub: Deep Scrub ตรวจ Data Integrity หลังทุก Failure Test
Staging: ทดสอบ Failure ใน Staging ก่อน Production เสมอ
Monitor: ดู ceph -s ตลอดระหว่างทดสอบ
Runbook: สร้าง Runbook สำหรับทุก Failure Scenario

Ceph Storage คืออะไร

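Ceph is an Open Source Distributed Storage system that provides Block (RBD), Object (RGW, S3-compatible), and File (CephFS) storage. It uses CRUSH to place and Replicate data, is Self-healing, and is widely used in Cloud platforms such as OpenStack and Kubernetes.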

แนะนำเพิ่มเติม — ดูสัญญาณเทรดที่ XM Signal

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ CDK Construct Interview Preparation

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Elasticsearch OpenSearch DevOps Culture