Opsgenie Alert Testing Strategy QA

Opsgenie Alert Testing

Opsgenie เป็นเครื่องมือ Alert Management ของ Atlassian ที่ครอบคลุม Routing, Escalation, On-call Schedule และ Notification บทความนี้อธิบาย Testing Strategy และ QA สำหรับ Alert รวมถึง Chaos Testing, Incident Response และการลด Alert Fatigue

Feature	Description	Benefit
Alert Routing	ส่ง Alert ไปทีมที่ใช่	ตอบสนองเร็ว ไม่ส่งผิดคน
Escalation	ส่งต่อถ้าไม่ตอบ	ไม่มี Alert ตกหล่น
On-call	ตารางเวร 24/7	มีคนรับผิดชอบเสมอ
Deduplication	รวม Alert ซ้ำ	ลด Alert Fatigue
Integration	เชื่อม 200+ Tools	รวมศูนย์ Alerts

Opsgenie Configuration

# === Opsgenie Setup & Configuration ===

# pip install opsgenie-sdk

# import opsgenie_sdk
#
# configuration = opsgenie_sdk.Configuration()
# configuration.api_key['Authorization'] = 'YOUR_API_KEY'
#
# alert_api = opsgenie_sdk.AlertApi(opsgenie_sdk.ApiClient(configuration))
#
# # Create Alert
# body = opsgenie_sdk.CreateAlertPayload(
#     message="High CPU Usage on web-server-01",
#     alias="cpu-high-web01",
#     description="CPU usage exceeded 90% for 5 minutes",
#     responders=[
#         {"type": "team", "name": "platform-team"},
#     ],
#     priority="P2",
#     tags=["infrastructure", "cpu", "production"],
#     details={"server": "web-server-01", "cpu": "92%", "region": "ap-southeast-1"},
# )
# response = alert_api.create_alert(body)
#
# # Acknowledge Alert
# alert_api.acknowledge_alert(
#     identifier="cpu-high-web01",
#     identifier_type="alias",
#     acknowledge_alert_payload=opsgenie_sdk.AcknowledgeAlertPayload(
#         user="oncall@example.com",
#         note="Investigating high CPU, scaling up"
#     )
# )
#
# # Close Alert
# alert_api.close_alert(
#     identifier="cpu-high-web01",
#     identifier_type="alias",
#     close_alert_payload=opsgenie_sdk.CloseAlertPayload(
#         user="oncall@example.com",
#         note="Scaled up to 4 instances, CPU normalized"
#     )
# )

from dataclasses import dataclass, field
from typing import List, Dict, Optional
from enum import Enum

class Priority(Enum):
    """Alert priority levels, from P1 (most urgent) down to P5.

    Each member's value is a human-readable description (in Thai) of the
    severity and of how, and how quickly, responders are notified.
    """

    # Critical: phone call plus SMS, immediately.
    P1 = "Critical — Phone Call + SMS ทันที"
    # High: SMS plus push notification within 5 minutes.
    P2 = "High — SMS + Push ภายใน 5 นาที"
    # Medium: push notification plus email within 30 minutes.
    P3 = "Medium — Push + Email ภายใน 30 นาที"
    # Low: email only.
    P4 = "Low — Email เท่านั้น"
    # Info: logged only, no notification is sent.
    P5 = "Info — Log ไม่แจ้งเตือน"

@dataclass
class AlertRule:
    """One alerting rule: what triggers it, who owns it, and how it notifies."""

    name: str  # display name of the rule, e.g. "CPU > 90%"
    condition: str  # trigger expression as text, e.g. "cpu_percent > 90 for 5m"
    priority: str  # priority label as a plain string, e.g. "P1" or "P2"
    team: str  # name of the team the rule is assigned to
    notification: List[str]  # notification channel names, e.g. ["SMS", "Push"]
    escalation_min: int  # escalation delay in minutes

# Sample alert rules. Keyword arguments spell out which field each value
# belongs to.
rules = [
    AlertRule(
        name="CPU > 90%",
        condition="cpu_percent > 90 for 5m",
        priority="P2",
        team="Platform",
        notification=["SMS", "Push", "Slack"],
        escalation_min=15,
    ),
    AlertRule(
        name="Disk > 95%",
        condition="disk_percent > 95",
        priority="P1",
        team="Platform",
        notification=["Phone", "SMS", "Push", "Slack"],
        escalation_min=5,
    ),
    AlertRule(
        name="API Error Rate > 5%",
        condition="error_rate > 0.05 for 3m",
        priority="P1",
        team="Backend",
        notification=["Phone", "SMS", "Push"],
        escalation_min=5,
    ),
    AlertRule(
        name="Response Time P99 > 2s",
        condition="p99_latency > 2000 for 5m",
        priority="P2",
        team="Backend",
        notification=["SMS", "Push", "Slack"],
        escalation_min=15,
    ),
    AlertRule(
        name="SSL Cert < 7 days",
        condition="ssl_days_remaining < 7",
        priority="P3",
        team="DevOps",
        notification=["Email", "Slack"],
        escalation_min=60,
    ),
]

# Report: a header line, then a blank-line-separated four-line entry per rule.
print("=== Alert Rules ===")
for rule in rules:
    print(
        f"\n  [{rule.priority}] {rule.name}",
        f"    Condition: {rule.condition}",
        f"    Team: {rule.team} | Escalate: {rule.escalation_min}min",
        f"    Notify: {', '.join(rule.notification)}",
        sep="\n",
    )

Testing Strategy

# === Alert Testing Strategy ===

@dataclass
class AlertTest:
    """One test case in the alert test suite, together with its result."""

    name: str  # test case name, e.g. "Routing Test"
    type: str  # category label, e.g. "Functional", "Integration", "Schedule", "Chaos"
    description: str  # what the test does
    expected: str  # the outcome the test expects to observe
    status: str  # "PASS" for a passing test; the report shows any other value as FAIL

# Alert test cases with their recorded results.
tests = [
    AlertTest(
        name="Routing Test",
        type="Functional",
        description="ส่ง Test Alert ตรวจว่าไปถูกทีม",
        expected="Alert ถึง Platform Team ภายใน 30 วินาที",
        status="PASS",
    ),
    AlertTest(
        name="Escalation Test",
        type="Functional",
        description="ไม่ Acknowledge ภายใน 15 นาที",
        expected="Escalate ไป Secondary On-call",
        status="PASS",
    ),
    AlertTest(
        name="Notification Channel",
        type="Integration",
        description="ตรวจทุก Channel: Phone SMS Push Email Slack",
        expected="ทุก Channel ส่งสำเร็จ",
        status="PASS",
    ),
    AlertTest(
        name="Deduplication",
        type="Functional",
        description="ส่ง Alert ซ้ำ 5 ครั้ง",
        expected="เห็นแค่ 1 Alert มี Count 5",
        status="PASS",
    ),
    AlertTest(
        name="Priority Mapping",
        type="Functional",
        description="ส่ง Alert ทุก Priority Level",
        expected="P1 Phone+SMS, P2 SMS+Push, P3 Push+Email",
        status="PASS",
    ),
    AlertTest(
        name="On-call Rotation",
        type="Schedule",
        description="ตรวจว่า On-call หมุนเวียนถูกต้อง",
        expected="สัปดาห์นี้ Alice สัปดาห์หน้า Bob",
        status="PASS",
    ),
    AlertTest(
        name="Override",
        type="Schedule",
        description="Bob ลา Alice Override",
        expected="Alert ไปหา Alice แทน Bob",
        status="PASS",
    ),
    AlertTest(
        name="Chaos Test",
        type="Chaos",
        description="จำลอง Server Down ตรวจ Full Flow",
        expected="Alert -> Ack -> Investigate -> Resolve < 30min",
        status="PASS",
    ),
]

# Summary line first (passed / total), then a three-line entry per test.
print("=== Alert Test Suite ===")
passed = len([t for t in tests if t.status == "PASS"])
print(f"  Results: {passed}/{len(tests)} PASSED\n")
for t in tests:
    # Any status other than exactly "PASS" is reported as a failure.
    if t.status == "PASS":
        icon = "PASS"
    else:
        icon = "FAIL"
    print(
        f"  [{icon}] {t.name} ({t.type})",
        f"    {t.description}",
        f"    Expected: {t.expected}",
        sep="\n",
    )

# QA checklist: maps each category name to the list of items to verify.
# Item texts are user-facing output and are intentionally kept in Thai.
qa_checklist = {
    "Alert Creation": [
        "Alert มี Message ชัดเจน",
        "Priority ตรงกับ Severity",
        "Tags ครบถ้วน สำหรับ Filter",
        "Details มี Context เพียงพอ Debug",
    ],
    "Routing": [
        "Route ไปถูกทีม",
        "ไม่ส่ง Alert ไปคนที่ไม่เกี่ยว",
        # Fixed typo: the original read "เช่า" (rent) where "เช้า" (morning)
        # was meant, i.e. morning/evening time-based routing.
        "Time-based Routing ทำงาน (เช้า/ค่ำ)",
    ],
    "Escalation": [
        "Escalation Policy ตั้งถูกต้อง",
        "Timeout เหมาะสมตาม Priority",
        "มี Final Escalation (Manager)",
    ],
    "Notification": [
        "ทุก Channel ทำงาน",
        "Quiet Hours สำหรับ Low Priority",
        "Do Not Disturb สำหรับ Off-duty",
    ],
}

# Print the checklist grouped by category, in insertion order.
# The header is a plain string: it has no placeholders, so no f-prefix.
print("\n\n=== QA Checklist ===")
for category, items in qa_checklist.items():
    print(f"\n  [{category}]")
    for item in items:
        print(f"    - {item}")

On-call และ Runbook

# === On-call Schedule & Runbook ===

@dataclass
class OnCallSchedule:
    """On-call rotation for one team, with the people currently on duty."""

    team: str  # team name, e.g. "Platform"
    rotation: str  # rotation cadence label, e.g. "Weekly" or "Daily"
    members: List[str]  # names of everyone taking part in the rotation
    current_primary: str  # name of the current primary on-call
    current_secondary: str  # name of the current secondary on-call

# Sample on-call schedules, one per team.
schedules = [
    OnCallSchedule(
        team="Platform",
        rotation="Weekly",
        members=["Alice", "Bob", "Charlie", "Diana"],
        current_primary="Alice",
        current_secondary="Bob",
    ),
    OnCallSchedule(
        team="Backend",
        rotation="Weekly",
        members=["Eve", "Frank", "Grace"],
        current_primary="Eve",
        current_secondary="Frank",
    ),
    OnCallSchedule(
        team="Security",
        rotation="Daily",
        members=["Heidi", "Ivan"],
        current_primary="Heidi",
        current_secondary="Ivan",
    ),
]

# Report: a header line, then a blank-line-separated three-line entry per team.
print("=== On-call Schedules ===")
for s in schedules:
    print(
        f"\n  [{s.team} Team] Rotation: {s.rotation}",
        f"    Members: {', '.join(s.members)}",
        f"    Primary: {s.current_primary} | Secondary: {s.current_secondary}",
        sep="\n",
    )

# Runbook template for a single alert: metadata, ordered manual steps, and
# the automated remediations that accompany it.
runbook = {
    "Alert": "High CPU Usage (> 90%)",
    "Priority": "P2",
    "Team": "Platform",
    "Steps": [
        "1. Login to monitoring dashboard",
        "2. Identify which server has high CPU",
        "3. Check top processes: top -o %CPU",
        # git's date parser does not accept a bare "h" unit, so the original
        # "--since='2h'" did not mean "the last two hours"; spell it out.
        "4. Check recent deployments: git log --since='2 hours ago'",
        "5. If deployment related: rollback",
        "6. If traffic spike: scale up instances",
        "7. If process leak: restart service",
        "8. Verify CPU normalized",
        "9. Update incident timeline",
        "10. Close alert with resolution note",
    ],
    "Automation": [
        "Auto-scale ถ้า CPU > 80% เกิน 10 นาที",
        "Auto-restart ถ้า Process memory > 90%",
        "Auto-rollback ถ้า Error rate > 10% หลัง Deploy",
    ],
}

# Print the runbook: title, metadata line, numbered steps, then automations.
print(f"\n\n=== Runbook: {runbook['Alert']} ===")
print(f"  Priority: {runbook['Priority']} | Team: {runbook['Team']}")
# Section headers have no placeholders, so they are plain strings.
print("\n  Steps:")
for step in runbook['Steps']:
    print(f"    {step}")
print("\n  Automation:")
for auto in runbook['Automation']:
    print(f"    - {auto}")

เคล็ดลับ

Test Regularly: ทดสอบ Alert ทุก Sprint ไม่ใช่แค่ตอน Setup
Runbook: เขียน Runbook สำหรับทุก Alert ลดเวลา Resolution
Dedup: ตั้ง Deduplication ลด Alert Fatigue
Priority: P1 ต้อง Phone Call P5 แค่ Log ไม่แจ้ง
Review: Review Alerts ทุกเดือน ลบที่ไม่จำเป็น ปรับ Threshold

การนำความรู้ไปประยุกต์ใช้งานจริง

แหล่งเรียนรู้ที่แนะนำ ได้แก่ Official Documentation ที่อัพเดทล่าสุดเสมอ Online Course จาก Coursera Udemy edX ช่อง YouTube คุณภาพทั้งไทยและอังกฤษ และ Community อย่าง Discord Reddit Stack Overflow ที่ช่วยแลกเปลี่ยนประสบการณ์กับนักพัฒนาทั่วโลก

เปรียบเทียบข้อดีและข้อเสีย

ข้อดี	ข้อเสีย
ประสิทธิภาพสูง ทำงานได้เร็วและแม่นยำ ลดเวลาทำงานซ้ำซ้อน	ต้องใช้เวลาเรียนรู้เบื้องต้นพอสมควร มี Learning Curve สูง
มี Community ขนาดใหญ่ มีคนช่วยเหลือและแหล่งเรียนรู้มากมาย	บางฟีเจอร์อาจยังไม่เสถียร หรือมีการเปลี่ยนแปลงบ่อยในเวอร์ชันใหม่
รองรับ Integration กับเครื่องมือและบริการอื่นได้หลากหลาย	ต้นทุนอาจสูงสำหรับ Enterprise License หรือ Cloud Service
เป็น Open Source หรือมีเวอร์ชันฟรีให้เริ่มต้นใช้งาน	ต้องการ Hardware หรือ Infrastructure ที่เพียงพอ

จากตารางเปรียบเทียบจะเห็นว่าข้อดีมีมากกว่าข้อเสียอย่างชัดเจน โดยเฉพาะในแง่ของประสิทธิภาพและความสามารถในการ Scale สำหรับข้อเสียส่วนใหญ่สามารถแก้ไขได้ด้วยการเรียนรู้อย่างเป็นระบบและวางแผนทรัพยากรให้เหมาะสม

Opsgenie คืออะไร

Opsgenie เป็นระบบ Alert Management ของ Atlassian รองรับ Routing, Escalation, On-call Schedule และ Notification ผ่าน Phone, SMS, Email และ Slack มี Integration มากกว่า 200 รายการ และใช้สนับสนุนงาน Incident Response

Alert Testing สำคัญอย่างไร

Alert Testing ช่วยตรวจว่า Alerts ทำงานถูกต้องและส่งถึงคนที่ใช่ ป้องกันทั้ง False Negative และ Alert Fatigue โดยควรทดสอบ Routing และ Escalation รวมถึงทำ Chaos Testing อย่างสม่ำเสมอ (Regular)

On-call Schedule คืออะไร

On-call Schedule คือตารางเวร 24/7 ที่หมุนเวียนแบบ Weekly หรือ Daily มี Primary และ Secondary สำหรับ Escalation และรองรับ Override ในวันหยุดหรือวันลา เครื่องมือที่ใช้จัดการได้ เช่น Opsgenie และ PagerDuty

ลด Alert Fatigue อย่างไร

กำหนด Priority ให้ชัดเจน Group และ Deduplicate Alerts ปรับ Threshold เพื่อลด False Positive Review Alerts ทุกเดือน และ Automate การแก้ปัญหาด้วย Runbook

สรุป

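Opsgenie ช่วยจัดการ Alert Management ตั้งแต่ Routing, Escalation, On-call Schedule ไปจนถึง Notification และ Incident Response เมื่อใช้ร่วมกับ Testing Strategy และ QA ที่ดี (รวมถึง Chaos Testing) พร้อมทั้งตั้ง Deduplication, Priority และ Runbook Automation ให้เหมาะสม จะช่วยลด Alert Fatigue ได้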

Opsgenie Alert Testing Strategy QA