OpsGenie Alert ?????????????????????
OpsGenie ???????????? incident management platform ????????? Atlassian ??????????????????????????????????????? alerts ????????????????????? monitoring ??????????????? route alerts ???????????????????????????????????????????????????????????? ?????????????????? on-call schedules ????????? escalation policies ????????????????????????????????? backup recovery strategy ???????????????????????? operations ?????????????????????????????? incidents ?????????????????????????????????????????????????????????????????????
OpsGenie ??????????????????????????? central hub ????????? alerts ????????? monitoring tools (Prometheus, Datadog, CloudWatch, Nagios) ???????????? deduplicate, prioritize, route ??????????????????????????????????????????????????? ????????????????????????????????????????????? (SMS, email, phone call, Slack, Microsoft Teams) ?????? auto-escalation ??????????????????????????????????????? alert
?????????????????? Backup Recovery Strategy OpsGenie ????????????????????? ????????????????????????????????????????????????????????? backup ?????????????????????, trigger automated recovery procedures, track recovery progress, escalate ????????? recovery ???????????????????????????, ????????????????????????????????? backup/recovery ?????????????????? SLA compliance
??????????????????????????????????????????????????? OpsGenie
Setup OpsGenie ?????????????????? backup monitoring
# === OpsGenie Setup for Backup Recovery ===

# 1. Install the OpsGenie CLI (Lamp).
#    FIX: -f makes curl fail on HTTP errors so a 404 page is never saved as the "binary".
curl -fsL https://github.com/opsgenie/opsgenie-lamp/releases/latest/download/lamp-linux-amd64 -o /usr/local/bin/lamp
chmod +x /usr/local/bin/lamp

# 2. Configure the API key.
#    FIX: the config directory may not exist on a fresh host; create it before writing.
mkdir -p /etc/opsgenie
cat > /etc/opsgenie/lamp.conf << 'EOF'
apiKey=your-opsgenie-api-key-here
baseUrl=https://api.opsgenie.com
EOF
# The file holds a secret; restrict it to root.
chmod 600 /etc/opsgenie/lamp.conf

# 3. Create a test alert via the CLI.
lamp createAlert \
    --message "Backup Failed: production-db" \
    --priority P1 \
    --description "Daily backup of production database failed at 02:15 AM" \
    --tags "backup,database,production" \
    --entity "production-db-backup" \
    --alias "backup-fail-production-db"

# 4. Integration setup (Terraform).
cat > opsgenie.tf << 'EOF'
terraform {
  required_providers {
    opsgenie = {
      source  = "opsgenie/opsgenie"
      version = "~> 0.6"
    }
  }
}

provider "opsgenie" {
  api_key = var.opsgenie_api_key
}

# Team
resource "opsgenie_team" "backup_team" {
  name        = "Backup Operations"
  description = "Team responsible for backup and recovery"
  member {
    id   = opsgenie_user.ops_lead.id
    role = "admin"
  }
  member {
    id   = opsgenie_user.ops_eng1.id
    role = "user"
  }
}

# Escalation Policy
resource "opsgenie_escalation" "backup_escalation" {
  name          = "Backup Failure Escalation"
  owner_team_id = opsgenie_team.backup_team.id
  rules {
    condition   = "if-not-acked"
    notify_type = "default"
    delay       = 5 # minutes
    recipient {
      type = "schedule"
      id   = opsgenie_schedule.on_call.id
    }
  }
  rules {
    condition   = "if-not-acked"
    notify_type = "default"
    delay       = 15
    recipient {
      type = "user"
      id   = opsgenie_user.ops_lead.id
    }
  }
}

# API Integration for monitoring
resource "opsgenie_api_integration" "backup_monitor" {
  name          = "Backup Monitor"
  type          = "API"
  owner_team_id = opsgenie_team.backup_team.id
  responders {
    type = "team"
    id   = opsgenie_team.backup_team.id
  }
}
EOF

# 5. On-call schedule definition (valid YAML nesting restored).
cat > schedule.yaml << 'EOF'
name: "Backup On-Call"
timezone: "Asia/Bangkok"
rotations:
  - name: "Weekly Rotation"
    type: "weekly"
    participants:
      - type: "user"
        username: "ops-eng-1@company.com"
      - type: "user"
        username: "ops-eng-2@company.com"
      - type: "user"
        username: "ops-eng-3@company.com"
    startDate: "2024-01-01T00:00:00+07:00"
    timeRestriction:
      type: "weekday-and-time-of-day"
      restrictions:
        - startHour: 0
          endHour: 24
EOF

echo "OpsGenie setup complete"
Backup Strategy ????????? Alert Management
?????????????????? backup strategy ??????????????? alerting
#!/usr/bin/env python3
# backup_strategy.py ??? Backup Strategy with OpsGenie Alerts
import json
import logging
import urllib.request
from typing import Dict, List
from datetime import datetime
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("backup")
class BackupAlertManager:
    """Creates and closes OpsGenie alerts for backup job results.

    Uses the OpsGenie Alert API v2 via urllib only (no third-party HTTP
    client). Every network failure is caught and reported as an
    ``{"error": ...}`` dict instead of raising, so a monitoring pass never
    crashes the backup pipeline.
    """

    # Class-level logger so the class does not depend on a module global.
    _log = logging.getLogger("backup")

    def __init__(self, api_key, base_url="https://api.opsgenie.com"):
        self.api_key = api_key    # OpsGenie GenieKey
        self.base_url = base_url  # overridable for tests / regional API hosts

    def _headers(self):
        """Common headers for every Alert API request."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"GenieKey {self.api_key}",
        }

    def create_alert(self, message, priority="P2", description="", tags=None, alias=None):
        """Create an OpsGenie alert.

        Args:
            message: short alert title shown in OpsGenie.
            priority: "P1".."P5" (default "P2").
            description: longer free-text body.
            tags: optional list of tag strings.
            alias: de-duplication key; auto-generated from the current
                Unix time when omitted.

        Returns:
            Parsed JSON response dict, or ``{"error": <str>}`` on failure.
        """
        payload = {
            "message": message,
            "priority": priority,
            "description": description,
            "tags": tags or [],
            # FIX: the original used datetime.utcnow().timestamp(), which
            # interprets the naive UTC datetime as *local* time and yields a
            # wrong epoch (and utcnow() is deprecated). now().timestamp()
            # returns the correct Unix time on any host timezone.
            "alias": alias or f"alert-{int(datetime.now().timestamp())}",
            "source": "backup-monitor",
        }
        try:
            data = json.dumps(payload).encode()
            req = urllib.request.Request(
                f"{self.base_url}/v2/alerts",
                data=data,
                headers=self._headers(),
            )
            # FIX: close the HTTP response; the original leaked the socket.
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read())
        except Exception as e:
            self._log.error(f"Alert creation failed: {e}")
            return {"error": str(e)}

    def close_alert(self, alias):
        """Close an alert (looked up by alias) when the issue is resolved.

        Returns the parsed API response, or ``{"error": <str>}`` on failure.
        """
        try:
            req = urllib.request.Request(
                f"{self.base_url}/v2/alerts/{alias}/close?identifierType=alias",
                data=json.dumps({"note": "Backup recovered successfully"}).encode(),
                headers=self._headers(),
                method="POST",
            )
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read())
        except Exception as e:
            self._log.error(f"Alert close failed: {e}")
            return {"error": str(e)}

    def backup_check(self, backup_results):
        """Scan backup job results and create alerts for failures/warnings.

        Args:
            backup_results: list of dicts with at least "name" and "status".
                Entries with status "failed" also use "type", "database",
                "timestamp" and optionally "error"; "warning" entries use
                "duration" and "threshold". "success" entries are skipped.

        Returns:
            ``{"alerts_created": <count>, "details": [<api responses>]}``.
        """
        alerts_created = []
        for backup in backup_results:
            if backup["status"] == "failed":
                # Production backup failures page at the highest priority.
                alert = self.create_alert(
                    message=f"Backup Failed: {backup['name']}",
                    priority="P1" if backup["type"] == "production" else "P2",
                    description=f"Backup {backup['name']} failed at {backup['timestamp']}. Error: {backup.get('error', 'Unknown')}",
                    tags=["backup", backup["type"], backup["database"]],
                    alias=f"backup-fail-{backup['name']}",
                )
                alerts_created.append(alert)
            elif backup["status"] == "warning":
                alert = self.create_alert(
                    message=f"Backup Warning: {backup['name']} took too long",
                    priority="P3",
                    description=f"Duration: {backup['duration']}min (threshold: {backup['threshold']}min)",
                    tags=["backup", "slow"],
                    alias=f"backup-slow-{backup['name']}",
                )
                alerts_created.append(alert)
        return {"alerts_created": len(alerts_created), "details": alerts_created}
# Example usage
# NOTE(review): "your-api-key" is a placeholder GenieKey; running this as-is
# performs live HTTP requests against api.opsgenie.com.
manager = BackupAlertManager("your-api-key")
# Simulated results from three backup jobs: one success, one failure, one slow run.
backup_results = [
    {"name": "prod-db-daily", "status": "success", "type": "production", "database": "postgresql", "duration": 15},
    {"name": "prod-db-incremental", "status": "failed", "type": "production", "database": "postgresql", "error": "Disk full", "timestamp": "2024-06-15 02:15:00"},
    {"name": "staging-db", "status": "warning", "type": "staging", "database": "mysql", "duration": 45, "threshold": 30},
]
# Only the "failed" and "warning" entries produce alerts (expected count: 2).
result = manager.backup_check(backup_results)
print(f"Alerts created: {result['alerts_created']}")
Recovery Automation ???????????? Python
Automated recovery procedures
#!/usr/bin/env python3
# recovery_automation.py ??? Automated Recovery with OpsGenie
import json
import logging
import time
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("recovery")
class RecoveryAutomation:
    """Registry and (simulated) executor for named recovery playbooks."""

    def __init__(self):
        # Maps playbook name -> {"steps": [...], "created": unix timestamp}.
        self.playbooks = {}

    def register_playbook(self, name, steps):
        """Store *steps* under *name*; a later registration overwrites."""
        self.playbooks[name] = {"steps": steps, "created": time.time()}

    def execute_recovery(self, playbook_name, context):
        """Run every step of a registered playbook and summarize the outcome.

        Execution is simulated: each step is marked "success" and charged its
        ``estimated_duration`` (default 10 s); steps flagged ``verify`` gain a
        "passed" verification entry. *context* is accepted for interface
        compatibility but not consulted by the simulation.
        """
        playbook = self.playbooks.get(playbook_name)
        if playbook is None:
            return {"error": f"Playbook {playbook_name} not found"}

        steps = playbook["steps"]
        total = len(steps)
        log = logging.getLogger("recovery")
        outcomes = []
        for idx, step in enumerate(steps, start=1):
            log.info(f"Step {idx}/{total}: {step['name']}")
            # Simulated execution: every step "succeeds" instantly.
            outcome = {
                "step": idx,
                "name": step["name"],
                "status": "success",
                "duration_sec": step.get("estimated_duration", 10),
            }
            outcomes.append(outcome)
            if step.get("verify"):
                outcome["verification"] = "passed"

        ok = all(o["status"] == "success" for o in outcomes)
        return {
            "playbook": playbook_name,
            "status": "completed" if ok else "failed",
            "steps_completed": len(outcomes),
            "total_duration": sum(o["duration_sec"] for o in outcomes),
            "results": outcomes,
        }

    def setup_playbooks(self):
        """Register the two stock playbooks used by this demo."""
        self.register_playbook("database_backup_recovery", [
            {"name": "Check disk space", "command": "df -h /backup", "estimated_duration": 5},
            {"name": "Clean old backups", "command": "find /backup -mtime +7 -delete", "estimated_duration": 30},
            {"name": "Verify database connectivity", "command": "pg_isready -h localhost", "estimated_duration": 5},
            {"name": "Retry backup", "command": "pg_dump -Fc production > /backup/prod.dump", "estimated_duration": 300, "verify": True},
            {"name": "Verify backup integrity", "command": "pg_restore --list /backup/prod.dump", "estimated_duration": 30, "verify": True},
            {"name": "Upload to S3", "command": "aws s3 cp /backup/prod.dump s3://backups/", "estimated_duration": 120},
            {"name": "Close OpsGenie alert", "command": "lamp closeAlert --alias backup-fail-prod-db", "estimated_duration": 5},
        ])
        self.register_playbook("file_backup_recovery", [
            {"name": "Check source filesystem", "command": "ls -la /data/", "estimated_duration": 5},
            {"name": "Check backup destination", "command": "df -h /backup/files", "estimated_duration": 5},
            {"name": "Retry rsync", "command": "rsync -avz /data/ /backup/files/", "estimated_duration": 600, "verify": True},
            {"name": "Verify file count", "command": "diff <(ls /data/) <(ls /backup/files/)", "estimated_duration": 30, "verify": True},
            {"name": "Close alert", "command": "lamp closeAlert --alias backup-fail-files", "estimated_duration": 5},
        ])
# Demo run: register the stock playbooks, then execute the database one.
recovery = RecoveryAutomation()
recovery.setup_playbooks()
# "context" is passed through but not used by the simulated executor.
result = recovery.execute_recovery("database_backup_recovery", {"server": "prod-db-01"})
print(f"Recovery: {result['status']}")
print(f"Steps: {result['steps_completed']}, Duration: {result['total_duration']}s")
# One line per executed playbook step.
for step in result["results"]:
    print(f" {step['step']}. {step['name']}: {step['status']}")
Escalation Policies ????????? On-Call
?????????????????? escalation ????????? on-call schedules
# === Escalation and On-Call Configuration ===
# FIX: the YAML below had lost all nesting (flat YAML is invalid);
# indentation restored so yaml.safe_load can parse it.
cat > escalation_config.yaml << 'EOF'
escalation_policies:
  backup_failure:
    name: "Backup Failure Escalation"
    rules:
      - level: 1
        delay_minutes: 0
        notify:
          - type: "on-call"
            schedule: "backup-on-call"
        channels: ["push", "sms"]
      - level: 2
        delay_minutes: 10
        condition: "if-not-acknowledged"
        notify:
          - type: "on-call"
            schedule: "backup-on-call"
          - type: "user"
            username: "team-lead@company.com"
        channels: ["push", "sms", "voice"]
      - level: 3
        delay_minutes: 30
        condition: "if-not-acknowledged"
        notify:
          - type: "user"
            username: "ops-manager@company.com"
          - type: "user"
            username: "cto@company.com"
        channels: ["voice", "sms"]
  data_loss:
    name: "Data Loss Emergency"
    rules:
      - level: 1
        delay_minutes: 0
        notify:
          - type: "team"
            team: "backup-team"
        channels: ["push", "sms", "voice"]
      - level: 2
        delay_minutes: 5
        notify:
          - type: "team"
            team: "engineering-leads"
        channels: ["voice"]

on_call_schedules:
  backup_on_call:
    name: "Backup On-Call"
    timezone: "Asia/Bangkok"
    rotation:
      type: "weekly"
      start_day: "monday"
      start_time: "09:00"
      participants:
        - "ops-eng-1@company.com"
        - "ops-eng-2@company.com"
        - "ops-eng-3@company.com"
    overrides:
      holidays:
        - date: "2024-04-13"
          user: "ops-eng-1@company.com"
          note: "Songkran holiday override"

notification_rules:
  weekday_business_hours:
    time: "09:00-18:00"
    channels: ["push", "email"]
  weekday_after_hours:
    time: "18:00-09:00"
    channels: ["push", "sms"]
    delay: 3
  weekend:
    channels: ["push", "sms", "voice"]
    delay: 0
    note: "Immediate for P1/P2"
EOF

# Summarize the escalation policies. Requires PyYAML (pip install pyyaml).
# FIX: replaced the fragile `python3 -c "..."` nested-quote form with a
# heredoc, and the mojibake arrow ("???") in the output with ASCII "->".
python3 << 'PYEOF'
import yaml

with open('escalation_config.yaml') as f:
    config = yaml.safe_load(f)

policies = config['escalation_policies']
print('Escalation Policies:')
for name, policy in policies.items():
    print(f'  {policy["name"]}: {len(policy["rules"])} levels')
    for rule in policy['rules']:
        print(f'    L{rule["level"]}: +{rule["delay_minutes"]}min -> {rule["channels"]}')
PYEOF

echo "Escalation configured"
Monitoring ????????? Reporting
Dashboard ??????????????????????????? backup
#!/usr/bin/env python3
# backup_dashboard.py ??? Backup Monitoring Dashboard
import json
import logging
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("dashboard")
class BackupDashboard:
    """Read-only snapshot provider for backup/recovery monitoring views.

    All figures are static demo data; in production they would be fed from
    collected metrics (the ``metrics`` dict is reserved for that).
    """

    # SLA figures shared by sla_report(); kept as named constants so the
    # compliance status is derived from data rather than re-typed literals.
    _BACKUP_SUCCESS_TARGET_PCT = 99.5
    _BACKUP_SUCCESS_ACTUAL_PCT = 99.2

    def __init__(self):
        self.metrics = {}  # placeholder for future live metrics

    def dashboard_data(self):
        """Return the dashboard payload: job status, RTO/RPO, storage, alert stats."""
        return {
            "backup_status": {
                "total_jobs": 25,
                "successful": 23,
                "failed": 1,
                "warning": 1,
                "success_rate": 92.0,
                "last_failure": "prod-db-incremental (02:15 AM)",
            },
            "recovery_metrics": {
                "rto_target": "4 hours",
                "rto_actual": "45 minutes",
                "rpo_target": "1 hour",
                "rpo_actual": "15 minutes",
                "last_recovery_test": "2024-06-01",
                "test_result": "passed",
            },
            "storage": {
                "local_used_gb": 850,
                "local_total_gb": 2000,
                "s3_used_gb": 5200,
                "monthly_cost_usd": 120,
                "retention_days": 30,
            },
            "alert_stats_30d": {
                "total_alerts": 8,
                "acknowledged_avg_min": 3.5,
                "resolved_avg_min": 25,
                "escalated": 1,
                "false_positives": 2,
            },
        }

    def sla_report(self):
        """Return the SLA compliance summary plus operator recommendations.

        Recommendation strings are kept verbatim from the source document
        (original Thai text, damaged by encoding loss upstream).
        """
        target = self._BACKUP_SUCCESS_TARGET_PCT
        actual = self._BACKUP_SUCCESS_ACTUAL_PCT
        return {
            "backup_sla": {
                "target": "99.5% backup success rate",
                "actual": "99.2%",
                # FIX: derive the status from the numbers instead of the
                # original hard-coded `99.2 >= 99.5` literal comparison,
                # which silently goes stale when the figures change.
                "status": "met" if actual >= target else "missed",
                "incidents": 3,
            },
            "recovery_sla": {
                "rto_target": "4 hours",
                "rto_met": True,
                "rpo_target": "1 hour",
                "rpo_met": True,
            },
            "recommendations": [
                "??????????? disk space ?????????????????? backup server (????????? 42.5%)",
                "??????????? recovery drill ???????????????????????? (?????????????????? 2 ?????????????????????????????????)",
                "?????? false positive alerts (2 ????????????????????? 30 ?????????)",
                "??????????????? cross-region backup ?????????????????? disaster recovery",
            ],
        }
# Demo run: print the key dashboard figures followed by the SLA summary.
dashboard = BackupDashboard()
data = dashboard.dashboard_data()
print(f"Backup Jobs: {data['backup_status']['successful']}/{data['backup_status']['total_jobs']} success")
print(f"Success Rate: {data['backup_status']['success_rate']}%")
print(f"RTO: {data['recovery_metrics']['rto_actual']} (target: {data['recovery_metrics']['rto_target']})")
print(f"RPO: {data['recovery_metrics']['rpo_actual']} (target: {data['recovery_metrics']['rpo_target']})")
print(f"Alert MTTA: {data['alert_stats_30d']['acknowledged_avg_min']} min")
sla = dashboard.sla_report()
print(f"\nSLA: {sla['backup_sla']['actual']} (target: {sla['backup_sla']['target']})")
print("Recommendations:")
# One bullet per recommendation (strings carry damaged Thai text from the source).
for rec in sla["recommendations"]:
    print(f" - {rec}")
FAQ ??????????????????????????????????????????
Q: OpsGenie ????????? PagerDuty ???????????????????????????????????????????
A: OpsGenie ????????????????????? Atlassian integrate ????????? Jira, Confluence, Bitbucket ???????????????????????? ??????????????????????????? $9/user/??????????????? ?????? free tier ?????????????????? 5 users PagerDuty ???????????? standalone incident management ???????????????????????? ecosystem ??????????????????????????? ?????? AIOps features ??????????????????????????? $21/user/??????????????? ??????????????? OpsGenie ?????????????????? Atlassian ecosystem (Jira, Confluence), ?????????????????????, ????????????????????? free tier ??????????????? PagerDuty ?????????????????????????????? AIOps, enterprise features, ??????????????????????????? vendor ?????? ??????????????????????????? core functions (alerting, on-call, escalation) ???????????????????????????????????????????????????
Q: RTO ????????? RPO ????????????????????? ????????????????????????????????????????????????????
A: RTO (Recovery Time Objective) ?????????????????????????????????????????????????????????????????????????????????????????????????????? ???????????? 4 ????????????????????? ????????????????????????????????????????????????????????????????????????????????????????????? 4 ????????????????????????????????? incident RPO (Recovery Point Objective) ???????????????????????????????????????????????????????????????????????????????????? ???????????? 1 ????????????????????? ????????????????????????????????????????????????????????? 1 ??????????????????????????????????????? ????????????????????????????????? Production database RTO 1-4 ????????????????????? RPO 15 ????????????-1 ?????????????????????, Web application RTO 30 ????????????-2 ????????????????????? RPO 1-4 ?????????????????????, File storage RTO 4-24 ????????????????????? RPO 24 ????????????????????? ?????????????????????????????? ??????????????????????????????????????????????????? ???????????? balance ????????????????????? cost ????????? business requirements
Q: ??????????????? backup recovery ????????????????????????????
A: Recovery testing ?????????????????????????????????????????? ???????????????????????? ??????????????? restore ??????????????? staging server ????????????????????? data integrity, ??????????????????????????? Full disaster recovery drill ?????????????????????????????????????????????????????? ????????????????????? recovery ?????????????????????????????? RTO/RPO ???????????????????????????????????????, ?????????????????????????????????????????????????????? infrastructure ???????????????????????????????????????????????? architecture, database version, cloud provider ????????????????????? Restore backup ??????????????? test environment, ????????????????????? data completeness, ??????????????? application functionality, ????????????????????? recovery, ?????????????????????????????????????????????????????????
Q: Backup 3-2-1 Rule ??????????????????????
A: ?????? 3-2-1 ???????????? best practice ?????????????????? backup 3 copies ????????????????????????????????????????????????????????? 3 ????????? (original + 2 backups), 2 media types ?????????????????? storage ??????????????????????????? 2 ?????????????????? (???????????? local disk + cloud), 1 offsite ??????????????????????????????????????? 1 ??????????????????????????????????????? (different region/datacenter) ???????????????????????? Original data ?????? production server, Backup 1 ?????? local backup server (same datacenter), Backup 2 ?????? cloud storage (AWS S3 different region) ??????????????????????????? ???????????????????????????????????? 3-2-1-1-0 ??????????????? 1 immutable backup (????????????????????? ransomware) ????????? 0 errors (verify ????????????????????????)
