SiamCafe · Blog
PagerDuty Incident Identity Access Management —
บทความ

PagerDuty Incident Identity Access Management —

เผยแพร่ 28 พฤษภาคม 2569

PagerDuty Incident IAM

บทความนี้ครอบคลุม PagerDuty ด้าน Incident Management (On-call, Escalation, Event Intelligence) และ Identity Access Management (IAM) ได้แก่ RBAC, SSO, MFA, Authentication, Authorization และ Audit Log

| Feature | PagerDuty | Opsgenie | VictorOps |
|---|---|---|---|
| Integrations | 700+ | 200+ | 100+ |
| Event Intelligence | AI-powered | Basic | Basic |
| Automation | Runbook + API | Runbook | Manual |
| SSO | SAML, OIDC | SAML | SAML |
| Pricing | $21/user/mo | $9/user/mo | $15/user/mo |

PagerDuty Setup และ On-call

=== PagerDuty Configuration ===

PagerDuty API

# Create a high-urgency incident on service PSERVICE1 through the PagerDuty REST API.
# The create-incident endpoint requires a "From" header holding the e-mail address
# of a valid user on the account; without it the request is rejected.
# Replace YOUR_API_KEY and the From address before running.
curl -X POST https://api.pagerduty.com/incidents \
  -H "Authorization: Token token=YOUR_API_KEY" \
  -H "Accept: application/vnd.pagerduty+json;version=2" \
  -H "Content-Type: application/json" \
  -H "From: alice@example.com" \
  -d '{
    "incident": {
      "type": "incident",
      "title": "Database CPU > 90%",
      "service": {"id": "PSERVICE1", "type": "service_reference"},
      "urgency": "high",
      "body": {
        "type": "incident_body",
        "details": "PostgreSQL primary CPU at 95% for 10 min"
      }
    }
  }'

Terraform PagerDuty Configuration

# PagerDuty team for the platform engineers.
resource "pagerduty_team" "platform" {
  name = "Platform Engineering"
}

# PagerDuty account for Alice, created with the "user" role.
resource "pagerduty_user" "alice" {
  email = "alice@example.com"
  name  = "Alice"
  role  = "user"
}

# Primary on-call schedule, evaluated in Bangkok time.
resource "pagerduty_schedule" "primary" {
  name      = "Primary On-call"
  time_zone = "Asia/Bangkok"

  # Single layer that rotates weekly; Alice is currently its only participant.
  layer {
    name                         = "Weekly Rotation"
    users                        = [pagerduty_user.alice.id]
    start                        = "2024-01-01T00:00:00+07:00"
    rotation_virtual_start       = "2024-01-01T00:00:00+07:00"
    rotation_turn_length_seconds = 7 * 24 * 60 * 60 # 1 week (604800 seconds)
  }
}

# Two-step escalation: the primary schedule is paged first, then Alice directly.
resource "pagerduty_escalation_policy" "main" {
  name      = "Main Escalation"
  num_loops = 2 # how many times the rule chain repeats after reaching its end

  # Step 1: whoever is on call in the primary schedule; escalate after 5 minutes.
  rule {
    escalation_delay_in_minutes = 5

    target {
      id   = pagerduty_schedule.primary.id
      type = "schedule_reference"
    }
  }

  # Step 2: Alice as a named user; escalate after 10 minutes.
  rule {
    escalation_delay_in_minutes = 10

    target {
      id   = pagerduty_user.alice.id
      type = "user_reference"
    }
  }
}

from dataclasses import dataclass, field

from typing import List

@dataclass
class OnCallSchedule:
    """One team's on-call arrangement, as printed by the dashboard below."""

    team: str
    primary: str            # first responder for the team
    secondary: str          # backup responder
    rotation: str           # rotation cadence label, e.g. "Weekly"
    escalation_levels: int
    active_incidents: int


# Each row follows the field order of OnCallSchedule:
# (team, primary, secondary, rotation, escalation_levels, active_incidents)
schedules = [
    OnCallSchedule(*row)
    for row in (
        ("Platform", "Alice", "Bob", "Weekly", 3, 2),
        ("Backend", "Charlie", "Diana", "Weekly", 3, 0),
        ("Frontend", "Eve", "Frank", "Bi-weekly", 2, 1),
        ("Database", "Grace", "Henry", "Weekly", 3, 0),
        ("Security", "Ivan", "Julia", "Daily", 3, 3),
    )
]

print("=== On-call Dashboard ===")
for entry in schedules:
    # First line: who is on call; second line: rotation and incident load.
    responders = f" [{entry.team}] Primary: {entry.primary} | Secondary: {entry.secondary}"
    details = (
        f" Rotation: {entry.rotation} | Escalation: {entry.escalation_levels} levels | "
        f"Active: {entry.active_incidents}"
    )
    print(responders)
    print(details)

IAM และ RBAC

=== Identity Access Management ===

SSO Configuration (SAML)

PagerDuty Admin > SSO Settings

Identity Provider: Okta / Azure AD / Google Workspace

SAML SSO URL: https://idp.example.com/saml/sso

Certificate: Upload IdP X.509 Certificate

Attribute Mapping:

email -> user.email

name -> user.displayName

role -> user.role

RBAC Policy (Python)

from enum import Enum

from functools import wraps

class Permission(Enum):
    """Permissions used by the RBAC policy, encoded as "<resource>:<action>" strings."""

    # Incident lifecycle
    INCIDENT_VIEW = "incident:view"
    INCIDENT_CREATE = "incident:create"
    INCIDENT_ACKNOWLEDGE = "incident:acknowledge"
    INCIDENT_RESOLVE = "incident:resolve"

    # Configuration management
    SERVICE_MANAGE = "service:manage"
    SCHEDULE_MANAGE = "schedule:manage"
    USER_MANAGE = "user:manage"

    # Wildcard permission
    ADMIN = "admin:all"

# Role name -> list of permissions granted to that role.
# Role names are lower-case; the "admin" role holds only Permission.ADMIN.
ROLES = dict(
    viewer=[Permission.INCIDENT_VIEW],
    responder=[
        Permission.INCIDENT_VIEW,
        Permission.INCIDENT_ACKNOWLEDGE,
        Permission.INCIDENT_RESOLVE,
    ],
    manager=[
        Permission.INCIDENT_VIEW,
        Permission.INCIDENT_CREATE,
        Permission.INCIDENT_ACKNOWLEDGE,
        Permission.INCIDENT_RESOLVE,
        Permission.SERVICE_MANAGE,
        Permission.SCHEDULE_MANAGE,
    ],
    admin=[Permission.ADMIN],
)

def require_permission(permission):
    """Build a decorator that guards a function behind ``permission``.

    The decorated function must take the acting user as its first positional
    argument. On each call the user's role (``user.role``) is looked up in
    ``ROLES``; an unknown role yields no permissions. The call goes through
    when the role holds ``Permission.ADMIN`` or the required permission.

    Raises:
        PermissionError: when the user's role grants neither.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(user, *args, **kwargs):
            granted = ROLES.get(user.role, [])
            allowed = Permission.ADMIN in granted or permission in granted
            if not allowed:
                raise PermissionError(f"User lacks {permission.value}")
            return func(user, *args, **kwargs)

        return wrapper

    return decorator

@dataclass
class IAMPolicy:
    """Summary row for one role in the IAM policy report printed below."""

    role: str
    users: int               # number of users holding the role
    permissions: List[str]   # human-readable permission labels
    mfa_required: bool
    sso: bool


policies = [
    IAMPolicy(role="Admin", users=2, permissions=["All"],
              mfa_required=True, sso=True),
    IAMPolicy(role="Manager", users=5, permissions=["Incidents", "Services", "Schedules"],
              mfa_required=True, sso=True),
    IAMPolicy(role="Responder", users=15, permissions=["View", "Acknowledge", "Resolve"],
              mfa_required=True, sso=True),
    IAMPolicy(role="Viewer", users=10, permissions=["View Only"],
              mfa_required=False, sso=True),
    IAMPolicy(role="API Service", users=3, permissions=["Create Incidents", "Update Status"],
              mfa_required=False, sso=False),
]

print("\n=== IAM Policies ===")
for policy in policies:
    # Three lines per role: head count, permission list, authentication flags.
    print(f" [{policy.role}] Users: {policy.users}")
    print(f" Permissions: {', '.join(policy.permissions)}")
    print(f" MFA: {policy.mfa_required} | SSO: {policy.sso}")

Automation และ Runbook

=== Incident Automation ===

PagerDuty Event Rules

{

"rule": {

"conditions": {

"operator": "and",

"subconditions": [

{"field": "severity", "operator": "equals", "value": "critical"},

{"field": "source", "operator": "contains", "value": "production"}

]

},

"actions": {

"route": {"value": "PSERVICE1"},

"severity": {"value": "critical"},

"annotate": {"value": "Auto-routed critical production alert"}

}

}

}

Runbook Automation

import requests

def auto_remediate(incident):
    """Run the remediation that matches an incident's title, if any.

    The title is lower-cased and tested against keyword pairs in this order:

    * "cpu" and "high"          -> scale the service by "+2" replicas
    * "disk" and "full"         -> clean old logs of the service
    * "certificate" and "expir" -> renew the service's certificate

    After the action a note is added to the incident. Titles matching none of
    the pairs are left untouched. The function always returns None.

    ``incident`` is indexed with 'title', and with 'service' and 'id' only
    when a pair matches. The helpers ``scale_service``, ``add_note``,
    ``clean_old_logs`` and ``renew_certificate`` are not defined in the
    visible snippet and must be provided by the surrounding module.
    """
    lowered = incident['title'].lower()

    def mentions(*keywords):
        # True only when every keyword occurs somewhere in the lower-cased title.
        return all(word in lowered for word in keywords)

    if mentions('cpu', 'high'):
        scale_service(incident['service'], replicas='+2')
        add_note(incident['id'], "Auto-scaled service +2 replicas")
        return

    if mentions('disk', 'full'):
        clean_old_logs(incident['service'])
        add_note(incident['id'], "Auto-cleaned logs older than 7 days")
        return

    if mentions('certificate', 'expir'):
        renew_certificate(incident['service'])
        add_note(incident['id'], "Auto-renewed SSL certificate")

@dataclass
class IncidentMetric:
    """Incident statistics for one month, as printed by the report below."""

    month: str
    total: int            # all incidents in the month
    p1: int               # incident counts per priority
    p2: int
    p3: int
    mtta_min: float       # printed as MTTA, in minutes
    mttr_min: float       # printed as MTTR, in minutes
    auto_resolved: int    # incidents counted as auto-resolved

    @property
    def auto_resolved_pct(self) -> float:
        """Return auto-resolved incidents as a percentage of the total.

        A month with no incidents returns 0.0 instead of raising
        ZeroDivisionError.
        """
        if not self.total:
            return 0.0
        return (self.auto_resolved / self.total) * 100


metrics = [
    IncidentMetric("Jan 2024", 45, 3, 12, 30, 2.5, 25, 15),
    IncidentMetric("Feb 2024", 38, 2, 10, 26, 2.1, 22, 18),
    IncidentMetric("Mar 2024", 32, 1, 8, 23, 1.8, 18, 20),
]

print("Incident Metrics:")
for m in metrics:
    # The guarded property keeps the report working for a month with zero incidents.
    auto_pct = m.auto_resolved_pct
    print(f"\n [{m.month}] Total: {m.total} (P1:{m.p1} P2:{m.p2} P3:{m.p3})")
    print(f" MTTA: {m.mtta_min}min | MTTR: {m.mttr_min}min | "
          f"Auto: {m.auto_resolved} ({auto_pct:.0f}%)")

เคล็ดลับ

  • Escalation: ตั้ง Escalation อย่างน้อย 3 ระดับ ได้แก่ Primary, Secondary และ Manager
  • MFA: บังคับ MFA ทุก User โดยเฉพาะ Admin และ Manager
  • SSO: ใช้ SSO ลด Password Fatigue เพิ่ม Security
  • Least Privilege: ให้สิทธิ์น้อยที่สุดที่จำเป็น เพิ่มทีหลังได้
  • Runbook: สร้าง Runbook สำหรับ Incident ที่เกิดบ่อย และ Automate ขั้นตอนที่ทำได้

PagerDuty คืออะไร

PagerDuty คือ Incident Management Platform สำหรับจัดการ On-call โดยแจ้งเตือนผ่าน Phone, SMS และ Slack รองรับ Escalation, Event Intelligence และ Runbook Automation พร้อม Integrations มากกว่า 700 รายการ