PagerDuty IaC
PagerDuty Incident Infrastructure as Code Terraform Provider API Automation On-call Escalation Schedule Service Event Intelligence
| Resource | Terraform Type | API Endpoint | Purpose |
|---|---|---|---|
| User | pagerduty_user | GET/POST /users | สร้าง/จัดการ User |
| Team | pagerduty_team | GET/POST /teams | จัดกลุ่ม User |
| Service | pagerduty_service | GET/POST /services | Service ที่ Monitor |
| Escalation Policy | pagerduty_escalation_policy | GET/POST /escalation_policies | กำหนด Escalation |
| Schedule | pagerduty_schedule | GET/POST /schedules | On-call Rotation |
| Integration | pagerduty_service_integration | GET/POST /services/{id}/integrations | รับ Event จาก Tool |
Terraform Configuration
# === PagerDuty Terraform Provider ===
# terraform {
# required_providers {
# pagerduty = {
# source = "PagerDuty/pagerduty"
# version = "~> 3.0"
# }
# }
# }
#
# provider "pagerduty" {
# token = var.pagerduty_token
# }
#
# # Team
# resource "pagerduty_team" "platform" {
# name = "Platform Engineering"
# description = "Platform team responsible for infrastructure"
# }
#
# # User
# resource "pagerduty_user" "engineer1" {
# name = "John Doe"
# email = "john@example.com"
# role = "user"
# }
#
# resource "pagerduty_team_membership" "eng1_platform" {
# user_id = pagerduty_user.engineer1.id
# team_id = pagerduty_team.platform.id
# role = "responder"
# }
#
# # Schedule (Weekly Rotation)
# resource "pagerduty_schedule" "platform_oncall" {
# name = "Platform On-call"
# time_zone = "Asia/Bangkok"
# layer {
# name = "Primary"
# start = "2024-01-01T00:00:00+07:00"
# rotation_virtual_start = "2024-01-01T00:00:00+07:00"
# rotation_turn_length_seconds = 604800 # 7 days
# users = [pagerduty_user.engineer1.id]
# }
# }
#
# # Escalation Policy
# resource "pagerduty_escalation_policy" "platform" {
# name = "Platform Escalation"
# num_loops = 2
# rule {
# escalation_delay_in_minutes = 5
# target {
# type = "schedule_reference"
# id = pagerduty_schedule.platform_oncall.id
# }
# }
# }
#
# # Service
# resource "pagerduty_service" "api_gateway" {
# name = "API Gateway"
# escalation_policy = pagerduty_escalation_policy.platform.id
# alert_creation = "create_alerts_and_incidents"
# auto_resolve_timeout = 14400 # 4 hours
# acknowledgement_timeout = 600 # 10 minutes
# }
#
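# # Prometheus Integration
# # The vendor data source must be declared before it can be referenced below.
# data "pagerduty_vendor" "prometheus" {
# name = "Prometheus"
# }
#
# resource "pagerduty_service_integration" "prometheus" {
# name = "Prometheus"
# service = pagerduty_service.api_gateway.id
# vendor = data.pagerduty_vendor.prometheus.id
# }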
from dataclasses import dataclass
@dataclass
class TerraformResource:
    """One row of the PagerDuty Terraform resource cheat sheet.

    All fields are free-form display strings; nothing here is parsed or
    validated, the values are only printed.
    """

    resource: str    # Terraform resource type, e.g. "pagerduty_team"
    key_args: str    # space-separated summary of the main arguments
    depends_on: str  # resource type this one needs first ("None" as text when there is none)
    tip: str         # short usage hint shown next to the resource
# Cheat-sheet rows, listed so that every entry's depends_on names a resource
# type that appears earlier in the list (team/user first, integration last).
resources = [
    TerraformResource(
        "pagerduty_team",
        "name description",
        "None",
        "สร้าง Team ก่อน แล้ว Assign User",
    ),
    TerraformResource(
        "pagerduty_user",
        "name email role",
        "None",
        "ใช้ email เป็น Unique Identifier",
    ),
    TerraformResource(
        "pagerduty_schedule",
        "name time_zone layer(users rotation)",
        "pagerduty_user",
        "rotation_turn_length_seconds 604800 = 1 สัปดาห์",
    ),
    TerraformResource(
        "pagerduty_escalation_policy",
        "name num_loops rule(delay target)",
        "pagerduty_schedule",
        "num_loops 2-3 ก่อน Unacknowledged",
    ),
    TerraformResource(
        "pagerduty_service",
        "name escalation_policy auto_resolve",
        "pagerduty_escalation_policy",
        "auto_resolve_timeout 14400 (4 ชม.) ป้องกัน Stale",
    ),
    TerraformResource(
        "pagerduty_service_integration",
        "name service vendor",
        "pagerduty_service",
        "Integration Key ใช้ตั้ง Monitoring Tool",
    ),
]

# Print one four-line entry per resource.
print("=== Terraform Resources ===")
for r in resources:
    print(f" [{r.resource}]")
    print(f" Args: {r.key_args}")
    print(f" Depends: {r.depends_on}")
    print(f" Tip: {r.tip}")
API & Python Automation
# === PagerDuty API Automation ===
# pip install pdpyras requests
# import pdpyras
#
# # REST API Client (default_from sets the From header needed to create incidents)
# session = pdpyras.APISession(api_key="YOUR_API_KEY", default_from="oncall@example.com")
#
# # List On-call
# oncalls = session.list_all("oncalls")
# for oc in oncalls:
# print(f" {oc['user']['summary']} → {oc['schedule']['summary']}")
#
# # Create Incident
# incident = session.rpost(
# "/incidents",
# json={"incident": {
# "type": "incident",
# "title": "API Gateway High Latency",
# "service": {"id": "PXXXXXX", "type": "service_reference"},
# "urgency": "high",
# "body": {"type": "incident_body", "details": "P99 > 500ms for 5 min"},
# }}
# )
#
# # Events API v2 (Trigger)
# import requests
# requests.post("https://events.pagerduty.com/v2/enqueue", json={
# "routing_key": "INTEGRATION_KEY",
# "event_action": "trigger",
# "payload": {
# "summary": "Disk usage > 90% on web-01",
# "severity": "critical",
# "source": "web-01",
# "component": "disk",
# },
# "dedup_key": "disk-web-01-sda1",
# })
@dataclass
class APIEndpoint:
    """One row of the PagerDuty API endpoint reference table.

    All fields are plain display strings that are only printed.
    """

    method: str      # HTTP verb, e.g. "GET" or "POST"
    endpoint: str    # request path (or host + path for the Events API row)
    purpose: str     # what the call is used for
    rate_limit: str  # human-readable rate-limit description
# Reference rows for the REST API plus the Events API v2 enqueue endpoint.
# NOTE(review): the rate-limit values are hard-coded descriptive text, not
# read from the API — confirm them (including "Unlimited" for the Events
# API) against the current PagerDuty documentation.
endpoints = [
    APIEndpoint(
        "POST",
        "/incidents",
        "สร้าง Incident ใหม่",
        "900 req/min",
    ),
    APIEndpoint(
        "GET",
        "/incidents",
        "List Incidents (Filter status urgency)",
        "900 req/min",
    ),
    APIEndpoint(
        "PUT",
        "/incidents/{id}",
        "อัพเดท Incident (Acknowledge Resolve)",
        "900 req/min",
    ),
    APIEndpoint(
        "GET",
        "/oncalls",
        "ดู On-call ปัจจุบัน",
        "900 req/min",
    ),
    APIEndpoint(
        "POST",
        "events.pagerduty.com/v2/enqueue",
        "ส่ง Event (Trigger/Ack/Resolve)",
        "Unlimited (Events API)",
    ),
]

# Print one three-line entry per endpoint.
print("=== API Endpoints ===")
for e in endpoints:
    print(f" [{e.method} {e.endpoint}]")
    print(f" Purpose: {e.purpose}")
    print(f" Rate: {e.rate_limit}")
Best Practices
# === IaC Best Practices ===
@dataclass
class BestPractice:
    """One row of the IaC best-practice table.

    All fields are plain display strings that are only printed.
    """

    practice: str        # short name of the practice
    implementation: str  # how to put it into effect
    benefit: str         # what it buys you
    avoid: str           # the anti-pattern it replaces
# Best-practice rows: (practice, implementation, benefit, anti-pattern).
practices = [
    BestPractice(
        "Git Version Control",
        "เก็บ Terraform Config ใน Git ทุก Change ผ่าน PR",
        "Audit Trail Review Code History Rollback",
        "แก้ผ่าน UI ตรง ไม่ Sync กับ Code",
    ),
    BestPractice(
        "Terraform State Remote",
        "เก็บ State ใน S3 + DynamoDB Lock",
        "Team Collaboration ป้องกัน Concurrent Apply",
        "เก็บ State ใน Local File",
    ),
    BestPractice(
        "Modules",
        "สร้าง Module สำหรับ Service + Escalation + Schedule",
        "Reuse Pattern เดียวกันทุก Service",
        "Copy-paste Config ทุก Service",
    ),
    BestPractice(
        "Import Existing",
        "terraform import นำ Resource ที่สร้างผ่าน UI เข้า Code",
        "จัดการ Resource เดิมด้วย IaC",
        "สร้างใหม่ทับ Resource เดิม",
    ),
    BestPractice(
        "CI/CD Pipeline",
        "terraform plan ใน PR + terraform apply หลัง Merge",
        "Auto Deploy ป้องกัน Manual Error",
        "Apply จาก Local Machine",
    ),
]

# Print one four-line entry per practice.
print("=== Best Practices ===")
for p in practices:
    print(f" [{p.practice}]")
    print(f" Impl: {p.implementation}")
    print(f" Benefit: {p.benefit}")
    print(f" Avoid: {p.avoid}")
เคล็ดลับ
- Terraform: ใช้ Terraform จัดการ PagerDuty ทั้งหมด ไม่แก้ผ่าน UI
- Module: สร้าง Module สำหรับ Service Pattern ใช้ซ้ำ
- Events API: ใช้ dedup_key ป้องกัน Duplicate Alert
- Escalation: ตั้ง num_loops 2-3 เพื่อให้ Escalation Policy วนซ้ำ 2-3 รอบหาก Incident ยังไม่ถูก Acknowledge (Unacknowledged)
- Auto-resolve: ตั้ง auto_resolve_timeout ป้องกัน Stale Incident
PagerDuty คืออะไร
PagerDuty คือแพลตฟอร์ม Incident Management สำหรับจัดการ On-call, Alert, Escalation, Schedule และ Service พร้อม Event Intelligence รองรับ Notification ผ่าน SMS, Phone และ Slack รวมถึง Analytics เช่น MTTA และ MTTR
Terraform Provider ทำอะไร
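Terraform Provider ช่วยจัดการ PagerDuty ด้วย Code ครอบคลุม User, Team, Service, Escalation, Schedule และ Integration โดยเก็บ Config ใน Git เพื่อทำ Version Control, Apply ผ่าน CI/CD และ Import Resource ที่มีอยู่เดิม (Import Existing) เข้ามาจัดการได้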
API ใช้อย่างไร
ใช้ REST API v2 ด้วย API Key ผ่านไลบรารี pdpyras ของ Python เช่น POST /incidents และ GET /oncalls ส่วน Events API v2 ใช้ Routing Key ในการส่ง Trigger, Acknowledge และ Resolve โดยใช้ dedup_key ป้องกัน Event ซ้ำ
Automation ทำอะไร
Automation ครอบคลุม Auto-remediation, Runbook, Incident Creation, On-call Rotation, Escalation, Status Page, Post-mortem, Analytics และ ChatOps ผ่าน Slack Bot
สรุป
ใช้ Terraform Provider จัดการ PagerDuty Incident แบบ IaC ควบคู่กับ API (pdpyras) เพื่อทำ Automation ด้าน On-call, Escalation, Schedule, Service และ Events สำหรับระบบ Production
