Incident.io Production Setup Guide — คู่มือตั้งค่า Incident Management 2026
Incident.io เป็น incident management platform ที่ทำงานร่วมกับ Slack เป็นหลัก ช่วยให้ทีม DevOps/SRE จัดการ incidents ได้อย่างมีระบบ ตั้งแต่ declaration, triage, response ไปจนถึง post-mortem และ follow-ups ข้อดีของ Incident.io คือ integration กับ Slack ที่ seamless ทำให้ทุกคนในทีมเข้าถึงได้ง่าย ไม่ต้องเรียนรู้ tool ใหม่ บทความนี้อธิบายวิธี setup Incident.io สำหรับ production พร้อม workflows, automation และ Python integration tools
Incident.io Architecture
# incidentio_arch.py — Incident.io architecture
import json
class IncidentIOArchitecture:
    """Static overview of Incident.io components and integrations.

    Both tables are class-level dictionaries; the ``show_*`` methods only
    format them and write the result to stdout.
    """

    # Component key -> details. Every entry carries "name" and "description";
    # any further keys (commands, features, benefit, triggers) vary per entry.
    COMPONENTS = {
        "slack_bot": {
            "name": "Slack Bot Integration",
            "description": "Bot ใน Slack สำหรับ declare incidents, update status, assign roles",
            "commands": "/incident new, /incident update, /incident resolve",
        },
        "web_dashboard": {
            "name": "Web Dashboard",
            "description": "Dashboard สำหรับ overview incidents, analytics, settings",
            "features": "Incident list, timeline, metrics, custom fields, catalog",
        },
        "catalog": {
            "name": "Catalog",
            "description": "Database ของ services, teams, functionalities — link กับ incidents",
            "benefit": "รู้ว่า incident กระทบ service ไหน, team ไหนรับผิดชอบ",
        },
        "workflows": {
            "name": "Workflows (Automation)",
            "description": "Automate actions เมื่อ incident เกิด — notify, create channel, assign roles",
            "triggers": "Incident declared, severity changed, status updated, resolved",
        },
        "post_mortem": {
            "name": "Post-mortem",
            "description": "สร้าง post-mortem อัตโนมัติจาก incident timeline — follow-up tracking",
        },
    }

    # Integration key -> one-line summary of what the integration is used for.
    INTEGRATIONS = {
        "slack": "Primary interface — declare, manage, resolve incidents ใน Slack",
        "pagerduty": "Escalation — page on-call engineers automatically",
        "opsgenie": "Alternative to PagerDuty for on-call management",
        "jira": "Create follow-up tickets from post-mortem action items",
        "statuspage": "Update public status page automatically",
        "datadog": "Link metrics/alerts to incidents",
        "github": "Link PRs and deployments to incidents",
    }

    def show_components(self):
        """Print each component's name and description, one group per entry."""
        print("=== Incident.io Components ===\n")
        for component in self.COMPONENTS.values():
            heading = f"[{component['name']}]"
            detail = f" {component['description']}"
            # The trailing empty field yields the blank line between groups.
            print(heading, detail, "", sep="\n")

    def show_integrations(self):
        """Print one ``[key] summary`` line per integration under a header."""
        print("=== Integrations ===")
        rendered = [
            f" [{label}] {summary}"
            for label, summary in self.INTEGRATIONS.items()
        ]
        for line in rendered:
            print(line)
# Demo: build the overview object and print the component table followed by
# the integration table. These run at module level, so they execute on import.
arch = IncidentIOArchitecture()
arch.show_components()
arch.show_integrations()
Production Setup Steps
# setup.py — Production setup steps
import json
class ProductionSetup:
    """Ordered checklist for rolling Incident.io out to production."""

    # Step key -> {"name": numbered title, "tasks": checklist items in order}.
    STEPS = {
        "step1": {
            "name": "1. Install Slack App",
            "tasks": [
                "ไปที่ incident.io → Connect Slack workspace",
                "Grant permissions: channels, messages, users",
                "ตั้งค่า dedicated incident channel prefix (#inc-)",
            ],
        },
        "step2": {
            "name": "2. Configure Severity Levels",
            "tasks": [
                "SEV1 (Critical): ระบบล่มทั้งหมด, revenue impact",
                "SEV2 (Major): Feature หลักใช้ไม่ได้, ผู้ใช้ได้รับผลกระทบมาก",
                "SEV3 (Minor): Feature รองมีปัญหา, workaround มี",
                "SEV4 (Low): Cosmetic issues, ไม่กระทบ users",
            ],
        },
        "step3": {
            "name": "3. Set Up Roles",
            "tasks": [
                "Incident Lead: ผู้รับผิดชอบหลัก — coordinate response",
                "Communications Lead: อัพเดท stakeholders, status page",
                "Technical Lead: debug, fix, deploy — hands-on",
            ],
        },
        "step4": {
            "name": "4. Build Catalog",
            "tasks": [
                "เพิ่ม services ทั้งหมด (API, Web, Database, etc.)",
                "กำหนด owners (teams) สำหรับแต่ละ service",
                "Link functionalities (Login, Payment, Search, etc.)",
            ],
        },
        "step5": {
            "name": "5. Configure Workflows",
            "tasks": [
                "Auto-create Slack channel เมื่อ incident declared",
                "Auto-page on-call สำหรับ SEV1/SEV2",
                "Auto-notify leadership สำหรับ SEV1",
                "Auto-create post-mortem เมื่อ resolved",
            ],
        },
        "step6": {
            "name": "6. Set Up Integrations",
            "tasks": [
                "PagerDuty/OpsGenie: on-call escalation",
                "Jira: follow-up action items",
                "Statuspage: public status updates",
                "Datadog/Grafana: link alerts to incidents",
            ],
        },
    }

    def show_steps(self, max_tasks=None):
        """Print every setup step followed by its tasks.

        Args:
            max_tasks: Optional non-negative cap on the number of tasks
                printed per step. ``None`` (the default) prints all of them;
                pass ``3`` to get the shortened listing this method used to
                produce.
        """
        print("=== Production Setup ===\n")
        for step in self.STEPS.values():
            print(f"[{step['name']}]")
            # A fixed [:3] slice used to hide the fourth task of several
            # steps (the SEV4 definition among them). Slicing with None keeps
            # the whole list.
            for task in step['tasks'][:max_tasks]:
                print(f" • {task}")
            print()
# Demo: print the production setup checklist. Runs at module level, so it
# executes on import.
setup = ProductionSetup()
setup.show_steps()
Python Integration
# python_integration.py — Python tools for Incident.io
import json
class IncidentIOIntegration:
    """Carries a sample Python client for the Incident.io API as source text.

    ``CODE`` is a string, not executed by this class; ``show_code`` prints it.
    The sample defines ``IncidentManager`` (thin HTTP wrapper built on
    ``requests``) and ``IncidentMetrics`` (MTTR / frequency summaries computed
    from whatever ``IncidentManager.list_incidents`` returns).
    """

    # The sample is kept as valid, properly indented Python so it can be
    # copied out and run as-is. It uses ''' for its own docstrings because
    # the enclosing literal is delimited with triple double quotes.
    CODE = """
# incident_manager.py — Python integration with Incident.io API
import requests
import json
import statistics
from datetime import datetime, timedelta, timezone


def _parse_ts(value):
    '''Parse an ISO-8601 timestamp string into a timezone-aware datetime'''
    # Rewrite a trailing 'Z': fromisoformat() only accepts it on Python 3.11+.
    ts = datetime.fromisoformat(value.replace('Z', '+00:00'))
    if ts.tzinfo is None:
        # NOTE(review): timestamps without an offset are assumed to be UTC.
        ts = ts.replace(tzinfo=timezone.utc)
    return ts


class IncidentManager:
    def __init__(self, api_key, base_url="https://api.incident.io/v2"):
        self.base_url = base_url
        self.headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json',
        }

    def create_incident(self, name, summary, severity_id, mode="standard"):
        '''Create a new incident'''
        payload = {
            # Second-resolution key: two creates within one second share it.
            'idempotency_key': f'inc-{datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")}',
            'incident_type_id': None,  # Use default
            'mode': mode,  # standard, retrospective, test
            'name': name,
            'summary': summary,
            'severity_id': severity_id,
            'visibility': 'public',
        }
        resp = requests.post(
            f"{self.base_url}/incidents",
            headers=self.headers,
            json=payload,
            timeout=10,
        )
        # Fail loudly on 4xx/5xx instead of handing back an error body.
        resp.raise_for_status()
        return resp.json()

    def update_incident(self, incident_id, summary=None, severity_id=None):
        '''Update incident details'''
        payload = {}
        if summary:
            payload['summary'] = summary
        if severity_id:
            payload['severity_id'] = severity_id
        resp = requests.put(
            f"{self.base_url}/incidents/{incident_id}",
            headers=self.headers,
            json=payload,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()

    def list_incidents(self, status=None, severity=None, page_size=25):
        '''List incidents with filters'''
        params = {'page_size': page_size}
        if status:
            params['status'] = status
        # NOTE(review): 'severity' is accepted but not sent to the API yet;
        # confirm the API's filter parameter name before wiring it in.
        resp = requests.get(
            f"{self.base_url}/incidents",
            headers=self.headers,
            params=params,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()

    def get_incident(self, incident_id):
        '''Get incident details'''
        resp = requests.get(
            f"{self.base_url}/incidents/{incident_id}",
            headers=self.headers,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()

    def add_update(self, incident_id, message):
        '''Add timeline update to incident'''
        payload = {
            'incident_id': incident_id,
            'message': message,
        }
        resp = requests.post(
            f"{self.base_url}/incident_updates",
            headers=self.headers,
            json=payload,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()

    def get_severities(self):
        '''Get available severity levels'''
        resp = requests.get(
            f"{self.base_url}/severities",
            headers=self.headers,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()


class IncidentMetrics:
    def __init__(self, manager):
        self.manager = manager

    def mttr(self, days=30):
        '''Calculate Mean Time To Resolve for incidents created in the last `days` days'''
        # Only the first page (up to 100 incidents) is inspected.
        incidents = self.manager.list_incidents(status='resolved', page_size=100)
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        durations = []
        for inc in incidents.get('incidents', []):
            created = _parse_ts(inc['created_at'])
            resolved = inc.get('resolved_at')
            if created < cutoff or not resolved:
                continue
            durations.append((_parse_ts(resolved) - created).total_seconds() / 60)
        if not durations:
            return {'mttr_minutes': 0, 'count': 0}
        return {
            'mttr_minutes': round(sum(durations) / len(durations)),
            # statistics.median averages the two middle values for even counts.
            'median_minutes': round(statistics.median(durations)),
            'count': len(durations),
            'fastest_minutes': round(min(durations)),
            'slowest_minutes': round(max(durations)),
        }

    def incident_frequency(self, days=30):
        '''Calculate incident frequency over the last `days` days'''
        # Only the first page (up to 100 incidents) is inspected.
        incidents = self.manager.list_incidents(page_size=100)
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        by_severity = {}
        for inc in incidents.get('incidents', []):
            if _parse_ts(inc['created_at']) < cutoff:
                continue
            # 'severity' may be present but null, so fall back to {} either way.
            sev = (inc.get('severity') or {}).get('name', 'unknown')
            by_severity[sev] = by_severity.get(sev, 0) + 1
        total = sum(by_severity.values())
        return {
            'total_incidents': total,
            'per_day': round(total / max(days, 1), 1),
            'by_severity': by_severity,
        }

# manager = IncidentManager("your-api-key")
# inc = manager.create_incident("API Latency Spike", "P95 > 5s", "sev2_id")
# metrics = IncidentMetrics(manager)
# mttr = metrics.mttr()
"""

    def show_code(self, max_chars=None):
        """Print the sample client source under a header line.

        Args:
            max_chars: Optional non-negative cap on the number of characters
                of ``CODE`` to print. ``None`` (the default) prints the whole
                listing; a fixed 600-character cut used to stop in the middle
                of a statement.
        """
        print("=== Incident Manager ===")
        # Slicing with None returns the full string.
        print(self.CODE[:max_chars])
# Demo: print the sample client source held in IncidentIOIntegration.CODE.
# Runs at module level, so it executes on import.
integration = IncidentIOIntegration()
integration.show_code()
Workflow Automation
# workflows.py — Incident.io workflow automation
import json
class IncidentWorkflows:
    """Catalogue of incident automation workflows: trigger plus actions."""

    # Workflow key -> {"name", "trigger", "actions"}; actions are listed in
    # the order they are printed.
    WORKFLOWS = {
        "sev1_declared": {
            "name": "SEV1 Incident Declared",
            "trigger": "Incident created with severity = SEV1",
            "actions": [
                "Create dedicated Slack channel (#inc-XXXX)",
                "Page on-call engineer via PagerDuty",
                "Notify #incidents channel",
                "Notify VP Engineering + CTO via DM",
                "Start 15-minute update timer",
                "Create Statuspage incident (investigating)",
            ],
        },
        "sev2_declared": {
            "name": "SEV2 Incident Declared",
            "trigger": "Incident created with severity = SEV2",
            "actions": [
                "Create Slack channel",
                "Page on-call engineer",
                "Notify #incidents channel",
                "Start 30-minute update timer",
            ],
        },
        "incident_resolved": {
            "name": "Incident Resolved",
            "trigger": "Incident status changed to resolved",
            "actions": [
                "Update Statuspage (resolved)",
                "Notify #incidents channel (resolved)",
                "Create post-mortem document",
                "Schedule post-mortem meeting (within 48 hours)",
                "Create Jira tickets for follow-up actions",
            ],
        },
        "escalation": {
            "name": "Auto-Escalation",
            "trigger": "No acknowledgment within 5 minutes (SEV1) / 15 minutes (SEV2)",
            "actions": [
                "Escalate to next on-call",
                "Notify engineering manager",
                "Add escalation note to timeline",
            ],
        },
    }

    def show_workflows(self, max_actions=None):
        """Print each workflow's name, trigger and actions.

        Args:
            max_actions: Optional non-negative cap on the number of actions
                printed per workflow. ``None`` (the default) prints all of
                them; pass ``4`` to get the shortened listing this method
                used to produce.
        """
        print("=== Incident Workflows ===\n")
        for workflow in self.WORKFLOWS.values():
            print(f"[{workflow['name']}]")
            print(f" Trigger: {workflow['trigger']}")
            # A fixed [:4] slice used to hide the last actions of the SEV1
            # and resolved workflows. Slicing with None keeps the whole list.
            for action in workflow['actions'][:max_actions]:
                print(f" • {action}")
            print()
# Demo: print the workflow catalogue. Runs at module level, so it executes
# on import.
workflows = IncidentWorkflows()
workflows.show_workflows()
Metrics & Reporting
# metrics.py — Incident metrics and reporting
import json
class IncidentMetricsReport:
    """Reference tables of incident-response metrics and SLO targets.

    The data lives in two class-level dictionaries; the ``show_*`` methods
    format them and write the result to stdout.
    """

    # Metric key -> {"name", "description", "target"}.
    KEY_METRICS = {
        "mttr": {
            "name": "MTTR (Mean Time To Resolve)",
            "description": "เวลาเฉลี่ยตั้งแต่ declare ถึง resolve",
            "target": "SEV1: < 1 hour, SEV2: < 4 hours, SEV3: < 24 hours",
        },
        "mtta": {
            "name": "MTTA (Mean Time To Acknowledge)",
            "description": "เวลาเฉลี่ยตั้งแต่ alert ถึง acknowledge",
            "target": "SEV1: < 5 minutes, SEV2: < 15 minutes",
        },
        "mttd": {
            "name": "MTTD (Mean Time To Detect)",
            "description": "เวลาเฉลี่ยตั้งแต่ issue เกิดถึง detect",
            "target": "< 5 minutes (automated monitoring)",
        },
        "incident_rate": {
            "name": "Incident Frequency",
            "description": "จำนวน incidents ต่อสัปดาห์/เดือน",
            "target": "ลดลงเรื่อยๆ — trend ควรลง",
        },
        "follow_up_completion": {
            "name": "Follow-up Completion Rate",
            "description": "% ของ post-mortem action items ที่ทำเสร็จ",
            "target": "> 90% ภายใน 2 สัปดาห์",
        },
    }

    # SLO key -> target written out as display text.
    SLO = {
        "availability": "99.9% uptime = max 8.76 hours downtime/year",
        "latency": "P95 < 200ms, P99 < 500ms",
        "error_rate": "< 0.1% of requests return 5xx",
        "incident_response": "SEV1 acknowledged within 5 minutes",
    }

    def show_metrics(self):
        """Print name, description and target for every key metric."""
        print("=== Key Metrics ===\n")
        for metric in self.KEY_METRICS.values():
            title = f"[{metric['name']}]"
            summary = f" {metric['description']}"
            goal = f" Target: {metric['target']}"
            # The trailing empty field yields the blank line between metrics.
            print(title, summary, goal, "", sep="\n")

    def show_slo(self):
        """Print one ``[key] target`` line per SLO under a header."""
        print("=== SLO Targets ===")
        rendered = [f" [{label}] {goal}" for label, goal in self.SLO.items()]
        for line in rendered:
            print(line)
# Demo: print the key-metrics table followed by the SLO targets. Runs at
# module level, so it executes on import.
metrics = IncidentMetricsReport()
metrics.show_metrics()
metrics.show_slo()
FAQ - คำถามที่พบบ่อย
Q: Incident.io กับ PagerDuty อันไหนดีกว่า?
A:
- Incident.io: Slack-native, incident lifecycle management, post-mortem, catalog — เน้น coordination
- PagerDuty: On-call management, alerting, escalation — เน้น notification
- ใช้ร่วมกัน: PagerDuty สำหรับ on-call + alerting → Incident.io สำหรับ incident management + post-mortem
- ถ้าเลือกอันเดียว: team เล็ก → PagerDuty, team ใหญ่ที่ต้องการ coordination → Incident.io
Q: Incident.io ราคาเท่าไหร่?
A:
- Free tier: basic incident management, 5 incidents/month
- Starter: ~$16/user/month — unlimited incidents, workflows, integrations
- Pro: ~$25/user/month — catalog, custom fields, advanced analytics
- Enterprise: custom pricing — SSO, SCIM, dedicated support
- คุ้มเมื่อ: team > 10 คน + incidents > 5/เดือน + ต้องการ process ที่ดี
- ROI: ลด MTTR 30-50% = คุ้มค่ากว่า downtime cost
Q: ต้องใช้ Slack ไหม?
A:
- Incident.io ออกแบบมาสำหรับ Slack เป็นหลัก — Slack = primary interface
- ถ้าไม่ใช้ Slack: พิจารณา alternatives — Rootly (Slack), FireHydrant (Slack/MS Teams), OpsLevel
- ถ้าใช้ MS Teams: Incident.io มี Teams integration (beta) แต่ Slack ยังดีกว่า
- สำคัญ: ทีมต้องใช้ Slack อยู่แล้ว — ถ้าไม่ใช้ Slack, Incident.io อาจไม่เหมาะ
Q: Post-mortem ควรทำทุก incident ไหม?
A:
- SEV1: ต้องทำทุกครั้ง — blameless post-mortem ภายใน 48 ชั่วโมง
- SEV2: ควรทำ — อย่างน้อย lightweight post-mortem
- SEV3-4: optional — ทำเฉพาะ incidents ที่น่าสนใจหรือ recurring
- Blameless: focus ที่ process improvement ไม่ใช่ blame individuals
- Follow-up: action items ต้อง track ใน Jira/Linear — ไม่ใช่แค่เขียนแล้วลืม
