Incident.io กับ Career Development IT

Incident.io และ Incident Management

Incident.io เป็นแพลตฟอร์ม Incident Management ที่ทำงานผ่าน Slack ช่วยให้ทีม IT จัดการ Incidents ได้อย่างเป็นระบบ ตั้งแต่ Declaration ไปจนถึง Resolution และ Post-mortem สิ่งที่ทำให้ Incident.io โดดเด่นคือ Integration กับ Slack ที่ทีมใช้อยู่แล้ว ไม่ต้องสลับไปใช้เครื่องมืออื่น

ทักษะ Incident Management เป็นหนึ่งในทักษะที่สำคัญที่สุดสำหรับ IT Career โดยเฉพาะสาย SRE, DevOps และ Platform Engineering การรับมือ Incidents ได้ดีแสดงถึง Technical Skills, Communication และ Leadership ที่เป็นที่ต้องการในอุตสาหกรรม

Setup Incident.io Workflow

# === Incident.io Setup และ Workflow ===

# 1. ติดตั้ง Incident.io
# - สมัครที่ https://incident.io
# - เชื่อมต่อ Slack Workspace
# - ตั้งค่า Permissions

# 2. Severity Levels
# SEV1 (Critical): ระบบล่มทั้งหมด ผู้ใช้ทุกคนได้รับผลกระทบ
# SEV2 (Major): ฟีเจอร์สำคัญใช้งานไม่ได้ ผู้ใช้จำนวนมากได้รับผลกระทบ
# SEV3 (Minor): ฟีเจอร์บางส่วนมีปัญหา ผู้ใช้บางส่วนได้รับผลกระทบ
# SEV4 (Low): ปัญหาเล็กน้อย ไม่กระทบผู้ใช้โดยตรง

# 3. Incident Roles
# Incident Lead: รับผิดชอบการจัดการ Incident ทั้งหมด
# Communications Lead: สื่อสารกับ Stakeholders และ Customers
# Technical Lead: วิเคราะห์และแก้ไขปัญหาทางเทคนิค
# Scribe: บันทึก Timeline และ Actions

# 4. Workflow Automation (Incident.io Config)
# Trigger: /incident ใน Slack
# Actions:
#   - สร้าง Incident Channel (#inc-YYYY-MM-DD-title)
#   - Invite On-call Engineers
#   - Post ใน #incidents Channel
#   - Create PagerDuty Alert (SEV1/SEV2)
#   - Create Jira Ticket
#   - Start Status Page Update (SEV1)

# 5. Escalation Policy
# SEV1: Page On-call -> Page Team Lead (5 min) -> Page VP Eng (15 min)
# SEV2: Page On-call -> Page Team Lead (15 min)
# SEV3: Notify On-call via Slack
# SEV4: Create Ticket, address in next sprint

# 6. Communication Templates
# Initial: "เรากำลังตรวจสอบปัญหา [description] ที่ส่งผลกระทบต่อ [impact]"
# Update: "อัพเดท: [สิ่งที่ทำไปแล้ว] ขั้นตอนต่อไป: [next steps] ETA: [time]"
# Resolved: "Incident resolved. Root cause: [cause]. เราจะทำ Post-mortem ภายใน 48 ชม."

# === Terraform Config สำหรับ PagerDuty Integration ===
# resource "pagerduty_service" "api" {
#   name                    = "API Service"
#   escalation_policy       = pagerduty_escalation_policy.engineering.id
#   alert_creation          = "create_alerts_and_incidents"
#   auto_resolve_timeout    = 14400
#   acknowledgement_timeout = 600
# }
#
# resource "pagerduty_service_integration" "incident_io" {
#   name    = "Incident.io"
#   service = pagerduty_service.api.id
#   vendor  = data.pagerduty_vendor.incident_io.id
# }

Python — Incident Automation Scripts

# incident_automation.py — Automation Scripts สำหรับ Incident Management
import json
import requests
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import List, Optional
from enum import Enum

class Severity(Enum):
    """Severity level of an incident; SEV1 is the most severe, SEV4 the least."""

    SEV1 = 1  # Critical
    SEV2 = 2  # Major
    SEV3 = 3  # Minor
    SEV4 = 4  # Low

class IncidentStatus(Enum):
    """Status an incident can be in; each value is the lowercase status name."""

    INVESTIGATING = "investigating"  # initial status set by create_incident
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"  # update_status stamps resolved_at on this status

@dataclass
class Incident:
    """Record of a single incident, including its timeline and follow-ups."""

    # Identifier; IncidentManager.create_incident generates "INC-0001" style ids.
    id: str
    title: str
    severity: Severity
    status: IncidentStatus
    # Name of the person leading the incident; also recorded as the author
    # of timeline entries.
    lead: str
    created_at: datetime
    # Stays None until the incident is moved to IncidentStatus.RESOLVED.
    resolved_at: Optional[datetime] = None
    # Entries are dicts with "time", "action" and "by" keys.
    timeline: List[dict] = field(default_factory=list)
    # Entries are dicts with a "task" key and optional "priority" / "owner".
    action_items: List[dict] = field(default_factory=list)
    # Free text; the post-mortem prints "TBD" while these are empty.
    root_cause: str = ""
    impact: str = ""

class IncidentManager:
    """Keep incidents in memory and build post-mortem and metrics reports."""

    def __init__(self):
        self.incidents: List[Incident] = []

    def create_incident(self, title, severity, lead):
        """Declare a new incident and register it with this manager.

        Args:
            title: Short description of the incident.
            severity: A ``Severity`` member.
            lead: Name of the incident lead.

        Returns:
            The new ``Incident`` in the ``INVESTIGATING`` status, with an
            "Incident declared" entry already on its timeline.
        """
        # Take the timestamp once so created_at and the first timeline
        # entry cannot disagree.
        now = datetime.now()
        inc = Incident(
            id=f"INC-{len(self.incidents) + 1:04d}",
            title=title,
            severity=severity,
            status=IncidentStatus.INVESTIGATING,
            lead=lead,
            created_at=now,
        )
        inc.timeline.append({
            "time": now.isoformat(),
            "action": "Incident declared",
            "by": lead,
        })
        self.incidents.append(inc)
        return inc

    def update_status(self, incident_id, status, note=""):
        """Move an incident to a new status and record it on the timeline.

        Args:
            incident_id: Id of the incident to update.
            status: The new ``IncidentStatus``.
            note: Optional free text appended to the timeline entry.

        Returns:
            The updated ``Incident``, or ``None`` when no incident has the
            given id (nothing is changed in that case).
        """
        inc = self._find(incident_id)
        if inc is None:
            return None

        # Same instant for the timeline entry and resolved_at.
        now = datetime.now()
        inc.status = status
        inc.timeline.append({
            "time": now.isoformat(),
            "action": f"Status -> {status.value}: {note}",
            "by": inc.lead,
        })
        if status == IncidentStatus.RESOLVED:
            inc.resolved_at = now
        return inc

    def _find(self, incident_id):
        """Return the incident with the given id, or ``None`` if not found."""
        return next((i for i in self.incidents if i.id == incident_id), None)

    def generate_postmortem(self, incident_id):
        """Build a Markdown post-mortem template for an incident.

        Args:
            incident_id: Id of the incident to report on.

        Returns:
            The report text, or ``None`` when no incident has the given id.
            The duration line shows whole minutes from creation to
            resolution, or "ongoing" while the incident is unresolved.
        """
        inc = self._find(incident_id)
        if not inc:
            return None

        if inc.resolved_at:
            minutes = (inc.resolved_at - inc.created_at).total_seconds() / 60
            duration = f"{minutes:.0f} minutes"
        else:
            # An unresolved incident has no duration yet; "0 minutes" would
            # read as an instant fix.
            duration = "ongoing"

        lines = [
            "",
            f"# Post-mortem: {inc.id} — {inc.title}",
            f"Date: {inc.created_at:%Y-%m-%d}",
            f"Severity: {inc.severity.name}",
            f"Duration: {duration}",
            f"Lead: {inc.lead}",
            "",
            "## Summary",
            f"{inc.title}",
            "",
            "## Impact",
            f"{inc.impact or 'TBD'}",
            "",
            "## Root Cause",
            f"{inc.root_cause or 'TBD'}",
            "",
            "## Timeline",
        ]
        lines.extend(
            f"- {entry['time']}: {entry['action']} ({entry['by']})"
            for entry in inc.timeline
        )
        lines.extend([
            "",
            "## What Went Well",
            "- TBD",
            "",
            "## What Could Be Improved",
            "- TBD",
            "",
            "## Action Items",
        ])
        lines.extend(
            f"- [{item.get('priority', 'P2')}] {item['task']} — {item.get('owner', 'TBD')}"
            for item in inc.action_items
        )
        # The report always ends with a newline.
        return "\n".join(lines) + "\n"

    def metrics_report(self, days=30):
        """Print incident metrics for the last ``days`` days and return them.

        Only incidents created strictly after ``now - days`` are counted.
        Time to resolve (TTR) is averaged over resolved incidents only.

        Args:
            days: Size of the reporting window in days.

        Returns:
            A dict with keys ``"total"``, ``"by_severity"`` (severity name
            to count), ``"avg_ttr_minutes"`` (0 when nothing is resolved)
            and ``"resolved"``.
        """
        cutoff = datetime.now() - timedelta(days=days)
        recent = [i for i in self.incidents if i.created_at > cutoff]

        by_severity = {}
        ttrs = []

        for inc in recent:
            sev = inc.severity.name
            by_severity[sev] = by_severity.get(sev, 0) + 1

            if inc.resolved_at:
                ttrs.append((inc.resolved_at - inc.created_at).total_seconds() / 60)

        avg_ttr = sum(ttrs) / len(ttrs) if ttrs else 0

        print(f"\n{'='*50}")
        print(f"Incident Metrics ({days} days)")
        print(f"{'='*50}")
        print(f"  Total Incidents: {len(recent)}")
        for sev, count in sorted(by_severity.items()):
            print(f"    {sev}: {count}")
        print(f"  Avg TTR: {avg_ttr:.0f} minutes")
        print(f"  Resolved: {len(ttrs)}/{len(recent)}")

        return {
            "total": len(recent),
            "by_severity": by_severity,
            "avg_ttr_minutes": avg_ttr,
            "resolved": len(ttrs),
        }

# Example: take one incident from declaration to resolution, then print
# its post-mortem and the 30-day metrics.
mgr = IncidentManager()
inc = mgr.create_incident(
    title="API Response Time > 5s",
    severity=Severity.SEV2,
    lead="Alice",
)
mgr.update_status(
    inc.id,
    IncidentStatus.IDENTIFIED,
    note="Database connection pool exhausted",
)

# Fill in the findings before the incident is resolved.
inc.root_cause = "Connection pool limit too low for traffic spike"
inc.impact = "30% of API requests failed for 45 minutes"
inc.action_items = [
    {"task": task, "owner": owner, "priority": priority}
    for task, owner, priority in (
        ("Increase connection pool to 100", "Bob", "P1"),
        ("Add connection pool monitoring", "Carol", "P2"),
        ("Implement circuit breaker", "Dave", "P2"),
    )
]
mgr.update_status(
    inc.id,
    IncidentStatus.RESOLVED,
    note="Connection pool increased",
)

print(mgr.generate_postmortem(inc.id))
mgr.metrics_report(days=30)

Career Development Path

# career_path.py — IT Career Development Path สำหรับ Incident Management

# Career ladder for the incident-management track, keyed by level title.
# Each entry holds:
#   "years"          - experience range, as a display string
#   "incident_role"  - the role listed for that level during incidents
#   "skills"         - list of skill names
#   "certifications" - list of certification names
# The report loop prints levels in this insertion order.
career_paths = {
    "Junior Engineer": {
        "years": "0-2",
        "incident_role": "Responder",
        "skills": [
            "Basic troubleshooting",
            "Follow runbooks",
            "Escalate appropriately",
            "Write incident notes",
        ],
        "certifications": ["AWS SAA", "Linux+", "ITIL Foundation"],
    },
    "Mid-level Engineer": {
        "years": "2-5",
        "incident_role": "Technical Lead",
        "skills": [
            "Root cause analysis",
            "Write runbooks",
            "Lead technical investigation",
            "Mentor juniors during incidents",
            "Write post-mortems",
        ],
        "certifications": ["AWS SAP", "CKA", "SRE Foundation"],
    },
    "Senior Engineer": {
        "years": "5-8",
        "incident_role": "Incident Commander",
        "skills": [
            "Lead cross-team incidents",
            "Design resilient systems",
            "Improve incident process",
            "Stakeholder communication",
            "Chaos engineering",
        ],
        "certifications": ["AWS DevOps Pro", "CKS", "DORA Metrics"],
    },
    "Staff/Principal Engineer": {
        "years": "8+",
        "incident_role": "Incident Program Owner",
        "skills": [
            "Define incident strategy",
            "Build incident culture",
            "Cross-org coordination",
            "Reduce MTTR organization-wide",
            "Design incident automation",
        ],
        "certifications": ["TOGAF", "Leadership Training"],
    },
}

# Print the career ladder as a plain-text report, one section per level.
print("IT Career Path — Incident Management Track", "=" * 55, sep="\n")

for level, info in career_paths.items():
    # Section header: blank line, full-width rule, then the level summary.
    print(
        "\n" + "=" * 55,
        f"{level} ({info['years']} years)",
        f"Incident Role: {info['incident_role']}",
        "Skills:",
        sep="\n",
    )
    for skill in info["skills"]:
        print("  - " + f"{skill}")
    print("Certifications: " + ", ".join(info["certifications"]))

Best Practices

Blameless Culture: Post-mortem ไม่โทษคน มุ่งปรับปรุงระบบและ Process
Runbooks: สร้าง Runbooks สำหรับ Common Incidents ให้ทุกคนแก้ไขได้เร็ว
Communication: อัพเดท Stakeholders ทุก 15-30 นาทีในช่วง Incident
Action Items: ทุก Post-mortem ต้องมี Action Items ที่มี Owner และ Deadline
Practice: ทำ Game Day หรือ Chaos Engineering ฝึกซ้อม Incident Response
Metrics: ติดตาม MTTR, MTTD, Incident Frequency ใช้ข้อมูลปรับปรุง

Incident.io คืออะไร

แพลตฟอร์ม Incident Management ที่ทำงานร่วมกับ Slack จัดการ Incidents ตั้งแต่ Declaration, Triage, Communication, Resolution ไปจนถึง Post-mortem มี Automation ที่สร้าง Channels, Assign Roles และ Track Status อัตโนมัติ