SigNoz Observability Post-mortem Analysis คืออะไร
SigNoz เป็น open-source observability platform ที่รวม logs, metrics และ traces ไว้ในที่เดียว เป็นทางเลือกของ Datadog และ New Relic ที่ self-hosted ได้ ไม่มีค่าใช้จ่ายรายเดือน Post-mortem Analysis คือกระบวนการวิเคราะห์หลังเกิด incident เพื่อหา root cause, timeline, impact และ preventive measures SigNoz ช่วย post-mortem ได้ดีเพราะมี distributed tracing, log correlation และ custom dashboards สำหรับวิเคราะห์เหตุการณ์ย้อนหลัง บทความนี้อธิบายการใช้ SigNoz สำหรับ post-mortem analysis ครบทุกขั้นตอน
SigNoz Architecture & Features
# signoz_arch.py — SigNoz architecture overview
import json
class SigNozArchitecture:
    """Overview of the SigNoz stack: deployment components and headline features."""

    # Core services that make up a SigNoz deployment, keyed by component id.
    COMPONENTS = {
        "otel_collector": {
            "name": "OpenTelemetry Collector",
            "description": "รับ telemetry data (logs, metrics, traces) จาก applications ผ่าน OTLP protocol",
        },
        "clickhouse": {
            "name": "ClickHouse (Storage)",
            "description": "Columnar database สำหรับเก็บ telemetry data — query เร็วมากบน time-series data",
        },
        "query_service": {
            "name": "Query Service",
            "description": "Backend API สำหรับ query data จาก ClickHouse — filter, aggregate, correlate",
        },
        "frontend": {
            "name": "Frontend (React)",
            "description": "Web UI สำหรับ dashboards, trace explorer, log viewer, alerts",
        },
        "alert_manager": {
            "name": "Alert Manager",
            "description": "ระบบ alerting — trigger alerts ตาม conditions บน metrics/logs/traces",
        },
    }

    # Capabilities relevant to incident investigation.
    FEATURES = {
        "distributed_tracing": "ติดตาม requests ข้าม microservices — visualize latency bottlenecks",
        "log_management": "Centralized logs พร้อม full-text search, filtering, correlation กับ traces",
        "metrics_monitoring": "Application + infrastructure metrics พร้อม custom dashboards",
        "exceptions": "Track exceptions พร้อม stack traces, frequency, affected users",
        "alerts": "Metric-based + log-based alerts พร้อม Slack, PagerDuty, webhook integration",
    }

    def show_architecture(self):
        """Print each component's display name and one-line description."""
        print("=== SigNoz Architecture ===\n")
        # The component key itself is not shown, so iterate values only.
        for component in self.COMPONENTS.values():
            print(f"[{component['name']}]")
            print(f" {component['description']}")
            print()

    def show_features(self):
        """Print every feature key alongside its description."""
        print("=== Key Features ===")
        for feature_key, detail in self.FEATURES.items():
            print(f" [{feature_key}] {detail}")
# Demo: print the SigNoz component map followed by the feature list.
signoz_overview = SigNozArchitecture()
signoz_overview.show_architecture()
signoz_overview.show_features()
Post-mortem Framework
# postmortem.py — Post-mortem analysis framework
import json
class PostMortemFramework:
    """Blameless post-mortem template: five standard sections with field prompts."""

    # Section order is the recommended writing order:
    # summary -> timeline -> root cause -> impact -> action items.
    TEMPLATE = {
        "incident_summary": {
            "name": "1. Incident Summary",
            "fields": ["Title", "Severity (P1-P4)", "Duration", "Impact", "Detection method"],
        },
        "timeline": {
            "name": "2. Timeline",
            "fields": [
                "Detection time — เมื่อไหร่รู้ว่ามีปัญหา",
                "Response time — เริ่ม investigate เมื่อไหร่",
                "Mitigation time — แก้ไขชั่วคราวเมื่อไหร่",
                "Resolution time — แก้ไขถาวรเมื่อไหร่",
                "Key events — เหตุการณ์สำคัญตามลำดับเวลา",
            ],
        },
        "root_cause": {
            "name": "3. Root Cause Analysis",
            "fields": [
                "Primary cause — สาเหตุหลัก",
                "Contributing factors — ปัจจัยที่ทำให้แย่ลง",
                "5 Whys analysis — ถามทำไม 5 ครั้ง",
                "Evidence from SigNoz — traces, logs, metrics ที่ยืนยัน",
            ],
        },
        "impact": {
            "name": "4. Impact Assessment",
            "fields": [
                "Users affected — จำนวน users ที่ได้รับผลกระทบ",
                "Services affected — services ที่ล่ม",
                "Data loss — ข้อมูลที่สูญหาย (ถ้ามี)",
                "Revenue impact — ผลกระทบต่อรายได้",
                "SLA breach — ละเมิด SLA หรือไม่",
            ],
        },
        "action_items": {
            "name": "5. Action Items",
            "fields": [
                "Immediate fixes — แก้ไขทันที",
                "Short-term improvements — ปรับปรุงภายใน 1-2 สัปดาห์",
                "Long-term prevention — ป้องกันระยะยาว",
                "Monitoring improvements — เพิ่ม alerts/dashboards",
            ],
        },
    }

    def show_template(self):
        """Print every template section with ALL of its fields.

        Bug fix: the original iterated ``section["fields"][:4]``, which
        silently dropped the fifth field of the Timeline ("Key events") and
        Impact ("SLA breach") sections. The slice is removed so the template
        is displayed in full.
        """
        print("=== Post-mortem Template ===\n")
        for section in self.TEMPLATE.values():
            print(f"[{section['name']}]")
            for field in section["fields"]:
                print(f" • {field}")
            print()
# Demo: render the post-mortem template sections.
framework = PostMortemFramework()
framework.show_template()
SigNoz Queries for Investigation
# investigation.py — SigNoz queries for post-mortem investigation
import json
class SigNozInvestigation:
    """Query cookbook plus a sample API client for SigNoz post-mortem work.

    ``QUERIES`` holds ready-made Trace Explorer / log / ClickHouse query
    snippets; ``PYTHON_QUERY`` is example client code stored as a string —
    it is printed for the reader, never executed, so its ``requests`` import
    is not a dependency of this file.
    """

    # Investigation queries keyed by scenario. The service names and the
    # 2024-01-15 incident window inside the query strings are example values;
    # substitute your own when reusing them.
    QUERIES = {
        "error_traces": {
            "name": "Find Error Traces",
            "description": "ค้นหา traces ที่มี errors ในช่วงเวลา incident",
            "query": """
-- SigNoz Trace Explorer query
service_name = 'payment-service'
AND status_code = 'ERROR'
AND timestamp >= '2024-01-15T10:00:00Z'
AND timestamp <= '2024-01-15T12:00:00Z'
ORDER BY duration_nano DESC
""",
        },
        "slow_traces": {
            "name": "Find Slow Traces",
            "description": "ค้นหา traces ที่ช้าผิดปกติ — latency spikes",
            "query": """
-- Slow traces (P99 latency)
service_name = 'api-gateway'
AND duration_nano > 5000000000 -- > 5 seconds
AND timestamp >= '2024-01-15T10:00:00Z'
ORDER BY duration_nano DESC
LIMIT 100
""",
        },
        "error_logs": {
            "name": "Correlated Error Logs",
            "description": "ค้นหา logs ที่เกี่ยวข้องกับ error traces",
            "query": """
-- Error logs during incident window
severity_text IN ('ERROR', 'FATAL')
AND timestamp >= '2024-01-15T10:00:00Z'
AND timestamp <= '2024-01-15T12:00:00Z'
AND service_name = 'payment-service'
ORDER BY timestamp ASC
""",
        },
        "metric_anomaly": {
            "name": "Metric Anomalies",
            "description": "ดู metrics ที่เปลี่ยนแปลงผิดปกติ",
            "query": """
-- Error rate spike
SELECT
    toStartOfMinute(timestamp) AS minute,
    countIf(status_code = 'ERROR') AS errors,
    count(*) AS total,
    errors / total * 100 AS error_rate
FROM signoz_traces.distributed_signoz_index_v2
WHERE service_name = 'payment-service'
  AND timestamp >= '2024-01-15T08:00:00Z'
GROUP BY minute
ORDER BY minute
""",
        },
    }

    # Example client code, kept as text for display only.
    # NOTE(review): the endpoint paths (/api/v1/traces, /api/v3/query_range)
    # look illustrative — verify against the SigNoz API version in use.
    PYTHON_QUERY = """
# signoz_query.py — Query SigNoz API for post-mortem data
import requests
import json
from datetime import datetime, timedelta

class SigNozClient:
    def __init__(self, base_url="http://localhost:3301", api_key=None):
        self.base_url = base_url
        self.headers = {}
        if api_key:
            self.headers["SIGNOZ-API-KEY"] = api_key

    def get_error_traces(self, service, start, end, limit=100):
        '''Get error traces for a service during incident window'''
        params = {
            "service": service,
            "start": int(start.timestamp() * 1e9),
            "end": int(end.timestamp() * 1e9),
            "status": "error",
            "limit": limit,
        }
        resp = requests.get(
            f"{self.base_url}/api/v1/traces",
            params=params, headers=self.headers
        )
        return resp.json()

    def get_service_metrics(self, service, start, end):
        '''Get service metrics (latency, error rate, throughput)'''
        params = {
            "service": service,
            "start": int(start.timestamp() * 1e9),
            "end": int(end.timestamp() * 1e9),
            "step": 60,  # 1-minute intervals
        }
        resp = requests.get(
            f"{self.base_url}/api/v1/services/{service}/metrics",
            params=params, headers=self.headers
        )
        return resp.json()

    def search_logs(self, query, start, end, limit=500):
        '''Search logs during incident window'''
        payload = {
            "query": query,
            "start": int(start.timestamp() * 1e9),
            "end": int(end.timestamp() * 1e9),
            "limit": limit,
            "orderBy": "timestamp",
        }
        resp = requests.post(
            f"{self.base_url}/api/v3/query_range",
            json=payload, headers=self.headers
        )
        return resp.json()

    def build_timeline(self, service, start, end):
        '''Build incident timeline from traces and logs'''
        traces = self.get_error_traces(service, start, end)
        timeline = []
        for trace in traces.get("data", []):
            timeline.append({
                "time": trace.get("timestamp"),
                "type": "trace_error",
                "service": trace.get("serviceName"),
                "operation": trace.get("operationName"),
                "duration_ms": trace.get("durationNano", 0) / 1e6,
                "error": trace.get("statusMessage", ""),
            })
        timeline.sort(key=lambda x: x["time"])
        return timeline

# Usage
# client = SigNozClient("http://signoz:3301")
# start = datetime(2024, 1, 15, 10, 0)
# end = datetime(2024, 1, 15, 12, 0)
# timeline = client.build_timeline("payment-service", start, end)
"""

    def show_queries(self):
        """Print the name and description of each investigation query."""
        print("=== Investigation Queries ===\n")
        for key, q in self.QUERIES.items():
            print(f"[{q['name']}]")
            print(f" {q['description']}")
            print()

    def show_python(self):
        """Print a truncated preview (first 600 chars) of the sample client."""
        print("=== Python SigNoz Client ===")
        print(self.PYTHON_QUERY[:600])
# Demo: list the investigation queries, then preview the sample client code.
investigation = SigNozInvestigation()
investigation.show_queries()
investigation.show_python()
Automated Post-mortem Report
# report.py — Generate post-mortem report automatically
import json
import random
class AutoPostMortemReport:
    """Holds example report-generator code (as text) and prints a mock report.

    ``CODE`` is display-only source; it is printed, never executed, so its
    ``jinja2`` import is not a dependency of this file.
    """

    # Example generator that pulls incident data through a SigNoz client.
    CODE = """
# auto_postmortem.py — Automated post-mortem report generator
import json
from datetime import datetime, timedelta
from jinja2 import Template

class PostMortemGenerator:
    def __init__(self, signoz_client):
        self.client = signoz_client

    def generate(self, incident):
        '''Generate post-mortem report from SigNoz data'''
        start = datetime.fromisoformat(incident["start_time"])
        end = datetime.fromisoformat(incident["end_time"])
        service = incident["service"]
        # Gather data from SigNoz
        error_traces = self.client.get_error_traces(service, start, end)
        metrics = self.client.get_service_metrics(service, start, end)
        timeline = self.client.build_timeline(service, start, end)
        # Build report
        report = {
            "title": incident["title"],
            "severity": incident["severity"],
            "duration_minutes": (end - start).total_seconds() / 60,
            "generated_at": datetime.utcnow().isoformat(),
            "timeline": timeline[:20],
            "metrics_summary": {
                "peak_error_rate": self._peak_error_rate(metrics),
                "peak_latency_ms": self._peak_latency(metrics),
                "total_errors": len(error_traces.get("data", [])),
            },
            "affected_operations": self._top_operations(error_traces),
            "action_items": [
                {"priority": "P1", "item": "Add circuit breaker for downstream calls"},
                {"priority": "P2", "item": "Improve error handling in payment flow"},
                {"priority": "P3", "item": "Add latency alert at P95 > 2s threshold"},
            ],
        }
        return report

    def _peak_error_rate(self, metrics):
        rates = [m.get("error_rate", 0) for m in metrics.get("data", [])]
        return max(rates) if rates else 0

    def _peak_latency(self, metrics):
        latencies = [m.get("p99", 0) for m in metrics.get("data", [])]
        return max(latencies) if latencies else 0

    def _top_operations(self, traces):
        from collections import Counter
        ops = Counter(t.get("operationName") for t in traces.get("data", []))
        return [{"operation": op, "count": cnt} for op, cnt in ops.most_common(5)]

# generator = PostMortemGenerator(signoz_client)
# report = generator.generate({
#     "title": "Payment Service Outage",
#     "severity": "P1",
#     "service": "payment-service",
#     "start_time": "2024-01-15T10:00:00",
#     "end_time": "2024-01-15T12:00:00",
# })
"""

    def show_code(self):
        """Print a truncated preview (first 600 chars) of the generator code."""
        print("=== Auto Post-mortem Generator ===")
        print(self.CODE[:600])

    def sample_report(self):
        """Print a mock report; numeric figures are random, for illustration only."""
        print(f"\n=== Sample Post-mortem Report ===")
        print(f" Title: Payment Service Outage")
        print(f" Severity: P1")
        print(f" Duration: {random.randint(30, 180)} minutes")
        print(f" Peak error rate: {random.uniform(20, 80):.1f}%")
        print(f" Peak latency P99: {random.randint(2000, 15000)}ms")
        print(f" Total errors: {random.randint(500, 10000):,}")
        print(f" Users affected: {random.randint(1000, 50000):,}")
        print(f" Root cause: Database connection pool exhaustion")
# Demo: preview the generator code, then print a mock sample report.
auto_report = AutoPostMortemReport()
auto_report.show_code()
auto_report.sample_report()
SigNoz Setup & Dashboards
# setup.py — SigNoz setup and custom dashboards
import json
class SigNozSetup:
    """Setup instructions, a dashboard config sample, and a tool comparison.

    ``DOCKER_COMPOSE`` and ``DASHBOARD`` are display-only text blocks.
    """

    # Quick-start commands for running SigNoz via Docker and instrumenting
    # a Python app with OpenTelemetry.
    DOCKER_COMPOSE = """
# docker-compose.yml — SigNoz quick setup
# Clone: git clone -b main https://github.com/SigNoz/signoz.git
# Run: cd signoz/deploy && docker compose -f docker/clickhouse-setup/docker-compose.yaml up -d

# Application instrumentation (Python)
# pip install opentelemetry-distro opentelemetry-exporter-otlp
# opentelemetry-bootstrap -a install

# Run with auto-instrumentation:
# OTEL_RESOURCE_ATTRIBUTES=service.name=my-app \\
# OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz:4317 \\
# opentelemetry-instrument python app.py
"""

    # Example dashboard panels for incident investigation.
    # NOTE(review): panel queries use PromQL-like syntax — confirm they match
    # the query builder of the SigNoz version being deployed.
    DASHBOARD = """
# dashboard_config.json — Custom post-mortem dashboard
{
  "name": "Incident Investigation Dashboard",
  "panels": [
    {
      "title": "Error Rate by Service",
      "type": "timeseries",
      "query": "rate(signoz_calls_total{status_code='STATUS_CODE_ERROR'}[5m])"
    },
    {
      "title": "P99 Latency by Service",
      "type": "timeseries",
      "query": "histogram_quantile(0.99, signoz_latency_bucket)"
    },
    {
      "title": "Top Error Operations",
      "type": "table",
      "query": "topk(10, sum by (operation) (signoz_calls_total{status_code='ERROR'}))"
    },
    {
      "title": "Log Error Count",
      "type": "timeseries",
      "query": "count_over_time({severity='ERROR'}[5m])"
    }
  ]
}
"""

    def show_setup(self):
        """Print a truncated preview (first 400 chars) of the setup commands."""
        print("=== SigNoz Setup ===")
        print(self.DOCKER_COMPOSE[:400])

    def show_dashboard(self):
        """Print a truncated preview (first 400 chars) of the dashboard config."""
        print(f"\n=== Dashboard Config ===")
        print(self.DASHBOARD[:400])

    def comparison(self):
        """Print a one-line-per-tool comparison of observability platforms."""
        print(f"\n=== SigNoz vs Alternatives ===")
        # Pricing figures are indicative examples, not live vendor quotes.
        tools = [
            {"name": "SigNoz", "type": "Self-hosted", "cost": "Free (OSS)", "storage": "ClickHouse"},
            {"name": "Datadog", "type": "SaaS", "cost": "$15-34/host/mo", "storage": "Managed"},
            {"name": "Grafana Stack", "type": "Self/Cloud", "cost": "Free/Cloud pricing", "storage": "Loki+Mimir+Tempo"},
            {"name": "New Relic", "type": "SaaS", "cost": "$0.30/GB ingested", "storage": "Managed"},
        ]
        for t in tools:
            print(f" [{t['name']:<14}] {t['type']:<12} {t['cost']:<20} Storage: {t['storage']}")
# Demo: show setup instructions, the dashboard sample, and the tool comparison.
signoz_setup = SigNozSetup()
signoz_setup.show_setup()
signoz_setup.show_dashboard()
signoz_setup.comparison()
FAQ - คำถามที่พบบ่อย
Q: SigNoz ดีกว่า Datadog ตรงไหน?
A: ข้อดี SigNoz: ฟรี (open source), self-hosted ควบคุม data ได้, ไม่มี per-host pricing, unified UI สำหรับ logs+metrics+traces ข้อด้อย: ต้อง maintain เอง, community เล็กกว่า, features น้อยกว่า Datadog เหมาะ: ทีมที่มี DevOps capability + ต้องการ cost control ไม่เหมาะ: ทีมเล็กไม่อยาก manage infrastructure
Q: Post-mortem ควรทำเมื่อไหร่?
A: ทุก P1/P2 incident: ภายใน 24-48 ชั่วโมงหลัง resolution P3: ถ้า impact สูงหรือเกิดซ้ำ P4: ไม่จำเป็นต้องทำ formal post-mortem ข้อควรจำ: blameless culture — โฟกัสที่ระบบ ไม่ใช่ตัวบุคคล ทำ post-mortem แม้ไม่มี customers ร้องเรียน — near-misses สำคัญ
Q: ClickHouse ใน SigNoz รองรับ data ได้มากแค่ไหน?
A: ClickHouse เก่งมากด้าน analytics: Single node: 100GB-1TB/day ingestion Cluster: 10TB+/day (horizontal scaling) Compression: 10-30x — 1TB raw = 30-100GB on disk Query: sub-second บน billions of rows Retention: กำหนดได้ (TTL) — เช่น traces 7 days, metrics 30 days, logs 15 days
Q: OpenTelemetry จำเป็นไหม?
A: จำเป็นสำหรับ SigNoz — SigNoz รับ data ผ่าน OTLP protocol เท่านั้น ข้อดี: vendor-neutral standard — เปลี่ยน backend ได้ง่าย (SigNoz → Datadog → Grafana) Auto-instrumentation: รองรับ Python, Java, Node.js, Go, .NET — ไม่ต้องเขียน code เยอะ Manual instrumentation: เพิ่ม custom spans, attributes สำหรับ business logic
