ทำไมต้อง Scale Monitoring Infrastructure
เมื่อ infrastructure เติบโตจาก servers ไม่กี่ตัวเป็นหลายร้อยหรือหลายพัน monitoring ต้อง scale ตาม ปัญหาที่พบเมื่อ monitoring ไม่ scale ได้แก่ alert fatigue จากการแจ้งเตือนมากเกินไป, missed alerts เพราะ monitors ไม่ครอบคลุม, slow response time ของ monitoring dashboard, high cost จาก inefficient monitor configuration และ inconsistent monitoring standards ระหว่างทีม
Scaling Strategy สำหรับ monitoring ครอบคลุม Monitor Organization จัดกลุ่ม monitors ตาม service, team, environment, Alert Routing ส่ง alerts ไปหาคนที่เกี่ยวข้อง, Automation สร้างและจัดการ monitors อัตโนมัติ, Multi-Region monitoring จากหลาย locations และ Cost Management optimize จำนวนและ frequency ของ monitors
Better Uptime รองรับ scaling ด้วย API สำหรับ automate monitor management, teams และ escalation policies สำหรับ alert routing, status page groups สำหรับจัดระเบียบ services, integrations สำหรับ connect กับ existing tools และ Terraform provider สำหรับ Infrastructure as Code
Horizontal Scaling สำหรับ Monitoring
วิธี scale monitoring infrastructure
# -----------------------------------------------------------------------------
# Reference notes: a four-tier monitoring architecture, scaling rules of thumb,
# and a sketch of a Terraform module that provisions Better Uptime monitors.
# Everything below except the final `echo` is documentation, not executable code.
# -----------------------------------------------------------------------------
# === Monitoring Scaling Architecture ===
# Tier 1: External Monitoring (Better Uptime)
# ===================================
# - HTTP/HTTPS endpoint checks
# - SSL certificate monitoring
# - Domain expiration monitoring
# - Status pages for customers
# - Heartbeat monitoring for cron jobs
# Tier 2: Infrastructure Monitoring (Prometheus + Grafana)
# ===================================
# - Server metrics (CPU, memory, disk, network)
# - Container metrics (Docker, Kubernetes)
# - Database metrics (connections, queries, replication)
# - Custom application metrics
# Tier 3: Application Performance (APM)
# ===================================
# - Request latency (p50, p95, p99)
# - Error rates
# - Transaction traces
# - Database query performance
# Tier 4: Log Monitoring
# ===================================
# - Error log aggregation
# - Security event monitoring
# - Audit trail
# - Pattern-based alerting
# === Scaling Rules ===
# Monitor Frequency by Criticality:
# Critical (revenue-impacting): 30 seconds
# High (user-facing): 60 seconds
# Medium (internal services): 300 seconds (5 min)
# Low (documentation, blogs): 900 seconds (15 min)
# Alert Channels by Severity:
# P1 Critical: Phone call + SMS + Slack + PagerDuty
# P2 High: SMS + Slack + Email
# P3 Medium: Slack + Email
# P4 Low: Email only
# Monitor Count Guidelines:
# Per microservice: 3-5 monitors (health, latency, error rate, dependencies)
# Per database: 2-3 monitors (connectivity, replication lag, disk)
# Per queue: 2 monitors (depth, consumer lag)
# Per external API: 1-2 monitors (availability, response time)
# === Terraform Module for Scaling ===
# modules/monitoring/main.tf
#
# variable "services" {
# type = list(object({
# name = string
# url = string
# criticality = string # critical, high, medium, low
# team = string
# }))
# }
#
# locals {
# frequency_map = {
# critical = 30
# high = 60
# medium = 300
# low = 900
# }
# }
#
# resource "betteruptime_monitor" "service" {
# for_each = { for s in var.services : s.name => s }
#
# url = each.value.url
# monitor_type = "status"
# check_frequency = local.frequency_map[each.value.criticality]
# request_timeout = 15
# regions = ["us", "eu", "asia"]
#
# call = each.value.criticality == "critical"
# sms = contains(["critical", "high"], each.value.criticality)
# email = true
# }
# Marker so running this file produces visible output.
echo "Monitoring scaling architecture defined"
สร้าง Multi-Region Monitoring Architecture
Monitor services จากหลาย regions
#!/usr/bin/env python3
# multi_region_monitor.py — Multi-Region Monitoring Setup
import requests
import json
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
# Configure root logging once at import time; use a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("multi_region")
@dataclass
class ServiceEndpoint:
    """One HTTP endpoint to monitor, plus the routing metadata used to
    derive its check frequency and alert channels."""
    name: str                    # human-readable service name
    url: str                     # full URL probed by the monitor
    region: str                  # primary user region: "asia", "us", "eu" or "global"
    criticality: str             # one of "critical", "high", "medium", "low"
    team: str                    # owning team (used for alert routing)
    expected_status: int = 200   # HTTP status code that counts as "up"
    timeout: int = 15            # monitor request timeout in seconds
class MultiRegionMonitoring:
    """Provision Better Uptime monitors and status pages across regions.

    Thin synchronous client over the Better Uptime v2 REST API; the only
    state held is the auth headers derived from the API token.
    """

    BASE_URL = "https://betteruptime.com/api/v2"
    # requests has NO default timeout; without one a stuck API call hangs forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_token):
        # Bearer-token auth with JSON bodies on every call.
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }

    def _post(self, path, data):
        """POST `data` as JSON to BASE_URL+path; raise on HTTP errors, return parsed JSON."""
        resp = requests.post(
            f"{self.BASE_URL}{path}",
            headers=self.headers, json=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def _get(self, path):
        """GET BASE_URL+path; raise on HTTP errors, return parsed JSON."""
        resp = requests.get(
            f"{self.BASE_URL}{path}",
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def setup_service_monitoring(self, endpoints: List["ServiceEndpoint"]):
        """Create one expected-status-code monitor per endpoint.

        Check frequency is mapped from criticality (unknown values fall back
        to 300 s); phone calls only for "critical", SMS for critical/high.
        A failure on one endpoint is logged and does not abort the batch.
        Returns a list of {name, region, monitor_id} for created monitors.
        """
        frequency_map = {"critical": 30, "high": 60, "medium": 300, "low": 900}
        created = []
        for ep in endpoints:
            config = {
                "url": ep.url,
                "monitor_type": "expected_status_code",
                "expected_status_codes": [ep.expected_status],
                "check_frequency": frequency_map.get(ep.criticality, 300),
                "request_timeout": ep.timeout,
                "regions": self._get_regions(ep.region),
                "call": ep.criticality == "critical",
                "sms": ep.criticality in ("critical", "high"),
                "email": True,
                "push": True,
            }
            try:
                result = self._post("/monitors", config)
                monitor_id = result.get("data", {}).get("id")
                created.append({
                    "name": ep.name,
                    "region": ep.region,
                    "monitor_id": monitor_id,
                })
                logger.info(f"Created monitor: {ep.name} ({ep.region}) -> {monitor_id}")
            except Exception as e:
                # Best-effort batch: keep provisioning the remaining endpoints.
                logger.error(f"Failed: {ep.name} - {e}")
        return created

    def _get_regions(self, primary_region):
        """Return the list of check regions with the primary region first.

        Unknown region names fall back to the global ["us", "eu", "asia"] set.
        """
        region_map = {
            "asia": ["asia", "us", "eu"],
            "us": ["us", "eu", "asia"],
            "eu": ["eu", "us", "asia"],
            "global": ["us", "eu", "asia"],
        }
        return region_map.get(primary_region, ["us", "eu", "asia"])

    def setup_status_page(self, name, services):
        """Create a subscribable status page and attach one resource per service.

        The subdomain is derived from `name` (lowercased, spaces -> dashes);
        each item in `services` must carry a "display_name" key.
        Returns the new page id.
        """
        page = self._post("/status-pages", {
            "company_name": name,
            "subdomain": name.lower().replace(" ", "-"),
            "timezone": "Asia/Bangkok",
            "subscribable": True,
        })
        page_id = page["data"]["id"]
        for svc in services:
            # NOTE(review): this payload carries no resource_id linking the
            # row to a concrete monitor — confirm the Better Uptime resources
            # endpoint accepts a public_name-only resource.
            self._post(f"/status-pages/{page_id}/resources", {
                "resource_type": "Monitor",
                "public_name": svc["display_name"],
                "widget_type": "history",
            })
        logger.info(f"Status page created: {page_id}")
        return page_id

    def get_global_status(self):
        """Summarize monitor states into up/down/paused counts plus availability %.

        NOTE(review): reads a single GET /monitors response; if the account has
        more monitors than one API page returns, this undercounts — confirm.
        """
        monitors = self._get("/monitors").get("data", [])
        status = {"total": 0, "up": 0, "down": 0, "paused": 0, "by_region": {}}
        for m in monitors:
            attrs = m.get("attributes", {})
            status["total"] += 1
            if attrs.get("status") == "up":
                status["up"] += 1
            elif attrs.get("status") == "down":
                status["down"] += 1
            else:
                # Anything neither "up" nor "down" is bucketed as paused.
                status["paused"] += 1
        # max(..., 1) avoids ZeroDivisionError when there are no monitors.
        status["availability_pct"] = round(
            status["up"] / max(status["total"], 1) * 100, 2
        )
        return status
# monitoring = MultiRegionMonitoring("api-token")
# endpoints = [
# ServiceEndpoint("API Asia", "https://api-asia.example.com/health", "asia", "critical", "platform"),
# ServiceEndpoint("API US", "https://api-us.example.com/health", "us", "critical", "platform"),
# ServiceEndpoint("API EU", "https://api-eu.example.com/health", "eu", "critical", "platform"),
# ServiceEndpoint("Web App", "https://app.example.com", "global", "high", "frontend"),
# ServiceEndpoint("Docs", "https://docs.example.com", "global", "low", "docs"),
# ]
# monitoring.setup_service_monitoring(endpoints)
Automation สำหรับ Scaling Monitors
Automate monitor lifecycle management
#!/usr/bin/env python3
# monitor_automation.py — Automated Monitor Scaling
import requests
import json
import yaml
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, List
# Configure root logging once at import time; use a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("auto_scale")
class MonitorAutomation:
    """Monitor lifecycle automation against the Better Uptime v2 API:
    YAML config sync, Kubernetes service discovery, and config export."""

    BASE_URL = "https://betteruptime.com/api/v2"
    # requests has no default timeout; bound every API call so it cannot hang.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_token):
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        # None means "not fetched yet"; an empty list is a valid cached result.
        self._monitors_cache = None

    def _request(self, method, path, data=None):
        """Issue one API request; raise on HTTP errors.

        Returns parsed JSON, or {} for empty bodies (e.g. DELETE responses).
        """
        resp = requests.request(
            method, f"{self.BASE_URL}{path}",
            headers=self.headers, json=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json() if resp.content else {}

    def get_all_monitors(self):
        """Return every monitor, following page-numbered pagination; cached.

        Bug fix: the cache test was `if self._monitors_cache:`, which treated
        a cached empty list as a miss and refetched on every call.
        """
        if self._monitors_cache is not None:
            return self._monitors_cache
        all_monitors = []
        page = 1
        while True:
            data = self._request("GET", f"/monitors?page={page}")
            monitors = data.get("data", [])
            if not monitors:
                break
            all_monitors.extend(monitors)
            page += 1
        self._monitors_cache = all_monitors
        return all_monitors

    def sync_from_config(self, config_file):
        """Reconcile live monitors against a YAML config file, keyed by URL.

        Creates monitors missing from the account; optionally deletes
        unmanaged ones when the config sets `remove_unmanaged: true`.
        Returns {"created", "updated", "removed"} counters.
        """
        config = yaml.safe_load(Path(config_file).read_text())
        existing = {
            m["attributes"]["url"]: m
            for m in self.get_all_monitors()
        }
        desired = config.get("monitors", [])
        created, updated, removed = 0, 0, 0
        desired_urls = set()
        for monitor_config in desired:
            url = monitor_config["url"]
            desired_urls.add(url)
            if url in existing:
                # TODO(review): this counts a match as "updated" but never
                # PATCHes changed fields — implement a diff + update, or treat
                # the counter as "matched".
                logger.info(f"Monitor exists: {url}")
                updated += 1
            else:
                self._request("POST", "/monitors", monitor_config)
                logger.info(f"Created: {url}")
                created += 1
        # Deletion is opt-in so an incomplete config cannot wipe the account.
        if config.get("remove_unmanaged", False):
            for url, monitor in existing.items():
                if url not in desired_urls:
                    monitor_id = monitor["id"]
                    self._request("DELETE", f"/monitors/{monitor_id}")
                    logger.info(f"Removed: {url}")
                    removed += 1
        return {"created": created, "updated": updated, "removed": removed}

    def auto_discover_kubernetes(self, kubeconfig=None):
        """Discover annotated Kubernetes Services and create a monitor for each.

        Services opt in with annotation `monitoring/enabled: "true"` and must
        provide `monitoring/url`; `monitoring/criticality` tunes the check
        frequency. Returns the list of monitor payloads that were attempted.
        """
        try:
            # Imported lazily: the kubernetes client is an optional dependency.
            from kubernetes import client, config
            if kubeconfig:
                config.load_kube_config(kubeconfig)
            else:
                # No kubeconfig given: assume we run inside the cluster.
                config.load_incluster_config()
            v1 = client.CoreV1Api()
            services = v1.list_service_for_all_namespaces()
            monitors = []
            for svc in services.items:
                annotations = svc.metadata.annotations or {}
                # Only monitor services that explicitly opted in.
                if annotations.get("monitoring/enabled") != "true":
                    continue
                url = annotations.get("monitoring/url", "")
                criticality = annotations.get("monitoring/criticality", "medium")
                if url:
                    monitors.append({
                        "url": url,
                        "monitor_type": "expected_status_code",
                        "expected_status_codes": [200],
                        "check_frequency": {"critical": 30, "high": 60}.get(criticality, 300),
                        "email": True,
                        "sms": criticality in ("critical", "high"),
                    })
            for m in monitors:
                try:
                    self._request("POST", "/monitors", m)
                    logger.info(f"Auto-discovered: {m['url']}")
                except Exception as e:
                    # Best-effort: one failed create should not stop the rest.
                    logger.error(f"Failed: {m['url']} - {e}")
            return monitors
        except ImportError:
            logger.error("kubernetes package not installed")
            return []

    def generate_config_template(self, output="monitors.yaml"):
        """Dump current monitors to a YAML file usable with sync_from_config()."""
        monitors = self.get_all_monitors()
        config = {"monitors": []}
        for m in monitors:
            attrs = m.get("attributes", {})
            config["monitors"].append({
                "url": attrs.get("url"),
                "monitor_type": attrs.get("monitor_type"),
                "check_frequency": attrs.get("check_frequency"),
                "request_timeout": attrs.get("request_timeout"),
                "email": attrs.get("email"),
                "sms": attrs.get("sms"),
                "call": attrs.get("call"),
            })
        Path(output).write_text(yaml.dump(config, default_flow_style=False))
        logger.info(f"Config template saved to {output}")
# auto = MonitorAutomation("api-token")
# auto.sync_from_config("monitors.yaml")
# auto.generate_config_template()
Alert Routing และ Escalation Policies
จัดการ alert routing สำหรับ large teams
#!/usr/bin/env python3
# alert_routing.py — Alert Routing and Escalation
import json
import logging
from datetime import datetime, time
from typing import Dict, List, Optional
from dataclasses import dataclass, field
# Configure root logging once at import time; use a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("alerts")
@dataclass
class TeamMember:
    """A person that alerts can be delivered to."""
    name: str
    email: str
    phone: Optional[str] = None      # E.164 number; None means no phone/SMS alerts
    slack_id: Optional[str] = None   # Slack user ID, e.g. "U001"
    role: str = "engineer"           # free-form role label, e.g. "lead"
@dataclass
class Team:
    """A team, its alert channel, and the services it owns."""
    name: str
    members: List[TeamMember] = field(default_factory=list)
    slack_channel: str = ""                             # e.g. "#platform-alerts"
    services: List[str] = field(default_factory=list)   # service names this team owns
@dataclass
class EscalationPolicy:
    """Named escalation policy: an ordered list of free-form step dicts.

    Registered via AlertRouter.add_escalation_policy.
    """
    name: str
    steps: List[Dict] = field(default_factory=list)
class AlertRouter:
    """Maps services to their owning teams and expands an alert into the
    concrete delivery actions for its severity level."""

    # Delivery channels used per severity, most urgent first. Severities not
    # listed here (e.g. "low") fall back to email only.
    _SEVERITY_CHANNELS = {
        "critical": ("phone", "sms", "slack", "email"),
        "high": ("sms", "slack", "email"),
        "medium": ("slack", "email"),
    }

    def __init__(self):
        self.teams = {}              # team name -> Team
        self.policies = {}           # policy name -> EscalationPolicy
        self.service_ownership = {}  # service name -> team name

    def add_team(self, team):
        """Register a team and record it as owner of each of its services."""
        self.teams[team.name] = team
        for owned in team.services:
            self.service_ownership[owned] = team.name

    def add_escalation_policy(self, policy):
        """Register an escalation policy under its name."""
        self.policies[policy.name] = policy

    def _action_for(self, channel, team):
        """Build one delivery action of the given channel type for a team."""
        if channel in ("phone", "sms"):
            # Only members with a phone number can receive calls/SMS.
            return {"type": channel, "targets": [m.phone for m in team.members if m.phone]}
        if channel == "slack":
            return {"type": "slack", "channel": team.slack_channel}
        return {"type": "email", "targets": [m.email for m in team.members]}

    def route_alert(self, service_name, severity, alert_data):
        """Resolve the owning team and return the routing decision for an alert."""
        owner = self.service_ownership.get(service_name)
        if not owner:
            logger.warning(f"No team owns service: {service_name}")
            return self._default_routing(alert_data)
        team = self.teams[owner]
        channels = self._SEVERITY_CHANNELS.get(severity, ("email",))
        decision = {
            "service": service_name,
            "team": owner,
            "severity": severity,
            "actions": [self._action_for(c, team) for c in channels],
            "timestamp": datetime.utcnow().isoformat(),
        }
        logger.info(f"Alert routed: {service_name} ({severity}) -> {owner}")
        return decision

    def _default_routing(self, alert_data):
        """Fallback routing when no team owns the alerting service."""
        return {
            "team": "oncall",
            "actions": [{"type": "email", "targets": ["oncall@example.com"]}],
        }

    def get_routing_map(self):
        """Return {service: {team, members, slack}} for every owned service."""
        return {
            svc: {
                "team": owner,
                "members": [m.name for m in self.teams[owner].members],
                "slack": self.teams[owner].slack_channel,
            }
            for svc, owner in self.service_ownership.items()
        }
# --- Demo: wire up two teams and route one critical alert -------------------
router = AlertRouter()

platform_team = Team(
    "platform",
    [
        TeamMember("Alice", "alice@example.com", "+66812345678", "U001", "lead"),
        TeamMember("Bob", "bob@example.com", "+66823456789", "U002"),
    ],
    "#platform-alerts",
    ["api-gateway", "auth-service", "database"],
)
frontend_team = Team(
    "frontend",
    [
        TeamMember("Charlie", "charlie@example.com", "+66834567890", "U003"),
        TeamMember("Diana", "diana@example.com", None, "U004"),
    ],
    "#frontend-alerts",
    ["web-app", "cdn", "static-assets"],
)
for squad in (platform_team, frontend_team):
    router.add_team(squad)

# Route a critical alert and show the resulting delivery plan.
result = router.route_alert("api-gateway", "critical", {"message": "API down"})
print(json.dumps(result, indent=2))
Cost Optimization และ Capacity Planning
Optimize monitoring costs
# === Monitoring Cost Optimization ===
# 1. Right-size Check Frequencies
# ===================================
# Problem: Checking everything every 30 seconds is expensive
# Solution: Match frequency to criticality
#
# Before optimization:
# 200 monitors x 30s checks = $400/month
#
# After optimization:
# 20 critical x 30s = $80/month
# 30 high x 60s = $60/month
# 100 medium x 300s = $50/month
# 50 low x 900s = $10/month
# Total: $200/month (50% savings)
# 2. Consolidate Monitors
# ===================================
# Instead of: separate monitors for /, /about, /contact, /blog
# Use: single monitor for health endpoint /health
# Plus: synthetic monitoring for critical user journeys
# 3. Use Heartbeats vs HTTP Monitors
# ===================================
# Heartbeats are cheaper than HTTP monitors
# Use for: cron jobs, background workers, batch processes
# Instead of: polling endpoints that don't need it
# 4. Capacity Planning Script
#!/usr/bin/env python3
# capacity_planning.py
import json
from datetime import datetime
class MonitoringCapacityPlanner:
    """Estimates monitor counts for a service inventory, recommends the
    cheapest fitting pricing plan, and forecasts growth over time."""

    # Plan catalogue: monitor quota, minimum check frequency (s), monthly USD.
    PRICING = {
        "free": {"monitors": 10, "frequency_min": 180, "cost": 0},
        "starter": {"monitors": 50, "frequency_min": 60, "cost": 24},
        "business": {"monitors": 200, "frequency_min": 30, "cost": 69},
        "enterprise": {"monitors": 999, "frequency_min": 30, "cost": 199},
    }

    def __init__(self):
        # Each service is a dict with "name" and optional
        # "monitors_per_instance" (default 3) and "instances" (default 1).
        self.services = []

    def add_services(self, services):
        """Append service descriptors to the inventory."""
        self.services.extend(services)

    def _plans_by_cost(self):
        """Return (name, info) plan pairs sorted cheapest-first."""
        return sorted(self.PRICING.items(), key=lambda item: item[1]["cost"])

    def calculate_monitors_needed(self):
        """Return the total monitor count and a per-service breakdown."""
        counts = [
            (svc["name"], svc.get("monitors_per_instance", 3) * svc.get("instances", 1))
            for svc in self.services
        ]
        # Total sums every entry; the breakdown keeps the last entry per name.
        return {"total": sum(n for _, n in counts), "breakdown": dict(counts)}

    def recommend_plan(self):
        """Pick the cheapest plan whose monitor quota covers the current need."""
        required = self.calculate_monitors_needed()["total"]
        for plan_name, plan in self._plans_by_cost():
            if plan["monitors"] >= required:
                return {
                    "plan": plan_name,
                    "monitors_needed": required,
                    "monitors_available": plan["monitors"],
                    "monthly_cost": plan["cost"],
                    "annual_cost": plan["cost"] * 12,
                    # max(..., 1) guards a zero-service inventory.
                    "cost_per_monitor": round(plan["cost"] / max(required, 1), 2),
                    "headroom": plan["monitors"] - required,
                }
        # No catalogue plan is large enough.
        return {"plan": "enterprise_custom", "monitors_needed": required}

    def forecast_growth(self, months=12, growth_rate=0.1):
        """Project monitor count and plan cost per month under compound growth."""
        baseline = self.calculate_monitors_needed()["total"]
        projection = []
        for month in range(1, months + 1):
            projected = int(baseline * (1 + growth_rate) ** month)
            fitting = next(
                (pair for pair in self._plans_by_cost() if pair[1]["monitors"] >= projected),
                None,
            )
            projection.append({
                "month": month,
                "monitors": projected,
                "plan": fitting[0] if fitting else "enterprise_custom",
                "monthly_cost": fitting[1]["cost"] if fitting else 0,
            })
        return projection
# --- Demo: size a small stack and print the sizing reports ------------------
planner = MonitoringCapacityPlanner()
planner.add_services([
    {"name": "API Gateway", "monitors_per_instance": 3, "instances": 3},
    {"name": "Web App", "monitors_per_instance": 2, "instances": 2},
    {"name": "Database", "monitors_per_instance": 2, "instances": 2},
    {"name": "Cache", "monitors_per_instance": 1, "instances": 3},
    {"name": "Queue", "monitors_per_instance": 2, "instances": 1},
    {"name": "Cron Jobs", "monitors_per_instance": 5, "instances": 1},
])
reports = [
    ("Monitors needed:", planner.calculate_monitors_needed()),
    ("Recommended plan:", planner.recommend_plan()),
    ("12-month forecast:", planner.forecast_growth(12, 0.15)),
]
for label, payload in reports:
    print(label, json.dumps(payload, indent=2))
FAQ คำถามที่พบบ่อย
Q: Monitor กี่ตัวถึงจะเรียกว่า scale ใหญ่?
A: สำหรับ Better Uptime 50-100 monitors ถือว่า medium scale ต้องเริ่มจัด organization ดี 100-500 monitors ถือว่า large ต้องใช้ automation และ IaC 500+ monitors ถือว่า enterprise scale ต้องมี dedicated monitoring team สำหรับ self-hosted monitoring (Prometheus) 1000+ targets ต้อง shard Prometheus instances
Q: จะลด alert fatigue อย่างไร?
A: ตั้ง confirmation period ให้เหมาะสม (ไม่ alert จาก single failure), group related alerts เข้าด้วยกัน (เช่น database down ไม่ต้อง alert ทุก service ที่ depend), ใช้ severity levels อย่างถูกต้อง (ไม่ทำทุกอย่างเป็น critical), ตั้ง maintenance windows สำหรับ planned work, review และ tune alert thresholds เป็นประจำ (monthly) และ implement alert deduplication
Q: Infrastructure as Code สำหรับ monitoring คุ้มไหม?
A: คุ้มมากเมื่อมี monitors มากกว่า 20 ตัว ข้อดีคือ reproducible (สร้าง monitoring setup เดิมได้ทุกครั้ง), version controlled (track changes ใน git), reviewable (code review สำหรับ monitoring changes), scalable (เพิ่ม monitors ด้วย config เพียงไม่กี่บรรทัด) ใช้ Terraform Better Uptime provider หรือ API scripts สำหรับ automation
Q: Multi-region monitoring จำเป็นไหม?
A: จำเป็นสำหรับ services ที่ serve users หลาย regions ถ้า monitor จาก region เดียว อาจ miss regional outages เช่น CDN failure ใน Asia ที่ไม่กระทบ US Better Uptime check จากหลาย locations อัตโนมัติ แต่ควร configure ให้ check จาก regions ที่ users อยู่จริง สำหรับ services ที่ serve เฉพาะ Thailand monitor จาก Asia + 1 backup region เพียงพอ
