QuestDB Time Series Monitoring และ Alerting —

QuestDB Monitoring

QuestDB Time Series Monitoring Alerting SQL SAMPLE BY Grafana Dashboard Telegraf ILP PostgreSQL Alert Notification Production

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง golang package คือ — ข้อมูลครบถ้วน 2026

Component	Role	Protocol	Port
QuestDB	Time Series Database	ILP / PostgreSQL / REST	9009 / 8812 / 9000
Telegraf	Metrics Collection Agent	ILP (InfluxDB Line Protocol)	→ 9009
Grafana	Dashboard + Alerting	PostgreSQL Wire	← 8812
Alertmanager	Alert Routing + Dedup	HTTP API	9093

SQL Queries

# === QuestDB SQL Query Examples ===

# -- CPU Usage per host (5-minute average)
# SELECT hostname, avg(cpu_usage) as avg_cpu,
#        max(cpu_usage) as max_cpu,
#        min(cpu_usage) as min_cpu
# FROM metrics
# WHERE timestamp IN ('2024-01-01', '2024-01-31')
# SAMPLE BY 5m
# ALIGN TO CALENDAR;
#
# -- Latest value per host
# SELECT hostname, cpu_usage, memory_usage, timestamp
# FROM metrics
# LATEST ON timestamp PARTITION BY hostname;
#
# -- Request rate per service (per minute)
# SELECT service, count() as requests,
#        avg(response_time) as avg_latency,
#        max(response_time) as p99_latency
# FROM http_requests
# SAMPLE BY 1m
# FILL(0);
#
# -- Error rate percentage
# SELECT timestamp, service,
#        count() as total,
#        sum(CASE WHEN status >= 500 THEN 1 ELSE 0 END) as errors,
#        (sum(CASE WHEN status >= 500 THEN 1 ELSE 0 END) * 100.0 / count()) as error_rate
# FROM http_requests
# SAMPLE BY 5m;
#
# -- ASOF JOIN (correlate metrics with events)
# SELECT m.timestamp, m.cpu_usage, e.event_type
# FROM metrics m
# ASOF JOIN events e ON (m.hostname = e.hostname);

from dataclasses import dataclass

@dataclass
class QuestDBQuery:
    name: str
    sql_pattern: str
    use_case: str
    performance: str

queries = [
    QuestDBQuery("SAMPLE BY (Time Aggregation)",
        "SELECT avg(value) FROM t SAMPLE BY 5m",
        "รวมข้อมูลตามช่วงเวลา 1s 5m 1h 1d",
        "ล้าน rows ใน <100ms"),
    QuestDBQuery("LATEST ON (Last Value)",
        "SELECT * FROM t LATEST ON ts PARTITION BY host",
        "หาค่าล่าสุดของแต่ละ Host/Service",
        "<10ms ไม่ว่าข้อมูลเยอะแค่ไหน"),
    QuestDBQuery("ASOF JOIN (Time Correlation)",
        "FROM a ASOF JOIN b ON (a.host = b.host)",
        "เชื่อม Time Series ที่เวลาใกล้เคียงกัน",
        "เร็วกว่า Regular JOIN มาก"),
    QuestDBQuery("FILL (Gap Filling)",
        "SAMPLE BY 1m FILL(prev)",
        "เติมค่าช่วงที่ไม่มีข้อมูล prev/null/0/linear",
        "ไม่กระทบ Performance"),
    QuestDBQuery("WHERE timestamp IN (Range)",
        "WHERE ts IN ('2024-01-01','2024-01-31')",
        "กรองช่วงเวลา Partition Pruning อัตโนมัติ",
        "เร็วมาก อ่านเฉพาะ Partition ที่ต้องการ"),
]

print("=== QuestDB Queries ===")
for q in queries:
    print(f"  [{q.name}]")
    print(f"    SQL: {q.sql_pattern}")
    print(f"    Use: {q.use_case}")
    print(f"    Perf: {q.performance}")

Grafana Dashboard

# === Grafana + QuestDB Dashboard Setup ===

# Grafana Data Source Configuration
# Type: PostgreSQL
# Host: questdb-server:8812
# Database: qdb
# User: admin
# Password: quest
# SSL Mode: disable
# Version: 12+

# Panel Query with Grafana Variables
# SELECT $__timeGroup(timestamp, $__interval) as time,
#        hostname,
#        avg(cpu_usage) as cpu
# FROM metrics
# WHERE $__timeFilter(timestamp)
#   AND hostname = '$hostname'
# SAMPLE BY $__sampleByInterval
# ORDER BY time;

@dataclass
class GrafanaPanel:
    panel_type: str
    metric: str
    query: str
    alert_threshold: str

panels = [
    GrafanaPanel("Time Series",
        "CPU Usage (%)",
        "SELECT avg(cpu_usage) FROM metrics SAMPLE BY $__interval",
        "> 80% for 5m → Warning, > 95% → Critical"),
    GrafanaPanel("Time Series",
        "Memory Usage (%)",
        "SELECT avg(memory_pct) FROM metrics SAMPLE BY $__interval",
        "> 85% for 5m → Warning, > 95% → Critical"),
    GrafanaPanel("Stat",
        "Request Rate (req/s)",
        "SELECT count()/$__interval FROM http_requests",
        "< 10 req/s → Warning (ต่ำผิดปกติ)"),
    GrafanaPanel("Gauge",
        "Error Rate (%)",
        "SELECT (errors*100.0/total) FROM http_requests SAMPLE BY 5m",
        "> 1% → Warning, > 5% → Critical"),
    GrafanaPanel("Heatmap",
        "Response Latency (ms)",
        "SELECT response_time FROM http_requests",
        "p99 > 500ms → Warning, > 1000ms → Critical"),
    GrafanaPanel("Table",
        "Top 10 Slow Queries",
        "SELECT service, avg(response_time) ORDER BY 2 DESC LIMIT 10",
        "avg > 2000ms → Alert"),
]

print("=== Grafana Panels ===")
for p in panels:
    print(f"  [{p.panel_type}] {p.metric}")
    print(f"    Query: {p.query}")
    print(f"    Alert: {p.alert_threshold}")

Alert Configuration

# === Alert Rules Configuration ===

@dataclass
class AlertRule:
    name: str
    condition: str
    duration: str
    severity: str
    notification: str
    runbook: str

alerts = [
    AlertRule("High CPU Usage",
        "avg(cpu_usage) > 80%",
        "5 minutes",
        "Warning → Critical (>95%)",
        "Slack #infra-alerts + PagerDuty",
        "Check top processes, scale up if needed"),
    AlertRule("High Memory Usage",
        "avg(memory_pct) > 85%",
        "5 minutes",
        "Warning → Critical (>95%)",
        "Slack #infra-alerts + PagerDuty",
        "Check for memory leaks, restart pods"),
    AlertRule("High Error Rate",
        "error_rate > 1%",
        "3 minutes",
        "Warning → Critical (>5%)",
        "Slack #oncall + PagerDuty + Email",
        "Check logs, rollback if recent deploy"),
    AlertRule("High Latency",
        "p99_latency > 500ms",
        "5 minutes",
        "Warning → Critical (>1000ms)",
        "Slack #performance",
        "Check DB queries, cache hit rate"),
    AlertRule("Disk Full",
        "disk_usage > 80%",
        "1 minute",
        "Warning → Critical (>90%)",
        "Slack + PagerDuty + SMS",
        "Clean logs, extend volume, archive old data"),
    AlertRule("Service Down",
        "up == 0",
        "1 minute",
        "Critical",
        "PagerDuty + SMS + Phone Call",
        "Check pod status, restart, check dependencies"),
]

print("=== Alert Rules ===")
for a in alerts:
    print(f"\n  [{a.name}] Severity: {a.severity}")
    print(f"    Condition: {a.condition} for {a.duration}")
    print(f"    Notify: {a.notification}")
    print(f"    Runbook: {a.runbook}")