QuestDB Time Series Monitoring และ Alerting —
QuestDB Monitoring
QuestDB Time Series Monitoring Alerting SQL SAMPLE BY Grafana Dashboard Telegraf ILP PostgreSQL Alert Notification Production
| Component | Role | Protocol | Port |
|---|---|---|---|
| QuestDB | Time Series Database | ILP / PostgreSQL / REST | 9009 / 8812 / 9000 |
| Telegraf | Metrics Collection Agent | ILP (InfluxDB Line Protocol) | → 9009 |
| Grafana | Dashboard + Alerting | PostgreSQL Wire | ← 8812 |
| Alertmanager | Alert Routing + Dedup | HTTP API | 9093 |
SQL Queries
# === QuestDB SQL Query Examples ===
# -- CPU Usage per host (5-minute average)
# SELECT hostname, avg(cpu_usage) as avg_cpu,
# max(cpu_usage) as max_cpu,
# min(cpu_usage) as min_cpu
# FROM metrics
# WHERE timestamp IN ('2024-01-01', '2024-01-31')
# SAMPLE BY 5m
# ALIGN TO CALENDAR;
#
# -- Latest value per host
# SELECT hostname, cpu_usage, memory_usage, timestamp
# FROM metrics
# LATEST ON timestamp PARTITION BY hostname;
#
# -- Request rate per service (per minute)
# SELECT service, count() as requests,
# avg(response_time) as avg_latency,
# max(response_time) as p99_latency
# FROM http_requests
# SAMPLE BY 1m
# FILL(0);
#
# -- Error rate percentage
# SELECT timestamp, service,
# count() as total,
# sum(CASE WHEN status >= 500 THEN 1 ELSE 0 END) as errors,
# (sum(CASE WHEN status >= 500 THEN 1 ELSE 0 END) * 100.0 / count()) as error_rate
# FROM http_requests
# SAMPLE BY 5m;
#
# -- ASOF JOIN (correlate metrics with events)
# SELECT m.timestamp, m.cpu_usage, e.event_type
# FROM metrics m
# ASOF JOIN events e ON (m.hostname = e.hostname);
from dataclasses import dataclass
@dataclass
class QuestDBQuery:
name: str
sql_pattern: str
use_case: str
performance: str
queries = [
QuestDBQuery("SAMPLE BY (Time Aggregation)",
"SELECT avg(value) FROM t SAMPLE BY 5m",
"รวมข้อมูลตามช่วงเวลา 1s 5m 1h 1d",
"ล้าน rows ใน <100ms"),
QuestDBQuery("LATEST ON (Last Value)",
"SELECT * FROM t LATEST ON ts PARTITION BY host",
"หาค่าล่าสุดของแต่ละ Host/Service",
"<10ms ไม่ว่าข้อมูลเยอะแค่ไหน"),
QuestDBQuery("ASOF JOIN (Time Correlation)",
"FROM a ASOF JOIN b ON (a.host = b.host)",
"เชื่อม Time Series ที่เวลาใกล้เคียงกัน",
"เร็วกว่า Regular JOIN มาก"),
QuestDBQuery("FILL (Gap Filling)",
"SAMPLE BY 1m FILL(prev)",
"เติมค่าช่วงที่ไม่มีข้อมูล prev/null/0/linear",
"ไม่กระทบ Performance"),
QuestDBQuery("WHERE timestamp IN (Range)",
"WHERE ts IN ('2024-01-01','2024-01-31')",
"กรองช่วงเวลา Partition Pruning อัตโนมัติ",
"เร็วมาก อ่านเฉพาะ Partition ที่ต้องการ"),
]
print("=== QuestDB Queries ===")
for q in queries:
print(f" [{q.name}]")
print(f" SQL: {q.sql_pattern}")
print(f" Use: {q.use_case}")
print(f" Perf: {q.performance}")
Grafana Dashboard
# === Grafana + QuestDB Dashboard Setup ===
# Grafana Data Source Configuration
# Type: PostgreSQL
# Host: questdb-server:8812
# Database: qdb
# User: admin
# Password: quest
# SSL Mode: disable
# Version: 12+
# Panel Query with Grafana Variables
# SELECT $__timeGroup(timestamp, $__interval) as time,
# hostname,
# avg(cpu_usage) as cpu
# FROM metrics
# WHERE $__timeFilter(timestamp)
# AND hostname = '$hostname'
# SAMPLE BY $__sampleByInterval
# ORDER BY time;
@dataclass
class GrafanaPanel:
panel_type: str
metric: str
query: str
alert_threshold: str
panels = [
GrafanaPanel("Time Series",
"CPU Usage (%)",
"SELECT avg(cpu_usage) FROM metrics SAMPLE BY $__interval",
"> 80% for 5m → Warning, > 95% → Critical"),
GrafanaPanel("Time Series",
"Memory Usage (%)",
"SELECT avg(memory_pct) FROM metrics SAMPLE BY $__interval",
"> 85% for 5m → Warning, > 95% → Critical"),
GrafanaPanel("Stat",
"Request Rate (req/s)",
"SELECT count()/$__interval FROM http_requests",
"< 10 req/s → Warning (ต่ำผิดปกติ)"),
GrafanaPanel("Gauge",
"Error Rate (%)",
"SELECT (errors*100.0/total) FROM http_requests SAMPLE BY 5m",
"> 1% → Warning, > 5% → Critical"),
GrafanaPanel("Heatmap",
"Response Latency (ms)",
"SELECT response_time FROM http_requests",
"p99 > 500ms → Warning, > 1000ms → Critical"),
GrafanaPanel("Table",
"Top 10 Slow Queries",
"SELECT service, avg(response_time) ORDER BY 2 DESC LIMIT 10",
"avg > 2000ms → Alert"),
]
print("=== Grafana Panels ===")
for p in panels:
print(f" [{p.panel_type}] {p.metric}")
print(f" Query: {p.query}")
print(f" Alert: {p.alert_threshold}")
Alert Configuration
# === Alert Rules Configuration ===
@dataclass
class AlertRule:
name: str
condition: str
duration: str
severity: str
notification: str
runbook: str
alerts = [
AlertRule("High CPU Usage",
"avg(cpu_usage) > 80%",
"5 minutes",
"Warning → Critical (>95%)",
"Slack #infra-alerts + PagerDuty",
"Check top processes, scale up if needed"),
AlertRule("High Memory Usage",
"avg(memory_pct) > 85%",
"5 minutes",
"Warning → Critical (>95%)",
"Slack #infra-alerts + PagerDuty",
"Check for memory leaks, restart pods"),
AlertRule("High Error Rate",
"error_rate > 1%",
"3 minutes",
"Warning → Critical (>5%)",
"Slack #oncall + PagerDuty + Email",
"Check logs, rollback if recent deploy"),
AlertRule("High Latency",
"p99_latency > 500ms",
"5 minutes",
"Warning → Critical (>1000ms)",
"Slack #performance",
"Check DB queries, cache hit rate"),
AlertRule("Disk Full",
"disk_usage > 80%",
"1 minute",
"Warning → Critical (>90%)",
"Slack + PagerDuty + SMS",
"Clean logs, extend volume, archive old data"),
AlertRule("Service Down",
"up == 0",
"1 minute",
"Critical",
"PagerDuty + SMS + Phone Call",
"Check pod status, restart, check dependencies"),
]
print("=== Alert Rules ===")
for a in alerts:
print(f"\n [{a.name}] Severity: {a.severity}")
print(f" Condition: {a.condition} for {a.duration}")
print(f" Notify: {a.notification}")
print(f" Runbook: {a.runbook}")
เคล็ดลับ
- SAMPLE BY: ใช้ SAMPLE BY แทน GROUP BY สำหรับ Time Aggregation
- LATEST ON: ใช้ LATEST ON หาค่าล่าสุดเร็วมาก
- Partition: ตั้ง Partition ตามวัน สัปดาห์ เดือน ตาม Data Volume
- ILP: ใช้ ILP Protocol Ingest ข้อมูลเร็วกว่า SQL INSERT มาก
- Retention: ตั้ง Data Retention ลบข้อมูลเก่าอัตโนมัติ
QuestDB คืออะไร
Time Series Database SQL SAMPLE BY LATEST ON ILP PostgreSQL REST High Performance Open Source Java C Monitoring IoT Financial