ClickHouse Analytics กับ Zero Downtime
Zero Downtime Deployment สำหรับ ClickHouse
Zero Downtime Deployment ให้ Deploy ClickHouse โดยไม่หยุดบริการ ใช้ ReplicatedMergeTree, Distributed Tables, Rolling Updates ผู้ใช้ไม่รู้สึกถึงการเปลี่ยนแปลง
ClickHouse Cluster ที่ออกแบบดี รองรับ Node Failure, Schema Migration, Version Upgrade โดยไม่มี Downtime
ClickHouse Cluster Setup
=== ClickHouse Cluster สำหรับ High Availability ===
1. Docker Compose — 3-node Cluster
docker-compose.yml
version: '3.8'
services:
clickhouse-keeper-1:
image: clickhouse/clickhouse-keeper:latest
ports:
- "9181:9181"
volumes:
- keeper1_data:/var/lib/clickhouse-keeper
- ./keeper_config.xml:/etc/clickhouse-keeper/keeper_config.xml
clickhouse-1:
image: clickhouse/clickhouse-server:latest
ports:
- "8123:8123"
- "9000:9000"
volumes:
- ch1_data:/var/lib/clickhouse
- ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
depends_on:
- clickhouse-keeper-1
clickhouse-2:
image: clickhouse/clickhouse-server:latest
ports:
- "8124:8123"
- "9001:9000"
volumes:
- ch2_data:/var/lib/clickhouse
- ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
depends_on:
- clickhouse-keeper-1
clickhouse-3:
image: clickhouse/clickhouse-server:latest
ports:
- "8125:8123"
- "9002:9000"
volumes:
- ch3_data:/var/lib/clickhouse
- ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
depends_on:
- clickhouse-keeper-1
2. Cluster Configuration
cluster_config.xml
<clickhouse>
<remote_servers>
<analytics_cluster>
<shard>
<replica>
<host>clickhouse-1</host>
<port>9000</port>
</replica>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard>
</analytics_cluster>
</remote_servers>
</clickhouse>
3. ReplicatedMergeTree Table
CREATE TABLE analytics.events ON CLUSTER analytics_cluster (
event_date Date,
event_time DateTime,
user_id UInt64,
event_type LowCardinality(String),
page_url String,
duration_ms UInt32
) ENGINE = ReplicatedMergeTree(
'/clickhouse/tables/{shard}/events',
'{replica}'
)
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id, event_time);
4. Distributed Table
CREATE TABLE analytics.events_distributed ON CLUSTER analytics_cluster
AS analytics.events
ENGINE = Distributed(analytics_cluster, analytics, events, rand());
5. ตรวจสอบ Cluster
SELECT * FROM system.clusters;
SELECT * FROM system.replicas;
echo "ClickHouse Cluster:"
echo " Nodes: 3 (2 shards, 2 replicas)"
echo " Keeper: ClickHouse Keeper"
echo " Engine: ReplicatedMergeTree"
echo " Query: Distributed Table"
Rolling Update Strategy
# rolling_update.py — ClickHouse Rolling Update
from dataclasses import dataclass, field
from typing import List, Dict
from datetime import datetime
from enum import Enum
import time
class NodeStatus(Enum):
HEALTHY = "healthy"
DRAINING = "draining"
UPDATING = "updating"
VERIFYING = "verifying"
@dataclass
class ClusterNode:
name: str
host: str
port: int
version: str
status: NodeStatus = NodeStatus.HEALTHY
replica_lag: int = 0
class RollingUpdateManager:
"""ClickHouse Rolling Update Manager"""
def __init__(self):
self.nodes: List[ClusterNode] = []
self.update_log: List[Dict] = []
def add_node(self, node: ClusterNode):
self.nodes.append(node)
def pre_check(self):
"""ตรวจสอบก่อน Update"""
print(f"\n Pre-update Checks:")
checks = [
("All nodes healthy",
all(n.status == NodeStatus.HEALTHY for n in self.nodes)),
("No replica lag",
all(n.replica_lag == 0 for n in self.nodes)),
("Backup completed", True),
("Rollback plan ready", True),
]
all_pass = True
for check, ok in checks:
status = "PASS" if ok else "FAIL"
print(f" [{status}] {check}")
if not ok:
all_pass = False
return all_pass
def drain_node(self, node: ClusterNode):
"""Drain Node — หยุดรับ Traffic ใหม่"""
node.status = NodeStatus.DRAINING
self.log(f"Draining {node.name}")
# ClickHouse: Remove from Distributed table routing
# ALTER TABLE events_distributed DELETE WHERE 1=0
time.sleep(0.01)
def update_node(self, node: ClusterNode, new_version: str):
"""Update Node"""
node.status = NodeStatus.UPDATING
self.log(f"Updating {node.name}: {node.version} -> {new_version}")
time.sleep(0.01)
node.version = new_version
def verify_node(self, node: ClusterNode):
"""Verify Node หลัง Update"""
node.status = NodeStatus.VERIFYING
self.log(f"Verifying {node.name}")
# ตรวจสอบ
checks = [
f"Version: {node.version}",
"Replication: synced",
"Queries: responding",
"Metrics: normal",
]
for c in checks:
self.log(f" {c}")
node.status = NodeStatus.HEALTHY
node.replica_lag = 0
def rolling_update(self, new_version):
"""Execute Rolling Update"""
print(f"\n{'='*55}")
print(f"Rolling Update: -> {new_version}")
print(f"{'='*55}")
if not self.pre_check():
print("\n ABORT: Pre-checks failed")
return False
for i, node in enumerate(self.nodes):
print(f"\n Node {i+1}/{len(self.nodes)}: {node.name}")
# 1. Drain
self.drain_node(node)
print(f" Drained (stopped receiving queries)")
# 2. Update
self.update_node(node, new_version)
print(f" Updated to {new_version}")
# 3. Verify
self.verify_node(node)
print(f" Verified: HEALTHY")
# 4. Wait for replication sync
print(f" Waiting for replication sync...")
time.sleep(0.01)
print(f"\n Rolling Update Complete!")
print(f" All nodes: {new_version}")
return True
def log(self, message):
self.update_log.append({
"time": datetime.now().isoformat(),
"message": message,
})
# ตัวอย่าง
manager = RollingUpdateManager()
nodes = [
ClusterNode("ch-node-1", "10.0.0.1", 9000, "24.3.1"),
ClusterNode("ch-node-2", "10.0.0.2", 9000, "24.3.1"),
ClusterNode("ch-node-3", "10.0.0.3", 9000, "24.3.1"),
]
for node in nodes:
manager.add_node(node)
manager.rolling_update("24.8.1")
Schema Migration
=== ClickHouse Schema Migration ===
1. Non-breaking Changes (Online)
ALTER TABLE analytics.events ADD COLUMN country String DEFAULT '';
ALTER TABLE analytics.events ADD COLUMN device LowCardinality(String) DEFAULT '';
ALTER TABLE analytics.events MODIFY COLUMN duration_ms UInt64;
2. Breaking Changes — Materialized View Approach
Step 1: สร้าง Table ใหม่
CREATE TABLE analytics.events_v2 (
event_date Date,
event_time DateTime,
user_id UInt64,
event_type LowCardinality(String),
page_url String,
duration_ms UInt32,
country LowCardinality(String),
device LowCardinality(String),
session_id String
) ENGINE = ReplicatedMergeTree(...)
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id, event_time);
Step 2: สร้าง Materialized View รับข้อมูลใหม่
CREATE MATERIALIZED VIEW analytics.events_v2_mv
TO analytics.events_v2
AS SELECT
event_date, event_time, user_id, event_type,
page_url, duration_ms,
'' AS country, '' AS device, '' AS session_id
FROM analytics.events;
Step 3: Backfill ข้อมูลเก่า
INSERT INTO analytics.events_v2
SELECT *, '' AS country, '' AS device, '' AS session_id
FROM analytics.events
WHERE event_date < today();
Step 4: Switch Queries ไปใช้ events_v2
(Update application config)
Step 5: Cleanup
DROP VIEW analytics.events_v2_mv;
RENAME TABLE analytics.events TO analytics.events_old;
RENAME TABLE analytics.events_v2 TO analytics.events;
3. Migration Script
migration_steps = [
{"version": "001", "description": "Add country column",
"sql": "ALTER TABLE events ADD COLUMN country String DEFAULT ''",
"rollback": "ALTER TABLE events DROP COLUMN country"},
{"version": "002", "description": "Add device column",
"sql": "ALTER TABLE events ADD COLUMN device LowCardinality(String) DEFAULT ''",
"rollback": "ALTER TABLE events DROP COLUMN device"},
{"version": "003", "description": "Add TTL 1 year",
"sql": "ALTER TABLE events MODIFY TTL event_date + INTERVAL 1 YEAR",
"rollback": "ALTER TABLE events REMOVE TTL"},
]
print("Schema Migrations:")
for m in migration_steps:
print(f" [{m['version']}] {m['description']}")
print(f" SQL: {m['sql'][:60]}...")
print(f" Rollback: {m['rollback'][:60]}...")
Best Practices
- ReplicatedMergeTree: ใช้ ReplicatedMergeTree เสมอใน Production สำหรับ HA
- Rolling Updates: Update ทีละ Node ตรวจสอบ Replication Sync ก่อนไปตัวถัดไป
- Schema Migration: ใช้ ALTER TABLE สำหรับ Non-breaking Materialized View สำหรับ Breaking
- Backup ก่อน: Backup ข้อมูลก่อน Update หรือ Migration ทุกครั้ง
- Health Checks: ตรวจ system.replicas และ system.mutations หลัง Update
- Rollback Plan: เตรียม Rollback Plan สำหรับทุก Migration Step
Zero Downtime Deployment คืออะไร
Deploy Application Database ไม่หยุดบริการ ผู้ใช้ไม่รู้สึก Rolling Updates Blue-Green Canary ClickHouse ใช้ ReplicatedMergeTree Distributed Tables