Zero Downtime Deployment สำหรับ ClickHouse
Zero Downtime Deployment ให้ Deploy ClickHouse โดยไม่หยุดบริการ ใช้ ReplicatedMergeTree, Distributed Tables, Rolling Updates ผู้ใช้ไม่รู้สึกถึงการเปลี่ยนแปลง
ClickHouse Cluster ที่ออกแบบดี รองรับ Node Failure, Schema Migration, Version Upgrade โดยไม่มี Downtime
ClickHouse Cluster Setup
# === ClickHouse Cluster สำหรับ High Availability ===
# 1. Docker Compose — 3-node Cluster
# docker-compose.yml
# version: '3.8'
# services:
# clickhouse-keeper-1:
# image: clickhouse/clickhouse-keeper:latest
# ports:
# - "9181:9181"
# volumes:
# - keeper1_data:/var/lib/clickhouse-keeper
# - ./keeper_config.xml:/etc/clickhouse-keeper/keeper_config.xml
#
# clickhouse-1:
# image: clickhouse/clickhouse-server:latest
# ports:
# - "8123:8123"
# - "9000:9000"
# volumes:
# - ch1_data:/var/lib/clickhouse
# - ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
# depends_on:
# - clickhouse-keeper-1
#
# clickhouse-2:
# image: clickhouse/clickhouse-server:latest
# ports:
# - "8124:8123"
# - "9001:9000"
# volumes:
# - ch2_data:/var/lib/clickhouse
# - ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
# depends_on:
# - clickhouse-keeper-1
#
# clickhouse-3:
# image: clickhouse/clickhouse-server:latest
# ports:
# - "8125:8123"
# - "9002:9000"
# volumes:
# - ch3_data:/var/lib/clickhouse
# - ./cluster_config.xml:/etc/clickhouse-server/config.d/cluster.xml
# depends_on:
# - clickhouse-keeper-1
# 2. Cluster Configuration
# cluster_config.xml  (XML tags reconstructed — they were stripped in extraction)
# <clickhouse>
#   <remote_servers>
#     <analytics_cluster>
#       <shard>
#         <replica>
#           <host>clickhouse-1</host>
#           <port>9000</port>
#         </replica>
#         <replica>
#           <host>clickhouse-2</host>
#           <port>9000</port>
#         </replica>
#       </shard>
#       <shard>
#         <replica>
#           <host>clickhouse-3</host>
#           <port>9000</port>
#         </replica>
#       </shard>
#     </analytics_cluster>
#   </remote_servers>
# </clickhouse>
# 3. ReplicatedMergeTree Table
# CREATE TABLE analytics.events ON CLUSTER analytics_cluster (
# event_date Date,
# event_time DateTime,
# user_id UInt64,
# event_type LowCardinality(String),
# page_url String,
# duration_ms UInt32
# ) ENGINE = ReplicatedMergeTree(
# '/clickhouse/tables/{shard}/events',
# '{replica}'
# )
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, user_id, event_time);
# 4. Distributed Table
# CREATE TABLE analytics.events_distributed ON CLUSTER analytics_cluster
# AS analytics.events
# ENGINE = Distributed(analytics_cluster, analytics, events, rand());
# 5. ตรวจสอบ Cluster
# SELECT * FROM system.clusters;
# SELECT * FROM system.replicas;
# Print a short summary of the cluster topology described above.
# Fix: the original claimed "2 shards, 2 replicas" for 3 nodes, which is
# inconsistent (2 shards x 2 replicas = 4 nodes). The layout shown above is
# 2 shards: shard 1 holds clickhouse-1/2 (2 replicas), shard 2 holds clickhouse-3.
echo "ClickHouse Cluster:"
echo " Nodes: 3 (2 shards; shard 1: 2 replicas, shard 2: 1 replica)"
echo " Keeper: ClickHouse Keeper"
echo " Engine: ReplicatedMergeTree"
echo " Query: Distributed Table"
Rolling Update Strategy
# rolling_update.py — ClickHouse Rolling Update
from dataclasses import dataclass, field
from typing import List, Dict
from datetime import datetime
from enum import Enum
import time
class NodeStatus(Enum):
    """Lifecycle state of a cluster node during a rolling update.

    A node cycles HEALTHY -> DRAINING -> UPDATING -> VERIFYING -> HEALTHY
    (see RollingUpdateManager later in this file).
    """
    HEALTHY = "healthy"      # serving traffic normally
    DRAINING = "draining"    # no longer receiving new queries
    UPDATING = "updating"    # binary/package being replaced
    VERIFYING = "verifying"  # post-update checks running
@dataclass
class ClusterNode:
    """One ClickHouse server participating in the rolling update."""
    name: str      # logical node name, e.g. "ch-node-1"
    host: str      # address used to reach the node
    port: int      # native-protocol port (9000 in the examples below)
    version: str   # currently installed ClickHouse version string
    status: NodeStatus = NodeStatus.HEALTHY  # current lifecycle state
    replica_lag: int = 0  # replication lag; 0 means fully synced
class RollingUpdateManager:
    """Coordinates a one-node-at-a-time ClickHouse version rollout.

    Nodes are drained, updated, and verified sequentially, so the
    remaining replicas keep serving queries for the whole rollout.
    All actions are recorded in a timestamped update log.
    """

    def __init__(self):
        self.nodes: List[ClusterNode] = []   # rollout order = insertion order
        self.update_log: List[Dict] = []     # timestamped action log

    def add_node(self, node: ClusterNode):
        """Register a node to be included in the rollout."""
        self.nodes.append(node)

    def pre_check(self):
        """Run the pre-flight checklist; return True iff every check passes."""
        print(f"\n Pre-update Checks:")
        everyone_healthy = all(n.status == NodeStatus.HEALTHY for n in self.nodes)
        nobody_lagging = all(n.replica_lag == 0 for n in self.nodes)
        checklist = [
            ("All nodes healthy", everyone_healthy),
            ("No replica lag", nobody_lagging),
            # Backup/rollback readiness is assumed here — TODO wire up real checks.
            ("Backup completed", True),
            ("Rollback plan ready", True),
        ]
        for label, passed in checklist:
            verdict = "PASS" if passed else "FAIL"
            print(f" [{verdict}] {label}")
        return all(passed for _, passed in checklist)

    def drain_node(self, node: ClusterNode):
        """Stop routing new queries to *node* before it is touched."""
        node.status = NodeStatus.DRAINING
        self.log(f"Draining {node.name}")
        # In a real rollout this would take the node out of the Distributed
        # table routing / load balancer so no new queries land on it.
        time.sleep(0.01)

    def update_node(self, node: ClusterNode, new_version: str):
        """Install *new_version* on *node* (simulated with a short sleep)."""
        node.status = NodeStatus.UPDATING
        self.log(f"Updating {node.name}: {node.version} -> {new_version}")
        time.sleep(0.01)
        node.version = new_version

    def verify_node(self, node: ClusterNode):
        """Record post-update checks, then mark *node* healthy and synced."""
        node.status = NodeStatus.VERIFYING
        self.log(f"Verifying {node.name}")
        for detail in (
            f"Version: {node.version}",
            "Replication: synced",
            "Queries: responding",
            "Metrics: normal",
        ):
            self.log(f" {detail}")
        node.status = NodeStatus.HEALTHY
        node.replica_lag = 0

    def rolling_update(self, new_version):
        """Drain/update/verify every node in sequence.

        Aborts (returns False) if the pre-checks fail; returns True once
        all nodes run *new_version*.
        """
        banner = "=" * 55
        print(f"\n{banner}")
        print(f"Rolling Update: -> {new_version}")
        print(f"{banner}")
        if not self.pre_check():
            print("\n ABORT: Pre-checks failed")
            return False
        total = len(self.nodes)
        for position, node in enumerate(self.nodes, start=1):
            print(f"\n Node {position}/{total}: {node.name}")
            # 1. Stop new traffic to this node.
            self.drain_node(node)
            print(" Drained (stopped receiving queries)")
            # 2. Swap in the new version.
            self.update_node(node, new_version)
            print(f" Updated to {new_version}")
            # 3. Post-update health checks.
            self.verify_node(node)
            print(" Verified: HEALTHY")
            # 4. Give replication a moment to catch up before the next node.
            print(" Waiting for replication sync...")
            time.sleep(0.01)
        print("\n Rolling Update Complete!")
        print(f" All nodes: {new_version}")
        return True

    def log(self, message):
        """Append *message* to the update log with an ISO timestamp."""
        entry = {
            "time": datetime.now().isoformat(),
            "message": message,
        }
        self.update_log.append(entry)
# Example: roll the 3-node cluster from 24.3.1 to 24.8.1.
manager = RollingUpdateManager()
nodes = [
    ClusterNode(node_name, f"10.0.0.{idx}", 9000, "24.3.1")
    for idx, node_name in enumerate(["ch-node-1", "ch-node-2", "ch-node-3"], start=1)
]
for node in nodes:
    manager.add_node(node)
manager.rolling_update("24.8.1")
Schema Migration
# === ClickHouse Schema Migration ===
# 1. Non-breaking Changes (Online)
# ALTER TABLE analytics.events ADD COLUMN country String DEFAULT '';
# ALTER TABLE analytics.events ADD COLUMN device LowCardinality(String) DEFAULT '';
# ALTER TABLE analytics.events MODIFY COLUMN duration_ms UInt64;
# 2. Breaking Changes — Materialized View Approach
# Step 1: สร้าง Table ใหม่
# CREATE TABLE analytics.events_v2 (
# event_date Date,
# event_time DateTime,
# user_id UInt64,
# event_type LowCardinality(String),
# page_url String,
# duration_ms UInt32,
# country LowCardinality(String),
# device LowCardinality(String),
# session_id String
# ) ENGINE = ReplicatedMergeTree(...)
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, user_id, event_time);
# Step 2: สร้าง Materialized View รับข้อมูลใหม่
# CREATE MATERIALIZED VIEW analytics.events_v2_mv
# TO analytics.events_v2
# AS SELECT
# event_date, event_time, user_id, event_type,
# page_url, duration_ms,
# '' AS country, '' AS device, '' AS session_id
# FROM analytics.events;
# Step 3: Backfill ข้อมูลเก่า
# INSERT INTO analytics.events_v2
# SELECT *, '' AS country, '' AS device, '' AS session_id
# FROM analytics.events
# WHERE event_date < today();
# Step 4: Switch Queries ไปใช้ events_v2
# (Update application config)
# Step 5: Cleanup
# DROP VIEW analytics.events_v2_mv;
# RENAME TABLE analytics.events TO analytics.events_old;
# RENAME TABLE analytics.events_v2 TO analytics.events;
# 3. Migration Script
# Each migration step pairs a forward DDL statement with its rollback,
# kept as (version, description, sql, rollback) rows for readability.
_MIGRATION_ROWS = [
    ("001", "Add country column",
     "ALTER TABLE events ADD COLUMN country String DEFAULT ''",
     "ALTER TABLE events DROP COLUMN country"),
    ("002", "Add device column",
     "ALTER TABLE events ADD COLUMN device LowCardinality(String) DEFAULT ''",
     "ALTER TABLE events DROP COLUMN device"),
    ("003", "Add TTL 1 year",
     "ALTER TABLE events MODIFY TTL event_date + INTERVAL 1 YEAR",
     "ALTER TABLE events REMOVE TTL"),
]
migration_steps = [
    {"version": ver, "description": desc, "sql": sql, "rollback": undo}
    for ver, desc, sql, undo in _MIGRATION_ROWS
]
print("Schema Migrations:")
for step in migration_steps:
    print(f" [{step['version']}] {step['description']}")
    print(f" SQL: {step['sql'][:60]}...")
    print(f" Rollback: {step['rollback'][:60]}...")
Best Practices
- ReplicatedMergeTree: ใช้ ReplicatedMergeTree เสมอใน Production สำหรับ HA
- Rolling Updates: Update ทีละ Node ตรวจสอบ Replication Sync ก่อนไปตัวถัดไป
- Schema Migration: ใช้ ALTER TABLE สำหรับ Non-breaking Materialized View สำหรับ Breaking
- Backup ก่อน: Backup ข้อมูลก่อน Update หรือ Migration ทุกครั้ง
- Health Checks: ตรวจ system.replicas และ system.mutations หลัง Update
- Rollback Plan: เตรียม Rollback Plan สำหรับทุก Migration Step
Zero Downtime Deployment คืออะไร
การ Deploy Application หรือ Database โดยไม่หยุดบริการ ผู้ใช้ไม่รู้สึกถึงการเปลี่ยนแปลง เทคนิคที่นิยมได้แก่ Rolling Updates, Blue-Green และ Canary ส่วน ClickHouse ใช้ ReplicatedMergeTree และ Distributed Tables
ClickHouse Replication ทำงานอย่างไร
ReplicatedMergeTree Engine กับ ZooKeeper/Keeper Replicate ข้อมูลทุก Replica อัตโนมัติ Node Down Node อื่นให้บริการ Distributed Table กระจาย Queries ทุก Shard
Schema Migration ใน ClickHouse ทำอย่างไร
ALTER TABLE ADD COLUMN MODIFY COLUMN Non-blocking ไม่หยุด Service Breaking Changes ใช้ Materialized Views สร้าง View ใหม่ Switch Queries ลบ View เก่า
Blue-Green Deployment สำหรับ Database ทำอย่างไร
สร้าง Cluster ใหม่ Green ข้าง Blue Migrate ข้อมูล INSERT SELECT Replication Green พร้อม Switch DNS Rollback กลับ Blue ใช้ Terraform จัดการ
สรุป
ClickHouse Zero Downtime Deployment ใช้ ReplicatedMergeTree HA Rolling Updates ทีละ Node Schema Migration ALTER TABLE Non-blocking Materialized Views สำหรับ Breaking Changes Backup ก่อนทุกครั้ง Health Checks หลัง Update Rollback Plan ทุก Step
