ClickHouse Analytics Distributed System —

ClickHouse Distributed Analytics

ClickHouse Columnar OLAP Distributed Shard Replica SQL Petabyte Analytics Log Time Series Dashboard Real-time Production

เนื้อหาเกี่ยวข้อง — แนะนำให้อ่าน MLflow Experiment Edge Computing

Feature	ClickHouse	PostgreSQL	Elasticsearch
Use Case	OLAP Analytics	OLTP + Analytics	Search + Log
Storage	Columnar	Row-based	Inverted Index
Speed (Analytical)	เร็วมาก 100x+	ช้า (Full Scan)	เร็ว (Aggregation)
Compression	10-40x	2-3x	1.5-3x
SQL Support	Full SQL	Full SQL	Limited (DSL)
Scale	Petabyte	Terabyte	Petabyte
UPDATE/DELETE	Async (ช้า)	Fast (MVCC)	Fast

Cluster Architecture

# === ClickHouse Distributed Cluster ===



# Cluster Config (config.xml)

# 

#   

#     

#       

#         

#           clickhouse-01

#           9000

#         

#         

#           clickhouse-02

#           9000

#         

#       

#       

#         

#           clickhouse-03

#           9000

#         

#         

#           clickhouse-04

#           9000

#         

#       

#     

#   

# 



# Create Replicated Table

# CREATE TABLE events ON CLUSTER analytics_cluster (

#     date Date,

#     user_id UInt64,

#     event_type LowCardinality(String),

#     properties String,

#     timestamp DateTime

# ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')

# PARTITION BY toYYYYMM(date)

# ORDER BY (date, user_id, event_type);



# Create Distributed Table

# CREATE TABLE events_distributed ON CLUSTER analytics_cluster

# AS events

# ENGINE = Distributed(analytics_cluster, default, events, rand());



from dataclasses import dataclass



@dataclass

class ClusterNode:

    shard: int

    replica: str

    host: str

    role: str

    data_size: str



nodes = [

    ClusterNode(1, "A", "clickhouse-01", "Primary", "~500GB"),

    ClusterNode(1, "B", "clickhouse-02", "Replica", "~500GB"),

    ClusterNode(2, "A", "clickhouse-03", "Primary", "~500GB"),

    ClusterNode(2, "B", "clickhouse-04", "Replica", "~500GB"),

    ClusterNode(3, "A", "clickhouse-05", "Primary", "~500GB"),

    ClusterNode(3, "B", "clickhouse-06", "Replica", "~500GB"),

]



print("=== Cluster Architecture (3 Shards x 2 Replicas) ===")

print(f"Total Data: ~1.5TB (500GB/shard x 3)")

print(f"Total Nodes: 6 (HA: ทุก Shard มี Replica)\n")

for n in nodes:

    print(f"  Shard {n.shard} Replica {n.replica}: {n.host} ({n.role}) {n.data_size}")

Query Optimization

# === Query Optimization Techniques ===



@dataclass

class OptTechnique:

    technique: str

    sql_example: str

    speedup: str

    when_to_use: str



techniques = [

    OptTechnique("Primary Key (ORDER BY)",

        "ORDER BY (date, user_id, event_type)",

        "10-100x (skip irrelevant granules)",

        "เลือก Column ที่ Filter WHERE บ่อยที่สุด"),

    OptTechnique("Partitioning",

        "PARTITION BY toYYYYMM(date)",

        "10-50x (skip irrelevant partitions)",

        "Time-based Data ที่ Query ตาม Date Range"),

    OptTechnique("Materialized View",

        "CREATE MATERIALIZED VIEW daily_stats ENGINE=SummingMergeTree "

        "AS SELECT date, count() FROM events GROUP BY date",

        "100-1000x (pre-aggregated)",

        "Dashboard Query ที่ Aggregate ซ้ำบ่อย"),

    OptTechnique("LowCardinality",

        "event_type LowCardinality(String)",

        "2-5x (dictionary encoding)",

        "Column ที่มี Distinct Values < 10,000"),

    OptTechnique("Projection",

        "ALTER TABLE events ADD PROJECTION p1 "

        "(SELECT * ORDER BY event_type, date)",

        "5-50x (pre-sorted copy)",

        "Query ที่ Filter ไม่ตรงกับ Primary Key"),

    OptTechnique("Skip Index (bloom_filter)",

        "INDEX idx_user bloom_filter(user_id) GRANULARITY 4",

        "2-10x (skip granules)",

        "Column ที่มี High Cardinality เช่น user_id"),

]



print("=== Optimization Techniques ===")

for t in techniques:

    print(f"\n  [{t.technique}] Speedup: {t.speedup}")

    print(f"    SQL: {t.sql_example}")

    print(f"    When: {t.when_to_use}")

Monitoring & Operations

# === Production Monitoring ===



# Prometheus Exporter

# docker run -d --name ch-exporter \

#   -e CLICKHOUSE_SERVER=clickhouse-01:8123 \

#   -p 9116:9116 \

#   f1yegor/clickhouse-exporter



# Key System Tables

# SELECT * FROM system.query_log WHERE type='QueryFinish' ORDER BY query_duration_ms DESC LIMIT 10;

# SELECT * FROM system.metrics;

# SELECT * FROM system.replicas WHERE is_leader=1;



@dataclass

class MonitorMetric:

    metric: str

    source: str

    threshold: str

    action: str



metrics = [

    MonitorMetric("Query Duration P99",

        "system.query_log",

        "> 30 seconds",

        "Optimize Query Primary Key Materialized View"),

    MonitorMetric("Memory Usage",

        "system.metrics (CurrentMetric_MemoryTracking)",

        "> 80% RAM",

        "ลด max_memory_usage ต่อ Query Scale Up RAM"),

    MonitorMetric("Replication Lag",

        "system.replicas (absolute_delay)",

        "> 10 seconds",

        "ตรวจ Network ZooKeeper HealthCheck"),

    MonitorMetric("Disk Usage",

        "system.disks",

        "> 80%",

        "TTL ลบข้อมูลเก่า Move ไป Cold Storage"),

    MonitorMetric("Merge Queue",

        "system.metrics (BackgroundMergesInProgress)",

        "> 20 pending merges",

        "เพิ่ม background_pool_size หรือ Scale"),

    MonitorMetric("Insert Rate",

        "system.events (InsertedRows)",

        "Drop > 50% จาก Normal",

        "ตรวจ Source Pipeline ClickHouse Health"),

]



print("=== Production Metrics ===")

for m in metrics:

    print(f"  [{m.metric}] Source: {m.source}")

    print(f"    Threshold: {m.threshold}")

    print(f"    Action: {m.action}")

เคล็ดลับ

ORDER BY: เลือก Primary Key ที่ตรงกับ WHERE บ่อยที่สุด
MV: ใช้ Materialized View สำหรับ Dashboard Query เร็ว 100x+
LowCardinality: ใช้ LowCardinality สำหรับ Low Distinct Column
TTL: ตั้ง TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage
Keeper: ใช้ ClickHouse Keeper แทน ZooKeeper (เร็วกว่า ง่ายกว่า)

ClickHouse คืออะไร

Open Source Columnar OLAP Yandex Vectorized SIMD Compression 10-40x SQL Petabyte Log Analytics Time Series Dashboard Real-time

แนะนำเพิ่มเติม — อ่านเพิ่มเติมที่ SiamCafeBook

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Calico Network Policy Cache Strategy Redis

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ non farm payroll investing