Technology

ClickHouse Analytics Performance Tuning เพิ่มความเร็ว

clickhouse analytics performance tuning เพิ่มความเร็ว
ClickHouse Analytics Performance Tuning เพิ่มความเร็ว | SiamCafe Blog
2025-12-09 · อ. บอม — SiamCafe.net · 8,439 คำ

ClickHouse Performance

ClickHouse Column-oriented OLAP Analytics Performance Tuning MergeTree Partitioning Materialized Views Compression Vectorized SQL Billion Rows Milliseconds

Database | Type | Analytics Speed | Scale | เหมาะกับ
ClickHouse | Column OLAP | เร็วสุด | Petabyte | Analytics
PostgreSQL | Row OLTP | ช้า (Analytics) | Terabyte | Transactional
BigQuery | Cloud OLAP | เร็ว | Petabyte | GCP Analytics
Apache Druid | Column OLAP | เร็ว | Petabyte | Real-time
DuckDB | Embedded OLAP | เร็ว | 100GB | Local Analytics

Table Design

# === ClickHouse Table Design ===

# MergeTree — Primary Table Engine
# CREATE TABLE events (
#     event_date Date,
#     event_time DateTime,
#     user_id UInt64,
#     event_type LowCardinality(String),
#     page_url String,
#     country LowCardinality(String),
#     device LowCardinality(String),
#     duration_ms UInt32,
#     revenue Decimal(10, 2)
# )
# ENGINE = MergeTree()
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_type, user_id, event_time)
# TTL event_date + INTERVAL 90 DAY
# SETTINGS index_granularity = 8192;

# Key Design Decisions:
# 1. PARTITION BY toYYYYMM — แบ่งตามเดือน ลบข้อมูลเก่าง่าย
# 2. ORDER BY — ตรงกับ Query Pattern ที่ใช้บ่อย
# 3. LowCardinality — สำหรับ Column ที่มีค่าซ้ำน้อย
# 4. TTL — ลบข้อมูลเก่าอัตโนมัติ
# 5. index_granularity — Default 8192 เหมาะกับส่วนใหญ่

# Materialized View — Pre-aggregate
# CREATE MATERIALIZED VIEW events_daily_mv
# ENGINE = SummingMergeTree()
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, event_type, country)
# AS SELECT
#     event_date,
#     event_type,
#     country,
#     count() AS event_count,
#     uniqExact(user_id) AS unique_users,
#     sum(revenue) AS total_revenue,
#     avg(duration_ms) AS avg_duration
# FROM events
# GROUP BY event_date, event_type, country;

# Projection — Alternative Query Pattern
# ALTER TABLE events ADD PROJECTION events_by_country (
#     SELECT
#         country,
#         event_type,
#         count() AS cnt,
#         sum(revenue) AS rev
#     GROUP BY country, event_type
# );
# ALTER TABLE events MATERIALIZE PROJECTION events_by_country;

from dataclasses import dataclass

# --- Table inventory: size and compression stats per ClickHouse table ---

@dataclass
class TableConfig:
    """Size/layout metadata for one ClickHouse table."""
    table: str            # table name
    engine: str           # table engine, e.g. MergeTree
    rows: str             # human-readable row count ("5.2B")
    compressed_gb: float  # on-disk size after compression, GB
    raw_gb: float         # uncompressed size, GB
    ratio: float          # compression ratio (raw / compressed)
    partition: str        # partitioning granularity

# (table, engine, rows, compressed_gb, raw_gb, ratio, partition)
_table_specs = (
    ("events", "MergeTree", "5.2B", 85, 520, 6.1, "Monthly"),
    ("events_daily_mv", "SummingMergeTree", "120K", 0.5, 2, 4.0, "Monthly"),
    ("user_sessions", "MergeTree", "800M", 25, 180, 7.2, "Monthly"),
    ("error_logs", "MergeTree", "2.1B", 45, 350, 7.8, "Weekly"),
    ("metrics_1m", "MergeTree", "10B", 120, 800, 6.7, "Daily"),
)
tables = [TableConfig(*spec) for spec in _table_specs]

print("=== ClickHouse Tables ===")
for cfg in tables:
    print(f"  [{cfg.table}] Engine: {cfg.engine}")
    print(f"    Rows: {cfg.rows} | Compressed: {cfg.compressed_gb}GB | Raw: {cfg.raw_gb}GB")
    print(f"    Compression: {cfg.ratio}x | Partition: {cfg.partition}")

Query Optimization

# === Query Performance Tuning ===

# Bad Query — Full Scan
# SELECT count(DISTINCT user_id)
# FROM events
# WHERE event_date >= '2025-01-01'
# -- Scans all columns, slow

# Good Query — Optimized
# SELECT uniqExact(user_id)
# FROM events
# WHERE event_date >= '2025-01-01'
#   AND event_type = 'page_view'
# -- Uses ORDER BY index, partition pruning

# Use EXPLAIN to analyze
# EXPLAIN pipeline
# SELECT event_type, count()
# FROM events
# WHERE event_date = '2025-01-15'
# GROUP BY event_type;

# Performance Settings
# SET max_threads = 16;
# SET max_memory_usage = 10000000000;  -- 10GB
# SET max_execution_time = 30;
# SET optimize_read_in_order = 1;
# SET allow_experimental_projection_optimization = 1;

# Common Optimization Patterns:
# 1. ใช้ Partition Pruning — WHERE event_date ตรง Partition
# 2. ORDER BY ตรง Query — ClickHouse ใช้ Primary Index
# 3. PREWHERE แทน WHERE — อ่าน Column น้อยลง
# 4. Sampling — ใช้ SAMPLE สำหรับ Approximate Query
# 5. Materialized View — Pre-compute aggregates

# --- Measured query latencies, one row per optimization technique ---

@dataclass
class QueryBenchmark:
    """One benchmarked query and the optimization it demonstrates."""
    query: str         # human description of the query
    rows_scanned: str  # rows read, human-readable
    time_ms: int       # wall-clock latency in milliseconds
    optimized: bool    # whether an optimization was applied
    technique: str     # name of the technique (or lack thereof)

# (query, rows_scanned, time_ms, optimized, technique)
_benchmark_specs = (
    ("Count events by type (1 day)", "15M", 45, True, "Partition pruning"),
    ("Unique users (1 month)", "450M", 850, True, "uniqExact + index"),
    ("Revenue by country (MV)", "30K", 5, True, "Materialized View"),
    ("Top pages (1 week)", "105M", 320, True, "ORDER BY + LIMIT"),
    ("Funnel analysis", "50M", 180, True, "windowFunnel()"),
    ("Full table scan (bad)", "5.2B", 45000, False, "No optimization"),
    ("Same with index (good)", "15M", 45, True, "Primary key match"),
)
benchmarks = [QueryBenchmark(*spec) for spec in _benchmark_specs]

print("\n=== Query Benchmarks ===")
for bench in benchmarks:
    # Sub-second queries are labelled FAST, everything else SLOW.
    speed = "FAST" if bench.time_ms < 1000 else "SLOW"
    verdict = "Optimized" if bench.optimized else "Not optimized"
    print(f"  [{speed}] {bench.query}")
    print(f"    Rows: {bench.rows_scanned} | Time: {bench.time_ms}ms | {verdict}")
    print(f"    Technique: {bench.technique}")

Production Setup

# === Production Configuration ===

# Docker Compose
# services:
#   clickhouse:
#     image: clickhouse/clickhouse-server:latest
#     ports:
#       - "8123:8123"  # HTTP
#       - "9000:9000"  # Native
#     volumes:
#       - ./data:/var/lib/clickhouse
#       - ./config:/etc/clickhouse-server/config.d
#     ulimits:
#       nofile:
#         soft: 262144
#         hard: 262144

# config.xml — Performance Settings
# (reconstructed — original XML tags were stripped during extraction; verify
#  setting names against the ClickHouse server-settings reference)
# <clickhouse>
#   <max_concurrent_queries>100</max_concurrent_queries>
#   <max_server_memory_usage_to_ram_ratio>0.8</max_server_memory_usage_to_ram_ratio>
#   <merge_tree>
#     <max_bytes_to_merge_at_max_space_in_pool>161061273600</max_bytes_to_merge_at_max_space_in_pool>
#   </merge_tree>
# </clickhouse>

# Monitoring — system tables
# SELECT * FROM system.query_log ORDER BY event_time DESC LIMIT 10;
# SELECT * FROM system.parts WHERE table = 'events';
# SELECT * FROM system.metrics;
# SELECT * FROM system.asynchronous_metrics;

# --- Production cluster reference specs ---
# Hardware and throughput figures for the 3-node reference deployment.
production_config = {
    "Cluster": "3 nodes (ReplicatedMergeTree)",
    "CPU": "32 cores per node",
    "RAM": "128 GB per node",
    "Storage": "NVMe SSD 2TB per node",
    "Total Data": "520 GB compressed (3.2 TB raw)",
    "Total Rows": "18 Billion",
    "Daily Ingestion": "500M rows/day",
    "Peak QPS": "200 queries/sec",
    "p99 Latency": "850ms",
    "Uptime": "99.99%",
}

print("Production Cluster:")
for k, v in production_config.items():
    print(f"  {k}: {v}")

# Tuning checklist (Thai): each line is one tuning action for readers.
tuning_checklist = [
    "ORDER BY ตรงกับ Query Pattern ที่ใช้บ่อยที่สุด",
    "PARTITION BY ตามเวลา ลบข้อมูลเก่าง่าย",
    "LowCardinality สำหรับ String ที่มี Distinct < 10K",
    "Materialized View สำหรับ Dashboard Query",
    "TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage",
    "CODEC เลือก Compression ตาม Data Type",
    "max_threads ตาม CPU Cores ที่มี",
    "Monitor system.query_log หา Slow Query",
]

# Fixed: was an f-string with no placeholders (needless f-prefix).
print("\n\nTuning Checklist:")
for i, c in enumerate(tuning_checklist, 1):
    print(f"  {i}. {c}")

เคล็ดลับ

ClickHouse คืออะไร

Open Source Column OLAP Database เร็วสุด Yandex Analytics Billion Rows Milliseconds SQL Real-time Ingestion Compression Log Event BI Dashboard

ClickHouse เร็วกว่า Database อื่นอย่างไร

Column-oriented อ่านเฉพาะ Column Vectorized Batch LZ4 ZSTD 5-10x Compression Parallel CPU Skip Index Materialized Views เร็วกว่า PostgreSQL 100-1000x

Performance Tuning ทำอย่างไร

Primary Key Query Pattern Partition เวลา ORDER BY WHERE LowCardinality Materialized Views Projection Settings max_threads max_memory Workload

ใช้ ClickHouse กับงานอะไรบ้าง

Log Analytics Server Application Event Tracking User Behavior Real-time Dashboard Time-series IoT BI Reporting Ad-tech A/B Testing Data ขนาดใหญ่

สรุป

ClickHouse Analytics Performance Tuning Column OLAP MergeTree Partitioning Materialized Views LowCardinality Compression ORDER BY Production Billion Rows Milliseconds

📖 บทความที่เกี่ยวข้อง

Tailwind CSS v4 Performance Tuning เพิ่มความเร็วอ่านบทความ → Qwik Resumability Performance Tuning เพิ่มความเร็วอ่านบทความ → GCP BigQuery ML Performance Tuning เพิ่มความเร็วอ่านบทความ → ClickHouse Analytics Community Buildingอ่านบทความ → WordPress WooCommerce Performance Tuning เพิ่มความเร็วอ่านบทความ →

📚 ดูบทความทั้งหมด →