ClickHouse Analytics Database
ClickHouse เป็น Columnar Database สำหรับ OLAP ประมวลผล Queries เร็วมาก รองรับข้อมูลหลายพันล้าน Rows ใช้ Column-oriented Storage บีบอัดข้อมูลได้ดี เหมาะกับ Analytics
Home Lab ให้ทดลอง ClickHouse ที่บ้าน ไม่ต้องจ่ายค่า Cloud ใช้ PC เก่าหรือ Mini PC ติดตั้งผ่าน Docker เรียนรู้ Data Engineering จริง
ClickHouse Installation
# === ClickHouse Installation for a Home Lab ===

# 1. Docker installation (recommended)
docker run -d \
  --name clickhouse \
  --ulimit nofile=262144:262144 \
  -p 8123:8123 \
  -p 9000:9000 \
  -v clickhouse_data:/var/lib/clickhouse \
  -v clickhouse_logs:/var/log/clickhouse-server \
  clickhouse/clickhouse-server:latest

# 2. Docker Compose (production-ready) -- docker-compose.yml:
# version: '3.8'
# services:
#   clickhouse:
#     image: clickhouse/clickhouse-server:latest
#     ports:
#       - "8123:8123"   # HTTP interface
#       - "9000:9000"   # Native protocol
#     volumes:
#       - ./data:/var/lib/clickhouse
#       - ./logs:/var/log/clickhouse-server
#       - ./config:/etc/clickhouse-server/config.d
#       - ./users:/etc/clickhouse-server/users.d
#     environment:
#       CLICKHOUSE_DB: analytics
#       CLICKHOUSE_USER: admin
#       CLICKHOUSE_PASSWORD: secure_password
#     ulimits:
#       nofile:
#         soft: 262144
#         hard: 262144
#     deploy:
#       resources:
#         limits:
#           memory: 8G
#
#   grafana:
#     image: grafana/grafana:latest
#     ports:
#       - "3000:3000"
#     environment:
#       GF_INSTALL_PLUGINS: grafana-clickhouse-datasource
#     volumes:
#       - grafana_data:/var/lib/grafana

# 3. Native installation (Ubuntu)
# NOTE(review): apt-key is deprecated on current Ubuntu; prefer a
# [signed-by=/usr/share/keyrings/...] keyring per the official
# ClickHouse install docs. Kept as-is because it still works.
sudo apt-get install -y apt-transport-https ca-certificates
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client
sudo systemctl start clickhouse-server
sudo systemctl enable clickhouse-server

# 4. Verify the server (native client and HTTP interface).
clickhouse-client --query "SELECT version()"
# URL is quoted so the shell does not glob-expand the '?'.
curl 'http://localhost:8123/?query=SELECT+1'

# 5. Performance tuning (config.d/performance.xml)
# NOTE(review): XML tags were lost in extraction; the surviving values were
# 8000000000, 8, 100, 5368709120 -- presumably a memory limit, thread count,
# max concurrent queries, and mark cache size. Confirm exact setting names:
# <clickhouse>
#     <max_server_memory_usage>8000000000</max_server_memory_usage>
#     <max_concurrent_queries>100</max_concurrent_queries>
#     <mark_cache_size>5368709120</mark_cache_size>
# </clickhouse>

echo "ClickHouse installed"
echo " HTTP: http://localhost:8123"
echo " Native: localhost:9000"
echo " Client: clickhouse-client"
Table Design และ Data Loading
# === ClickHouse Table Design for Analytics ===
# Run these statements via clickhouse-client,
# or: curl -X POST http://localhost:8123 --data-binary @query.sql
# 1. Web Analytics Table
# CREATE TABLE web_events (
# event_date Date,
# event_time DateTime,
# user_id UInt64,
# session_id String,
# event_type LowCardinality(String),
# page_url String,
# referrer String,
# device LowCardinality(String),
# browser LowCardinality(String),
# country LowCardinality(String),
# city String,
# duration_ms UInt32,
# is_bounce UInt8
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, user_id, event_time)
# TTL event_date + INTERVAL 1 YEAR
# SETTINGS index_granularity = 8192;
# 2. Server Metrics Table
# CREATE TABLE server_metrics (
# timestamp DateTime,
# hostname LowCardinality(String),
# metric_name LowCardinality(String),
# value Float64,
# tags Map(String, String)
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMMDD(timestamp)
# ORDER BY (hostname, metric_name, timestamp)
# TTL timestamp + INTERVAL 90 DAY;
# 3. Materialized View for real-time aggregation
# CREATE MATERIALIZED VIEW hourly_page_views
# ENGINE = SummingMergeTree()
# PARTITION BY toYYYYMM(hour)
# ORDER BY (hour, page_url, country)
# AS SELECT
# toStartOfHour(event_time) AS hour,
# page_url,
# country,
# count() AS views,
# uniq(user_id) AS unique_users,
# avg(duration_ms) AS avg_duration
# FROM web_events
# GROUP BY hour, page_url, country;
# 4. Python Data Loading
# pip install clickhouse-connect pandas
import clickhouse_connect
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
def setup_clickhouse():
    """Connect to a local ClickHouse server and ensure the analytics schema exists.

    Connects over HTTP (port 8123) with the default credentials, then
    creates the ``analytics`` database and the ``analytics.web_events``
    table if they are missing. Both statements are idempotent.

    Returns:
        A connected ``clickhouse_connect`` client.
    """
    client = clickhouse_connect.get_client(
        host='localhost',
        port=8123,
        username='default',
        password='',
    )

    # Idempotent schema setup: safe to call on every start.
    client.command("CREATE DATABASE IF NOT EXISTS analytics")

    ddl = """
    CREATE TABLE IF NOT EXISTS analytics.web_events (
        event_date Date,
        event_time DateTime,
        user_id UInt64,
        session_id String,
        event_type LowCardinality(String),
        page_url String,
        country LowCardinality(String),
        duration_ms UInt32
    ) ENGINE = MergeTree()
    PARTITION BY toYYYYMM(event_date)
    ORDER BY (event_date, user_id, event_time)
    """
    client.command(ddl)
    return client
def generate_sample_data(n=100000):
    """Build a DataFrame of ``n`` synthetic web events for load testing.

    Uses a fixed numpy seed (42) so repeated runs draw the same random
    sequence; timestamps are spread over the last 90 days from "now".

    Args:
        n: number of event rows to generate.

    Returns:
        pandas.DataFrame with columns event_date, event_time, user_id,
        session_id, event_type, page_url, country, duration_ms.
    """
    np.random.seed(42)

    pages = ["/home", "/products", "/cart", "/checkout", "/blog", "/about"]
    countries = ["TH", "US", "JP", "SG", "MY", "ID", "VN", "PH"]
    events = ["page_view", "click", "scroll", "purchase"]

    # Random timestamps within the last 90 days.
    timestamps = [
        datetime.now() - timedelta(days=np.random.randint(0, 90))
        for _ in range(n)
    ]

    return pd.DataFrame({
        "event_date": [ts.date() for ts in timestamps],
        "event_time": timestamps,
        "user_id": np.random.randint(1, 10000, n),
        "session_id": [f"s{sid}" for sid in np.random.randint(1, 50000, n)],
        "event_type": np.random.choice(events, n),
        "page_url": np.random.choice(pages, n),
        "country": np.random.choice(countries, n),
        "duration_ms": np.random.randint(100, 30000, n),
    })
# Example usage (requires a running ClickHouse server):
# client = setup_clickhouse()
# df = generate_sample_data(1000000)
# client.insert_df("analytics.web_events", df)
# print(f"Inserted {len(df)} rows")

# Summary of the tables this tutorial defines.
print("ClickHouse Tables:")
for table_summary in (
    " web_events: Web analytics events",
    " server_metrics: Server monitoring",
    " hourly_page_views: Materialized View",
):
    print(table_summary)
Analytics Queries
# === ClickHouse Analytics Queries ===
# Report name -> SQL, targeting the analytics.web_events table that
# setup_clickhouse() creates (event_date, event_time, user_id, session_id,
# event_type, page_url, country, duration_ms).
#
# Fixes vs. the original draft:
#  - "Top Pages by Views" no longer references is_bounce, which is not a
#    column of the table created by setup_clickhouse(); the query would
#    fail at runtime.
#  - "Retention" now counts distinct users (uniq / uniqIf) instead of raw
#    event rows, which is what cohort retention actually measures.
queries = {
    "Daily Active Users": """
        SELECT
            event_date,
            uniq(user_id) AS dau,
            count() AS total_events,
            avg(duration_ms) AS avg_duration
        FROM analytics.web_events
        WHERE event_date >= today() - 30
        GROUP BY event_date
        ORDER BY event_date
    """,
    "Top Pages by Views": """
        SELECT
            page_url,
            count() AS views,
            uniq(user_id) AS unique_users,
            avg(duration_ms) AS avg_duration
        FROM analytics.web_events
        WHERE event_date >= today() - 7
        GROUP BY page_url
        ORDER BY views DESC
        LIMIT 10
    """,
    "User Funnel": """
        SELECT
            countIf(event_type = 'page_view') AS views,
            countIf(event_type = 'click') AS clicks,
            countIf(event_type = 'scroll') AS scrolls,
            countIf(event_type = 'purchase') AS purchases,
            round(countIf(event_type = 'purchase') * 100.0 /
                  countIf(event_type = 'page_view'), 2) AS conversion_rate
        FROM analytics.web_events
        WHERE event_date >= today() - 30
    """,
    "Country Distribution": """
        SELECT
            country,
            uniq(user_id) AS users,
            count() AS events,
            round(avg(duration_ms) / 1000, 1) AS avg_seconds
        FROM analytics.web_events
        WHERE event_date >= today() - 30
        GROUP BY country
        ORDER BY users DESC
    """,
    "Hourly Traffic Pattern": """
        SELECT
            toHour(event_time) AS hour,
            count() AS events,
            uniq(user_id) AS users
        FROM analytics.web_events
        WHERE event_date = today()
        GROUP BY hour
        ORDER BY hour
    """,
    "Retention (Day 1, 7, 30)": """
        WITH first_seen AS (
            SELECT user_id, min(event_date) AS first_date
            FROM analytics.web_events
            GROUP BY user_id
        )
        SELECT
            first_date,
            uniq(user_id) AS cohort_size,
            uniqIf(user_id, dateDiff('day', first_date, event_date) = 1) AS day1,
            uniqIf(user_id, dateDiff('day', first_date, event_date) = 7) AS day7,
            uniqIf(user_id, dateDiff('day', first_date, event_date) = 30) AS day30
        FROM analytics.web_events
        JOIN first_seen USING (user_id)
        WHERE first_date >= today() - 60
        GROUP BY first_date
        ORDER BY first_date
    """,
}
# Print a short catalog of the available reports with their SQL length.
print("ClickHouse Analytics Queries:")
for title, sql in queries.items():
    line_count = len(sql.strip().split("\n"))
    print(f" {title} ({line_count} lines)")

# Running the queries (requires a live ClickHouse server):
# client = setup_clickhouse()
# for name, query in queries.items():
#     result = client.query(query)
#     print(f"\n{name}:")
#     for row in result.result_rows[:5]:
#         print(f" {row}")
Best Practices
- Partition by Date: ใช้ PARTITION BY toYYYYMM(date) สำหรับ Time Series Data
- ORDER BY: เลือก ORDER BY ตาม Query Pattern ที่ใช้บ่อยที่สุด
- LowCardinality: ใช้ LowCardinality สำหรับ Columns ที่มีค่าซ้ำกันมาก
- Materialized Views: ใช้ Materialized Views สำหรับ Pre-aggregate Data
- TTL: ตั้ง TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage
- Grafana Dashboard: ใช้ Grafana + ClickHouse Plugin สำหรับ Visualization
ClickHouse คืออะไร
Open-source Columnar Database สำหรับ OLAP พัฒนาโดย Yandex ประมวลผล Queries เร็วมาก รองรับหลายพันล้าน Rows Column-oriented Storage บีบอัดดี Log Analytics Time Series BI
Home Lab คืออะไร
ตั้ง Server ที่บ้านเรียนรู้ Technology ใช้ PC เก่า Raspberry Pi Mini PC NAS ติดตั้ง Proxmox ESXi VMs Containers เรียน Networking Storage Databases ไม่ต้องจ่ายค่า Cloud
ClickHouse เร็วกว่า PostgreSQL อย่างไร
Columnar Storage อ่านเฉพาะ Columns ที่ต้องการ Vectorized Execution ประมวลผลทีละ Block บีบอัด 10-40x ลด I/O Analytical Queries เร็วกว่า 100-1000x ไม่เหมาะ OLTP
Hardware ขั้นต่ำสำหรับ ClickHouse Home Lab เท่าไร
CPU 4+ cores แนะนำ 8 RAM 8GB+ แนะนำ 16-32GB SSD 256GB+ HDD Cold Storage Docker ติดตั้งง่าย Mini PC Intel NUC Beelink หรือ Proxmox VM
สรุป
ClickHouse เป็น Columnar Database ที่เร็วมากสำหรับ Analytics ติดตั้งใน Home Lab ด้วย Docker Partition by Date ORDER BY ตาม Query Pattern LowCardinality ลด Storage Materialized Views Pre-aggregate TTL ลบข้อมูลเก่า Grafana Dashboard Visualization
