it
ClickHouse Analytics กับ Home Lab Setup —
ClickHouse Analytics Database

ClickHouse เป็น Columnar Database สำหรับ OLAP ประมวลผล Queries เร็วมาก รองรับข้อมูลหลายพันล้าน Rows ใช้ Column-oriented Storage บีบอัดข้อมูลได้ดี เหมาะกับ Analytics
เนื้อหาเกี่ยวข้อง — อ่านต่อ: Apache Hudi Tech Conference 2026
Home Lab ให้ทดลอง ClickHouse ที่บ้าน ไม่ต้องจ่ายค่า Cloud ใช้ PC เก่าหรือ Mini PC ติดตั้งผ่าน Docker เรียนรู้ Data Engineering จริง
เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: Wireless Site Survey High Availability HA Setup
ClickHouse Installation
# === ClickHouse Installation สำหรับ Home Lab ===
# 1. Docker Installation (แนะนำ)
docker run -d \
--name clickhouse \
--ulimit nofile=262144:262144 \
-p 8123:8123 \
-p 9000:9000 \
-v clickhouse_data:/var/lib/clickhouse \
-v clickhouse_logs:/var/log/clickhouse-server \
clickhouse/clickhouse-server:latest
# 2. Docker Compose (Production-ready)
# docker-compose.yml
# version: '3.8'
# services:
# clickhouse:
# image: clickhouse/clickhouse-server:latest
# ports:
# - "8123:8123" # HTTP
# - "9000:9000" # Native
# volumes:
# - ./data:/var/lib/clickhouse
# - ./logs:/var/log/clickhouse-server
# - ./config:/etc/clickhouse-server/config.d
# - ./users:/etc/clickhouse-server/users.d
# environment:
# CLICKHOUSE_DB: analytics
# CLICKHOUSE_USER: admin
# CLICKHOUSE_PASSWORD: secure_password
# ulimits:
# nofile:
# soft: 262144
# hard: 262144
# deploy:
# resources:
# limits:
# memory: 8G
#
# grafana:
# image: grafana/grafana:latest
# ports:
# - "3000:3000"
# environment:
# GF_INSTALL_PLUGINS: grafana-clickhouse-datasource
# volumes:
# - grafana_data:/var/lib/grafana
# 3. Native Installation (Ubuntu)
sudo apt-get install -y apt-transport-https ca-certificates
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client
sudo systemctl start clickhouse-server
sudo systemctl enable clickhouse-server
# 4. ตรวจสอบ
clickhouse-client --query "SELECT version()"
curl http://localhost:8123/?query=SELECT+1
# 5. Performance Tuning (config.d/performance.xml)
#
# 8000000000
# 8
# 100
# 5368709120
#
echo "ClickHouse installed"
echo " HTTP: http://localhost:8123"
echo " Native: localhost:9000"
echo " Client: clickhouse-client"
Table Design และ Data Loading

# === ClickHouse Table Design สำหรับ Analytics ===
# clickhouse-client
# หรือ curl -X POST http://localhost:8123 --data-binary @query.sql
# 1. Web Analytics Table
# CREATE TABLE web_events (
# event_date Date,
# event_time DateTime,
# user_id UInt64,
# session_id String,
# event_type LowCardinality(String),
# page_url String,
# referrer String,
# device LowCardinality(String),
# browser LowCardinality(String),
# country LowCardinality(String),
# city String,
# duration_ms UInt32,
# is_bounce UInt8
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, user_id, event_time)
# TTL event_date + INTERVAL 1 YEAR
# SETTINGS index_granularity = 8192;
# 2. Server Metrics Table
# CREATE TABLE server_metrics (
# timestamp DateTime,
# hostname LowCardinality(String),
# metric_name LowCardinality(String),
# value Float64,
# tags Map(String, String)
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMMDD(timestamp)
# ORDER BY (hostname, metric_name, timestamp)
# TTL timestamp + INTERVAL 90 DAY;
# 3. Materialized View สำหรับ Real-time Aggregation
# CREATE MATERIALIZED VIEW hourly_page_views
# ENGINE = SummingMergeTree()
# PARTITION BY toYYYYMM(hour)
# ORDER BY (hour, page_url, country)
# AS SELECT
# toStartOfHour(event_time) AS hour,
# page_url,
# country,
# count() AS views,
# uniq(user_id) AS unique_users,
# avg(duration_ms) AS avg_duration
# FROM web_events
# GROUP BY hour, page_url, country;
# 4. Python Data Loading
# pip install clickhouse-connect pandas
import clickhouse_connect
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
def setup_clickhouse():
"""Setup ClickHouse Connection และ Tables"""
client = clickhouse_connect.get_client(
host='localhost', port=8123,
username='default', password='',
)
# สร้าง Database
client.command("CREATE DATABASE IF NOT EXISTS analytics")
# สร้าง Table
client.command("""
CREATE TABLE IF NOT EXISTS analytics.web_events (
event_date Date,
event_time DateTime,
user_id UInt64,
session_id String,
event_type LowCardinality(String),
page_url String,
country LowCardinality(String),
duration_ms UInt32
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id, event_time)
""")
return client
def generate_sample_data(n=100000):
"""สร้างข้อมูลตัวอย่าง"""
np.random.seed(42)
pages = ["/home", "/products", "/cart", "/checkout", "/blog", "/about"]
countries = ["TH", "US", "JP", "SG", "MY", "ID", "VN", "PH"]
events = ["page_view", "click", "scroll", "purchase"]
dates = [datetime.now() - timedelta(days=np.random.randint(0, 90)) for _ in range(n)]
data = {
"event_date": [d.date() for d in dates],
"event_time": dates,
"user_id": np.random.randint(1, 10000, n),
"session_id": [f"s{i}" for i in np.random.randint(1, 50000, n)],
"event_type": np.random.choice(events, n),
"page_url": np.random.choice(pages, n),
"country": np.random.choice(countries, n),
"duration_ms": np.random.randint(100, 30000, n),
}
return pd.DataFrame(data)
# client = setup_clickhouse()
# df = generate_sample_data(1000000)
# client.insert_df("analytics.web_events", df)
# print(f"Inserted {len(df)} rows")
print("ClickHouse Tables:")
print(" web_events: Web analytics events")
print(" server_metrics: Server monitoring")
print(" hourly_page_views: Materialized View")
Analytics Queries
# === ClickHouse Analytics Queries ===
queries = {
"Daily Active Users": """
SELECT
event_date,
uniq(user_id) AS dau,
count() AS total_events,
avg(duration_ms) AS avg_duration
FROM analytics.web_events
WHERE event_date >= today() - 30
GROUP BY event_date
ORDER BY event_date
""",
"Top Pages by Views": """
SELECT
page_url,
count() AS views,
uniq(user_id) AS unique_users,
avg(duration_ms) AS avg_duration,
countIf(is_bounce = 1) * 100.0 / count() AS bounce_rate
FROM analytics.web_events
WHERE event_date >= today() - 7
GROUP BY page_url
ORDER BY views DESC
LIMIT 10
""",
"User Funnel": """
SELECT
countIf(event_type = 'page_view') AS views,
countIf(event_type = 'click') AS clicks,
countIf(event_type = 'scroll') AS scrolls,
countIf(event_type = 'purchase') AS purchases,
round(countIf(event_type = 'purchase') * 100.0 /
countIf(event_type = 'page_view'), 2) AS conversion_rate
FROM analytics.web_events
WHERE event_date >= today() - 30
""",
"Country Distribution": """
SELECT
country,
uniq(user_id) AS users,
count() AS events,
round(avg(duration_ms) / 1000, 1) AS avg_seconds
FROM analytics.web_events
WHERE event_date >= today() - 30
GROUP BY country
ORDER BY users DESC
""",
"Hourly Traffic Pattern": """
SELECT
toHour(event_time) AS hour,
count() AS events,
uniq(user_id) AS users
FROM analytics.web_events
WHERE event_date = today()
GROUP BY hour
ORDER BY hour
""",
"Retention (Day 1, 7, 30)": """
WITH first_seen AS (
SELECT user_id, min(event_date) AS first_date
FROM analytics.web_events
GROUP BY user_id
)
SELECT
first_date,
count() AS cohort_size,
countIf(dateDiff('day', first_date, event_date) = 1) AS day1,
countIf(dateDiff('day', first_date, event_date) = 7) AS day7,
countIf(dateDiff('day', first_date, event_date) = 30) AS day30
FROM analytics.web_events
JOIN first_seen USING (user_id)
WHERE first_date >= today() - 60
GROUP BY first_date
ORDER BY first_date
""",
}
print("ClickHouse Analytics Queries:")
for name, query in queries.items():
lines = len(query.strip().split("\n"))
print(f" {name} ({lines} lines)")
# รัน Query
# client = setup_clickhouse()
# for name, query in queries.items():
# result = client.query(query)
# print(f"\n{name}:")
# for row in result.result_rows[:5]:
# print(f" {row}")
Best Practices
- Partition by Date: ใช้ PARTITION BY toYYYYMM(date) สำหรับ Time Series Data
- ORDER BY: เลือก ORDER BY ตาม Query Pattern ที่ใช้บ่อยที่สุด
- LowCardinality: ใช้ LowCardinality สำหรับ Columns ที่มีค่าซ้ำกันมาก
- Materialized Views: ใช้ Materialized Views สำหรับ Pre-aggregate Data
- TTL: ตั้ง TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage
- Grafana Dashboard: ใช้ Grafana + ClickHouse Plugin สำหรับ Visualization
ClickHouse คืออะไร
Open-source Columnar Database สำหรับ OLAP พัฒนาโดย Yandex ประมวลผล Queries เร็วมาก รองรับหลายพันล้าน Rows Column-oriented Storage บีบอัดดี Log Analytics Time Series BI
แนะนำเพิ่มเติม — คู่มือเทรดจาก SiamCafeBook
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง Rust Serde Clean Architecture





