it

ClickHouse Analytics กับ Home Lab Setup —

ClickHouse Analytics กับ Home Lab Setup —

ClickHouse Analytics Database

ClickHouse Analytics กับ Home Lab Setup —

ClickHouse เป็น Columnar Database สำหรับ OLAP ประมวลผล Queries เร็วมาก รองรับข้อมูลหลายพันล้าน Rows ใช้ Column-oriented Storage บีบอัดข้อมูลได้ดี เหมาะกับ Analytics

เนื้อหาเกี่ยวข้อง — อ่านต่อ: Apache Hudi Tech Conference 2026

Home Lab ให้ทดลอง ClickHouse ที่บ้าน ไม่ต้องจ่ายค่า Cloud ใช้ PC เก่าหรือ Mini PC ติดตั้งผ่าน Docker เรียนรู้ Data Engineering จริง

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: Wireless Site Survey High Availability HA Setup

ClickHouse Installation

# === ClickHouse Installation สำหรับ Home Lab ===



# 1. Docker Installation (แนะนำ)

docker run -d \

  --name clickhouse \

  --ulimit nofile=262144:262144 \

  -p 8123:8123 \

  -p 9000:9000 \

  -v clickhouse_data:/var/lib/clickhouse \

  -v clickhouse_logs:/var/log/clickhouse-server \

  clickhouse/clickhouse-server:latest



# 2. Docker Compose (Production-ready)

# docker-compose.yml

# version: '3.8'

# services:

#   clickhouse:

#     image: clickhouse/clickhouse-server:latest

#     ports:

#       - "8123:8123"   # HTTP

#       - "9000:9000"   # Native

#     volumes:

#       - ./data:/var/lib/clickhouse

#       - ./logs:/var/log/clickhouse-server

#       - ./config:/etc/clickhouse-server/config.d

#       - ./users:/etc/clickhouse-server/users.d

#     environment:

#       CLICKHOUSE_DB: analytics

#       CLICKHOUSE_USER: admin

#       CLICKHOUSE_PASSWORD: secure_password

#     ulimits:

#       nofile:

#         soft: 262144

#         hard: 262144

#     deploy:

#       resources:

#         limits:

#           memory: 8G

#

#   grafana:

#     image: grafana/grafana:latest

#     ports:

#       - "3000:3000"

#     environment:

#       GF_INSTALL_PLUGINS: grafana-clickhouse-datasource

#     volumes:

#       - grafana_data:/var/lib/grafana



# 3. Native Installation (Ubuntu)

sudo apt-get install -y apt-transport-https ca-certificates

sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754

echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list

sudo apt-get update

sudo apt-get install -y clickhouse-server clickhouse-client



sudo systemctl start clickhouse-server

sudo systemctl enable clickhouse-server



# 4. ตรวจสอบ

clickhouse-client --query "SELECT version()"

curl http://localhost:8123/?query=SELECT+1



# 5. Performance Tuning (config.d/performance.xml)

# 

#   8000000000

#   8

#   100

#   5368709120

# 



echo "ClickHouse installed"

echo "  HTTP: http://localhost:8123"

echo "  Native: localhost:9000"

echo "  Client: clickhouse-client"

Table Design และ Data Loading

ClickHouse Analytics กับ Home Lab Setup —
# === ClickHouse Table Design สำหรับ Analytics ===



# clickhouse-client

# หรือ curl -X POST http://localhost:8123 --data-binary @query.sql



# 1. Web Analytics Table

# CREATE TABLE web_events (

#     event_date Date,

#     event_time DateTime,

#     user_id UInt64,

#     session_id String,

#     event_type LowCardinality(String),

#     page_url String,

#     referrer String,

#     device LowCardinality(String),

#     browser LowCardinality(String),

#     country LowCardinality(String),

#     city String,

#     duration_ms UInt32,

#     is_bounce UInt8

# ) ENGINE = MergeTree()

# PARTITION BY toYYYYMM(event_date)

# ORDER BY (event_date, user_id, event_time)

# TTL event_date + INTERVAL 1 YEAR

# SETTINGS index_granularity = 8192;



# 2. Server Metrics Table

# CREATE TABLE server_metrics (

#     timestamp DateTime,

#     hostname LowCardinality(String),

#     metric_name LowCardinality(String),

#     value Float64,

#     tags Map(String, String)

# ) ENGINE = MergeTree()

# PARTITION BY toYYYYMMDD(timestamp)

# ORDER BY (hostname, metric_name, timestamp)

# TTL timestamp + INTERVAL 90 DAY;



# 3. Materialized View สำหรับ Real-time Aggregation

# CREATE MATERIALIZED VIEW hourly_page_views

# ENGINE = SummingMergeTree()

# PARTITION BY toYYYYMM(hour)

# ORDER BY (hour, page_url, country)

# AS SELECT

#     toStartOfHour(event_time) AS hour,

#     page_url,

#     country,

#     count() AS views,

#     uniq(user_id) AS unique_users,

#     avg(duration_ms) AS avg_duration

# FROM web_events

# GROUP BY hour, page_url, country;



# 4. Python Data Loading

# pip install clickhouse-connect pandas



import clickhouse_connect

import pandas as pd

import numpy as np

from datetime import datetime, timedelta



def setup_clickhouse():

    """Setup ClickHouse Connection และ Tables"""

    client = clickhouse_connect.get_client(

        host='localhost', port=8123,

        username='default', password='',

    )



    # สร้าง Database

    client.command("CREATE DATABASE IF NOT EXISTS analytics")



    # สร้าง Table

    client.command("""

        CREATE TABLE IF NOT EXISTS analytics.web_events (

            event_date Date,

            event_time DateTime,

            user_id UInt64,

            session_id String,

            event_type LowCardinality(String),

            page_url String,

            country LowCardinality(String),

            duration_ms UInt32

        ) ENGINE = MergeTree()

        PARTITION BY toYYYYMM(event_date)

        ORDER BY (event_date, user_id, event_time)

    """)



    return client



def generate_sample_data(n=100000):

    """สร้างข้อมูลตัวอย่าง"""

    np.random.seed(42)

    pages = ["/home", "/products", "/cart", "/checkout", "/blog", "/about"]

    countries = ["TH", "US", "JP", "SG", "MY", "ID", "VN", "PH"]

    events = ["page_view", "click", "scroll", "purchase"]



    dates = [datetime.now() - timedelta(days=np.random.randint(0, 90)) for _ in range(n)]



    data = {

        "event_date": [d.date() for d in dates],

        "event_time": dates,

        "user_id": np.random.randint(1, 10000, n),

        "session_id": [f"s{i}" for i in np.random.randint(1, 50000, n)],

        "event_type": np.random.choice(events, n),

        "page_url": np.random.choice(pages, n),

        "country": np.random.choice(countries, n),

        "duration_ms": np.random.randint(100, 30000, n),

    }



    return pd.DataFrame(data)



# client = setup_clickhouse()

# df = generate_sample_data(1000000)

# client.insert_df("analytics.web_events", df)

# print(f"Inserted {len(df)} rows")



print("ClickHouse Tables:")

print("  web_events: Web analytics events")

print("  server_metrics: Server monitoring")

print("  hourly_page_views: Materialized View")

Analytics Queries

# === ClickHouse Analytics Queries ===



queries = {

    "Daily Active Users": """

        SELECT

            event_date,

            uniq(user_id) AS dau,

            count() AS total_events,

            avg(duration_ms) AS avg_duration

        FROM analytics.web_events

        WHERE event_date >= today() - 30

        GROUP BY event_date

        ORDER BY event_date

    """,



    "Top Pages by Views": """

        SELECT

            page_url,

            count() AS views,

            uniq(user_id) AS unique_users,

            avg(duration_ms) AS avg_duration,

            countIf(is_bounce = 1) * 100.0 / count() AS bounce_rate

        FROM analytics.web_events

        WHERE event_date >= today() - 7

        GROUP BY page_url

        ORDER BY views DESC

        LIMIT 10

    """,



    "User Funnel": """

        SELECT

            countIf(event_type = 'page_view') AS views,

            countIf(event_type = 'click') AS clicks,

            countIf(event_type = 'scroll') AS scrolls,

            countIf(event_type = 'purchase') AS purchases,

            round(countIf(event_type = 'purchase') * 100.0 /

                  countIf(event_type = 'page_view'), 2) AS conversion_rate

        FROM analytics.web_events

        WHERE event_date >= today() - 30

    """,



    "Country Distribution": """

        SELECT

            country,

            uniq(user_id) AS users,

            count() AS events,

            round(avg(duration_ms) / 1000, 1) AS avg_seconds

        FROM analytics.web_events

        WHERE event_date >= today() - 30

        GROUP BY country

        ORDER BY users DESC

    """,



    "Hourly Traffic Pattern": """

        SELECT

            toHour(event_time) AS hour,

            count() AS events,

            uniq(user_id) AS users

        FROM analytics.web_events

        WHERE event_date = today()

        GROUP BY hour

        ORDER BY hour

    """,



    "Retention (Day 1, 7, 30)": """

        WITH first_seen AS (

            SELECT user_id, min(event_date) AS first_date

            FROM analytics.web_events

            GROUP BY user_id

        )

        SELECT

            first_date,

            count() AS cohort_size,

            countIf(dateDiff('day', first_date, event_date) = 1) AS day1,

            countIf(dateDiff('day', first_date, event_date) = 7) AS day7,

            countIf(dateDiff('day', first_date, event_date) = 30) AS day30

        FROM analytics.web_events

        JOIN first_seen USING (user_id)

        WHERE first_date >= today() - 60

        GROUP BY first_date

        ORDER BY first_date

    """,

}



print("ClickHouse Analytics Queries:")

for name, query in queries.items():

    lines = len(query.strip().split("\n"))

    print(f"  {name} ({lines} lines)")



# รัน Query

# client = setup_clickhouse()

# for name, query in queries.items():

#     result = client.query(query)

#     print(f"\n{name}:")

#     for row in result.result_rows[:5]:

#         print(f"  {row}")

Best Practices

  • Partition by Date: ใช้ PARTITION BY toYYYYMM(date) สำหรับ Time Series Data
  • ORDER BY: เลือก ORDER BY ตาม Query Pattern ที่ใช้บ่อยที่สุด
  • LowCardinality: ใช้ LowCardinality สำหรับ Columns ที่มีค่าซ้ำกันมาก
  • Materialized Views: ใช้ Materialized Views สำหรับ Pre-aggregate Data
  • TTL: ตั้ง TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage
  • Grafana Dashboard: ใช้ Grafana + ClickHouse Plugin สำหรับ Visualization

ClickHouse คืออะไร

Open-source Columnar Database สำหรับ OLAP พัฒนาโดย Yandex ประมวลผล Queries เร็วมาก รองรับหลายพันล้าน Rows Column-oriented Storage บีบอัดดี Log Analytics Time Series BI

แนะนำเพิ่มเติม — คู่มือเทรดจาก SiamCafeBook

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง Rust Serde Clean Architecture

XM Legend · เทรดเดอร์ & ผู้สอน Forex 13 ปี

ผู้ก่อตั้ง SiamCafe ตั้งแต่ปี 1997 · เทรดเดอร์สาย Forex มากกว่า 13 ปี ได้รับการยกย่องเป็น XM Legend · แบ่งปันความรู้ Forex, ไอที, AI และการเทรด จากประสบการณ์จริงในตลาดจริง