W&B + ELK Stack
Weights Biases W&B MLOps ELK Elasticsearch Logstash Kibana Log Management Experiment Tracking Monitoring Production
| Aspect | W&B (wandb) | ELK Stack | Together |
|---|---|---|---|
| Purpose | ML Experiment Tracking | Log Management | Full ML Observability |
| Tracks | Metrics, Hyperparams, Models | System Logs, App Logs | End-to-end ML Pipeline |
| Dashboard | W&B Dashboard (cloud) | Kibana (self-hosted) | Correlated views |
| Alert | W&B Alerts (basic) | ElastAlert / Kibana Alerts | ML + System alerts |
| Storage | W&B Cloud / S3 | Elasticsearch indices | Metrics in W&B, Logs in ELK |
W&B Experiment Tracking
# === W&B Setup ===
# pip install wandb
# wandb login # API key from wandb.ai
# import wandb
# import torch
#
# # Initialize run
# run = wandb.init(
# project="image-classifier",
# config={
# "learning_rate": 0.001,
# "epochs": 50,
# "batch_size": 32,
# "model": "resnet50",
# "optimizer": "adam",
# "dataset": "cifar10",
# },
# tags=["production", "v2", "gpu-a100"],
# )
#
# # Training loop
# for epoch in range(run.config.epochs):
# train_loss, train_acc = train_one_epoch(model, train_loader)
# val_loss, val_acc = validate(model, val_loader)
#
# # Log metrics
# wandb.log({
# "epoch": epoch,
# "train/loss": train_loss,
# "train/accuracy": train_acc,
# "val/loss": val_loss,
# "val/accuracy": val_acc,
# "learning_rate": scheduler.get_last_lr()[0],
# "gpu/memory_used": torch.cuda.memory_allocated() / 1e9,
# "gpu/utilization": get_gpu_utilization(),
# })
#
# # Save model as artifact
# artifact = wandb.Artifact("model-v2", type="model")
# artifact.add_file("model_best.pth")
# run.log_artifact(artifact)
# run.finish()
from dataclasses import dataclass
@dataclass
class WBFeature:
    """One W&B capability: the SDK entry point that exposes it, what it is
    used for, and the benefit it brings to the team."""

    # Human-readable feature name, e.g. "Experiment Tracking".
    feature: str
    # SDK call(s) that expose the feature.
    api: str
    # Typical use case (Thai text preserved from the original data).
    use_case: str
    # Why the feature matters.
    benefit: str


# Reference catalogue of the W&B features discussed above.
features = [
    WBFeature(
        "Experiment Tracking",
        "wandb.init() + wandb.log()",
        "Track Loss Accuracy LR GPU ทุก Epoch",
        "เปรียบเทียบ Run ทั้งหมด หา Config ที่ดีที่สุด",
    ),
    WBFeature(
        "Artifacts",
        "wandb.Artifact() + run.log_artifact()",
        "Version Dataset Model Checkpoint",
        "Reproducibility ทุก Run ใช้ Dataset Model Version เดิมได้",
    ),
    WBFeature(
        "Sweeps",
        "wandb.sweep() + wandb.agent()",
        "Hyperparameter Tuning อัตโนมัติ Bayesian Grid Random",
        "หา Hyperparameter ที่ดีที่สุดโดยอัตโนมัติ",
    ),
    WBFeature(
        "Tables",
        "wandb.Table()",
        "Log Predictions Confusion Matrix Sample Images",
        "วิเคราะห์ Error Pattern ดู Predictions ผิด",
    ),
    WBFeature(
        "Reports",
        "W&B UI",
        "สร้างรายงาน แชร์กับทีม Document Findings",
        "Communication ทีมเห็นผลลัพธ์เดียวกัน",
    ),
]

print("=== W&B Features ===")
for feat in features:
    # One joined print per feature — output is identical to three prints.
    summary = "\n".join((
        f" [{feat.feature}] API: {feat.api}",
        f" Use: {feat.use_case}",
        f" Benefit: {feat.benefit}",
    ))
    print(summary)
ELK Stack Setup
# === ELK Configuration ===
# docker-compose.yml for ELK
# version: "3.8"
# services:
# elasticsearch:
# image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0
# environment:
# - discovery.type=single-node
# - xpack.security.enabled=true
# - ELASTIC_PASSWORD=changeme
# ports: ["9200:9200"]
# volumes: ["es-data:/usr/share/elasticsearch/data"]
#
# kibana:
# image: docker.elastic.co/kibana/kibana:8.12.0
# ports: ["5601:5601"]
# environment:
# - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
# - ELASTICSEARCH_USERNAME=kibana_system
# - ELASTICSEARCH_PASSWORD=changeme
#
# filebeat:
# image: docker.elastic.co/beats/filebeat:8.12.0
# volumes:
# - ./filebeat.yml:/usr/share/filebeat/filebeat.yml
# - /var/log/training:/var/log/training
# depends_on: [elasticsearch]
# filebeat.yml for ML training logs
# filebeat.inputs:
# - type: log
# paths: ["/var/log/training/*.log"]
# json.keys_under_root: true
# json.add_error_key: true
# fields:
# log_type: ml_training
# - type: log
# paths: ["/var/log/inference/*.log"]
# fields:
# log_type: ml_inference
# output.elasticsearch:
# hosts: ["elasticsearch:9200"]
# username: "elastic"
# password: "changeme"
@dataclass
class ELKComponent:
    """One moving part of the ELK stack: its role, deployment notes, and how
    it serves an ML workflow."""

    # Component name, e.g. "Elasticsearch".
    component: str
    # What the component does in the stack.
    role: str
    # Deployment / configuration notes.
    config: str
    # ML-specific usage (Thai text preserved from the original data).
    ml_use: str


# Raw rows (component, role, config, ml_use); expanded into dataclasses below.
_component_rows = [
    ("Elasticsearch",
     "Search Engine + Storage",
     "Single node dev, 3+ nodes production, ILM policy",
     "เก็บ Training Log, Inference Log, System Metrics"),
    ("Kibana",
     "Visualization + Dashboard",
     "Port 5601, connect to Elasticsearch",
     "Dashboard: GPU Usage, Training Status, Error Rate"),
    ("Filebeat",
     "Log Shipper",
     "ติดตั้งบน Training Server, อ่าน Log File",
     "ส่ง Training Log, GPU Log ไป Elasticsearch"),
    ("Logstash",
     "Log Processing Pipeline",
     "Parse JSON Log, Enrich with W&B metadata",
     "Transform Log, เพิ่ม Run ID, Project Name"),
    ("ElastAlert",
     "Alerting",
     "Rule-based alerts on Elasticsearch queries",
     "Alert เมื่อ Training Failed, OOM, GPU Error"),
]
components = [ELKComponent(*row) for row in _component_rows]

print("=== ELK Components ===")
for comp in components:
    print(f" [{comp.component}] {comp.role}")
    print(f" Config: {comp.config}")
    print(f" ML Use: {comp.ml_use}")
Integration Pipeline
# === W&B + ELK Integration ===
# Python logging to ELK
# import logging
# from pythonjsonlogger import jsonlogger
#
# logger = logging.getLogger("ml_training")
# handler = logging.FileHandler("/var/log/training/train.log")
# formatter = jsonlogger.JsonFormatter(
# "%(asctime)s %(name)s %(levelname)s %(message)s"
# )
# handler.setFormatter(formatter)
# logger.setLevel(logging.INFO)  # default effective level is WARNING; without this, logger.info() is dropped
# logger.addHandler(handler)
#
# # Log training events (picked up by Filebeat → Elasticsearch)
# logger.info("Training started", extra={
# "wandb_run_id": run.id,
# "wandb_project": "image-classifier",
# "gpu_name": torch.cuda.get_device_name(),
#     "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory,
# })
@dataclass
class IntegrationPoint:
    """One edge in the W&B + ELK data-flow graph: where data originates,
    where it lands, what it carries, how, and how often."""

    # Where the data comes from.
    source: str
    # Where the data is shipped to.
    destination: str
    # What payload flows along this edge.
    data: str
    # Transport / mechanism used.
    method: str
    # How often the transfer happens.
    frequency: str


# The five data-flow edges that tie the training stack together.
integrations = [
    IntegrationPoint(
        source="Training Script",
        destination="W&B",
        data="Metrics, Hyperparams, Artifacts",
        method="wandb.log() SDK",
        frequency="Every epoch/step",
    ),
    IntegrationPoint(
        source="Training Script",
        destination="ELK (via Filebeat)",
        data="Training logs, errors, events",
        method="Python JSON logging → File → Filebeat",
        frequency="Every log event",
    ),
    IntegrationPoint(
        source="GPU Server",
        destination="ELK (via Metricbeat)",
        data="GPU util, memory, temp, power",
        method="Metricbeat nvidia module",
        frequency="Every 10 seconds",
    ),
    IntegrationPoint(
        source="W&B API",
        destination="Elasticsearch",
        data="Run metadata, final metrics",
        method="Python script: wandb API → ES bulk index",
        frequency="After each run completes",
    ),
    IntegrationPoint(
        source="Elasticsearch",
        destination="Kibana Dashboard",
        data="Aggregated views, alerts",
        method="Kibana saved objects, ElastAlert rules",
        frequency="Real-time",
    ),
]

print("=== Integration Points ===")
for point in integrations:
    print(f" [{point.source}] → [{point.destination}]")
    print(f" Data: {point.data}")
    print(f" Method: {point.method}")
    print(f" Freq: {point.frequency}")
เคล็ดลับ
- wandb.log: Log ทุก Metric ที่สำคัญ Loss Accuracy LR GPU Memory ทุก Epoch
- Artifacts: ใช้ W&B Artifacts จัดการ Model Dataset Version ไม่เก็บใน Git
- Filebeat: ใช้ Filebeat ส่ง Log อย่า Write ตรงไป Elasticsearch
- ILM: ตั้ง Index Lifecycle Management ใน Elasticsearch ลบ Log เก่า
- Alert: ตั้ง Alert ใน Kibana เมื่อ Training Failed GPU Error OOM
Weights & Biases คืออะไร
MLOps Platform Experiment Track Metrics Hyperparameters Artifacts Sweeps Tables Reports Model Registry Python SDK PyTorch TensorFlow
ELK Stack คืออะไร
Elasticsearch Logstash Kibana Beats Filebeat Log Management Search Dashboard Alert ILM Open Source Application System Infrastructure
รวมกันอย่างไร
W&B Track Metrics ELK System Log GPU Filebeat Python Logging Elasticsearch Kibana Dashboard Correlation Training Error Alert Monitoring
Production Best Practices มีอะไร
Track ทุก Run Tag Project Team Artifacts Model Version Sweeps Log 30 วัน ILM Dashboard GPU Alert Training Failed Filebeat TLS Auth
สรุป
Weights Biases W&B ELK Stack Elasticsearch Kibana Experiment Tracking Log Management MLOps GPU Monitoring Alert Production Integration
