Weights & Biases Log Management ELK — รวม ML
W&B + ELK Stack
Weights Biases W&B MLOps ELK Elasticsearch Logstash Kibana Log Management Experiment Tracking Monitoring Production
| Aspect | W&B (wandb) | ELK Stack | Together |
|---|---|---|---|
| Purpose | ML Experiment Tracking | Log Management | Full ML Observability |
| Tracks | Metrics, Hyperparams, Models | System Logs, App Logs | End-to-end ML Pipeline |
| Dashboard | W&B Dashboard (cloud) | Kibana (self-hosted) | Correlated views |
| Alert | W&B Alerts (basic) | ElastAlert / Kibana Alerts | ML + System alerts |
| Storage | W&B Cloud / S3 | Elasticsearch indices | Metrics in W&B, Logs in ELK |
W&B Experiment Tracking
# === W&B Setup ===
# Shell prerequisites (run in a terminal, not inside Python):
#   pip install wandb
#   wandb login  # API key from wandb.ai

import wandb
import torch

# Initialize run: the hyperparameters passed as `config` are recorded with
# the run, and the tags are attached to it.
run = wandb.init(
    project="image-classifier",
    config={
        "learning_rate": 0.001,
        "epochs": 50,
        "batch_size": 32,
        "model": "resnet50",
        "optimizer": "adam",
        "dataset": "cifar10",
    },
    tags=["production", "v2", "gpu-a100"],
)

# Bind the run's config so the loop below can read hyperparameters from it;
# without this binding `config` would be an undefined name.
config = run.config

# Training loop
# NOTE: model, train_loader, val_loader, scheduler, train_one_epoch, validate
# and get_gpu_utilization are not defined in this snippet; the surrounding
# training script has to provide them.
for epoch in range(config.epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader)
    val_loss, val_acc = validate(model, val_loader)

    # Log metrics (one wandb.log call per epoch)
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": scheduler.get_last_lr()[0],
        # Bytes allocated by tensors, divided by 1e9 to report gigabytes.
        "gpu/memory_used": torch.cuda.memory_allocated() / 1e9,
        "gpu/utilization": get_gpu_utilization(),
    })

# Save model as artifact (after training, so the file is uploaded once)
artifact = wandb.Artifact("model-v2", type="model")
artifact.add_file("model_best.pth")
run.log_artifact(artifact)

run.finish()
from dataclasses import dataclass
@dataclass
class WBFeature:
    """One row of the W&B feature overview printed below."""

    feature: str   # feature name shown in square brackets
    api: str       # SDK call(s) or UI entry point for the feature
    use_case: str  # what the feature is used for
    benefit: str   # why it is worth using


# Overview table of W&B features; the descriptions are user-facing Thai text.
features = [
    WBFeature("Experiment Tracking",
              "wandb.init() + wandb.log()",
              "Track Loss Accuracy LR GPU ทุก Epoch",
              "เปรียบเทียบ Run ทั้งหมด หา Config ที่ดีที่สุด"),
    WBFeature("Artifacts",
              "wandb.Artifact() + run.log_artifact()",
              "Version Dataset Model Checkpoint",
              "Reproducibility ทุก Run ใช้ Dataset Model Version เดิมได้"),
    WBFeature("Sweeps",
              "wandb.sweep() + wandb.agent()",
              "Hyperparameter Tuning อัตโนมัติ Bayesian Grid Random",
              "หา Hyperparameter ที่ดีที่สุดโดยอัตโนมัติ"),
    WBFeature("Tables",
              "wandb.Table()",
              "Log Predictions Confusion Matrix Sample Images",
              "วิเคราะห์ Error Pattern ดู Predictions ผิด"),
    WBFeature("Reports",
              "W&B UI",
              "สร้างรายงาน แชร์กับทีม Document Findings",
              "Communication ทีมเห็นผลลัพธ์เดียวกัน"),
]

# Print the overview: one header line per feature followed by its details.
print("=== W&B Features ===")
for f in features:
    print(f" [{f.feature}] API: {f.api}")
    print(f" Use: {f.use_case}")
    print(f" Benefit: {f.benefit}")
ELK Stack Setup
=== ELK Configuration ===
# docker-compose.yml for ELK
# Single-node Elasticsearch + Kibana + Filebeat for development use.
version: "3.8"
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=true
      # Bootstrap password for the built-in `elastic` user; replace the
      # placeholder before using this outside a local sandbox.
      - ELASTIC_PASSWORD=changeme
    ports: ["9200:9200"]
    volumes: ["es-data:/usr/share/elasticsearch/data"]
  kibana:
    image: docker.elastic.co/kibana/kibana:8.12.0
    ports: ["5601:5601"]
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
      # NOTE(review): ELASTIC_PASSWORD above does not set the password of the
      # kibana_system user; set it (e.g. through the Elasticsearch
      # change-password API) to the value below before Kibana can log in.
      - ELASTICSEARCH_USERNAME=kibana_system
      - ELASTICSEARCH_PASSWORD=changeme
  filebeat:
    image: docker.elastic.co/beats/filebeat:8.12.0
    volumes:
      - ./filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
      - /var/log/training:/var/log/training:ro
      # The Filebeat config also reads /var/log/inference/*.log, so that
      # directory has to be visible inside the container as well.
      - /var/log/inference:/var/log/inference:ro
    depends_on: [elasticsearch]
# Named volumes referenced by a service must be declared at the top level.
volumes:
  es-data:
# filebeat.yml for ML training logs
# NOTE(review): the `log` input type is deprecated in favour of `filestream`
# in recent Filebeat releases - confirm before upgrading the image.
filebeat.inputs:
  # Training logs: each line is decoded as JSON.
  - type: log
    paths: ["/var/log/training/*.log"]
    # Place decoded JSON keys at the top level of the event.
    json.keys_under_root: true
    # Add an error key to the event when a line fails to decode.
    json.add_error_key: true
    fields:
      log_type: ml_training
  # Inference logs: shipped as plain lines (no JSON options configured).
  - type: log
    paths: ["/var/log/inference/*.log"]
    fields:
      log_type: ml_inference
output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  # Placeholder credentials; replace before using outside a local sandbox.
  username: "elastic"
  password: "changeme"
@dataclass
class ELKComponent:
    """One row of the ELK component overview printed below."""

    component: str  # component name shown in square brackets
    role: str       # what the component does in the stack
    config: str     # configuration / deployment notes
    ml_use: str     # how the component is used for ML workloads


# Overview table of ELK components; some descriptions are user-facing Thai text.
components = [
    ELKComponent("Elasticsearch",
                 "Search Engine + Storage",
                 "Single node dev, 3+ nodes production, ILM policy",
                 "เก็บ Training Log, Inference Log, System Metrics"),
    ELKComponent("Kibana",
                 "Visualization + Dashboard",
                 "Port 5601, connect to Elasticsearch",
                 "Dashboard: GPU Usage, Training Status, Error Rate"),
    ELKComponent("Filebeat",
                 "Log Shipper",
                 "ติดตั้งบน Training Server, อ่าน Log File",
                 "ส่ง Training Log, GPU Log ไป Elasticsearch"),
    ELKComponent("Logstash",
                 "Log Processing Pipeline",
                 "Parse JSON Log, Enrich with W&B metadata",
                 "Transform Log, เพิ่ม Run ID, Project Name"),
    ELKComponent("ElastAlert",
                 "Alerting",
                 "Rule-based alerts on Elasticsearch queries",
                 "Alert เมื่อ Training Failed, OOM, GPU Error"),
]

# Print the overview: one header line per component followed by its details.
print("=== ELK Components ===")
for c in components:
    print(f" [{c.component}] {c.role}")
    print(f" Config: {c.config}")
    print(f" ML Use: {c.ml_use}")
Integration Pipeline
# === W&B + ELK Integration ===
# Python logging to ELK
# import logging
# from pythonjsonlogger import jsonlogger
#
# logger = logging.getLogger("ml_training")
# handler = logging.FileHandler("/var/log/training/train.log")
# formatter = jsonlogger.JsonFormatter(
# "%(asctime)s %(name)s %(levelname)s %(message)s"
# )
# handler.setFormatter(formatter)
# logger.addHandler(handler)
#
# # Log training events (picked up by Filebeat → Elasticsearch)
# logger.info("Training started", extra={
# "wandb_run_id": run.id,
# "wandb_project": "image-classifier",
# "gpu_name": torch.cuda.get_device_name(),
# "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory,
# })
@dataclass
class IntegrationPoint:
    """One data flow between two systems in the W&B + ELK pipeline."""

    source: str       # system that produces the data
    destination: str  # system that receives the data
    data: str         # what is transferred
    method: str       # mechanism used for the transfer
    frequency: str    # how often the transfer happens


# Data flows that connect the training script, W&B and the ELK stack.
integrations = [
    IntegrationPoint("Training Script",
                     "W&B", "Metrics, Hyperparams, Artifacts",
                     "wandb.log() SDK", "Every epoch/step"),
    IntegrationPoint("Training Script",
                     "ELK (via Filebeat)", "Training logs, errors, events",
                     "Python JSON logging → File → Filebeat",
                     "Every log event"),
    IntegrationPoint("GPU Server",
                     "ELK (via Metricbeat)", "GPU util, memory, temp, power",
                     "Metricbeat nvidia module",
                     "Every 10 seconds"),
    IntegrationPoint("W&B API",
                     "Elasticsearch", "Run metadata, final metrics",
                     "Python script: wandb API → ES bulk index",
                     "After each run completes"),
    IntegrationPoint("Elasticsearch",
                     "Kibana Dashboard", "Aggregated views, alerts",
                     "Kibana saved objects, ElastAlert rules",
                     "Real-time"),
]

# Print the overview: one "source → destination" line per flow plus details.
print("=== Integration Points ===")
for i in integrations:
    print(f" [{i.source}] → [{i.destination}]")
    print(f" Data: {i.data}")
    print(f" Method: {i.method}")
    print(f" Freq: {i.frequency}")
เคล็ดลับ
- wandb.log: Log ทุก Metric ที่สำคัญ Loss Accuracy LR GPU Memory ทุก Epoch
- Artifacts: ใช้ W&B Artifacts จัดการ Model Dataset Version ไม่เก็บใน Git
- Filebeat: ใช้ Filebeat ส่ง Log อย่า Write ตรงไป Elasticsearch
- ILM: ตั้ง Index Lifecycle Management ใน Elasticsearch ลบ Log เก่า
- Alert: ตั้ง Alert ใน Kibana เมื่อ Training Failed GPU Error OOM
Weights & Biases คืออะไร
MLOps Platform Experiment Track Metrics Hyperparameters Artifacts Sweeps Tables Reports Model Registry Python SDK PyTorch TensorFlow