SiamCafe · Blog
Weights & Biases Log Management ELK — รวม ML
บทความ

Weights & Biases Log Management ELK — รวม ML

เผยแพร่ 28 พฤษภาคม 2569

W&B + ELK Stack

Weights Biases W&B MLOps ELK Elasticsearch Logstash Kibana Log Management Experiment Tracking Monitoring Production

| Aspect | W&B (wandb) | ELK Stack | Together |
| --- | --- | --- | --- |
| Purpose | ML Experiment Tracking | Log Management | Full ML Observability |
| Tracks | Metrics, Hyperparams, Models | System Logs, App Logs | End-to-end ML Pipeline |
| Dashboard | W&B Dashboard (cloud) | Kibana (self-hosted) | Correlated views |
| Alert | W&B Alerts (basic) | ElastAlert / Kibana Alerts | ML + System alerts |
| Storage | W&B Cloud / S3 | Elasticsearch indices | Metrics in W&B, Logs in ELK |

W&B Experiment Tracking

=== W&B Setup ===

pip install wandb

wandb login # API key from wandb.ai

import wandb
import torch

# Start a tracked run. The hyperparameters passed as ``config`` are stored on
# the run and are read back through ``run.config`` in the training loop.
run = wandb.init(
    project="image-classifier",
    config={
        "learning_rate": 0.001,
        "epochs": 50,
        "batch_size": 32,
        "model": "resnet50",
        "optimizer": "adam",
        "dataset": "cifar10",
    },
    tags=["production", "v2", "gpu-a100"],
)

# Training loop.
# NOTE: model, train_loader, val_loader, scheduler, train_one_epoch, validate
# and get_gpu_utilization are not defined in this snippet; they are assumed to
# come from the surrounding training code.
# The epoch count is read from run.config: a bare ``config`` name was never
# bound in this script and would raise NameError.
for epoch in range(run.config.epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader)
    val_loss, val_acc = validate(model, val_loader)

    # Log metrics for this epoch.
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": scheduler.get_last_lr()[0],
        # Allocated CUDA memory, divided by 1e9 to report gigabytes.
        "gpu/memory_used": torch.cuda.memory_allocated() / 1e9,
        "gpu/utilization": get_gpu_utilization(),
    })

# Save model as artifact.
# NOTE(review): assumes the training code has written model_best.pth to the
# working directory before this point.
artifact = wandb.Artifact("model-v2", type="model")
artifact.add_file("model_best.pth")
run.log_artifact(artifact)

run.finish()

from dataclasses import dataclass

@dataclass
class WBFeature:
    """One Weights & Biases capability listed in the feature overview."""

    feature: str  # display name of the capability
    api: str  # API call(s) or UI location that provide it
    use_case: str  # what the capability is used for
    benefit: str  # what the team gains from using it

# Catalogue of the W&B capabilities covered in this article.
features = [
    WBFeature(
        "Experiment Tracking",
        "wandb.init() + wandb.log()",
        "Track Loss Accuracy LR GPU ทุก Epoch",
        "เปรียบเทียบ Run ทั้งหมด หา Config ที่ดีที่สุด",
    ),
    WBFeature(
        "Artifacts",
        "wandb.Artifact() + run.log_artifact()",
        "Version Dataset Model Checkpoint",
        "Reproducibility ทุก Run ใช้ Dataset Model Version เดิมได้",
    ),
    WBFeature(
        "Sweeps",
        "wandb.sweep() + wandb.agent()",
        "Hyperparameter Tuning อัตโนมัติ Bayesian Grid Random",
        "หา Hyperparameter ที่ดีที่สุดโดยอัตโนมัติ",
    ),
    WBFeature(
        "Tables",
        "wandb.Table()",
        "Log Predictions Confusion Matrix Sample Images",
        "วิเคราะห์ Error Pattern ดู Predictions ผิด",
    ),
    WBFeature(
        "Reports",
        "W&B UI",
        "สร้างรายงาน แชร์กับทีม Document Findings",
        "Communication ทีมเห็นผลลัพธ์เดียวกัน",
    ),
]

# Print a three-line summary for every feature.
print("=== W&B Features ===")
for feature_row in features:
    print(f" [{feature_row.feature}] API: {feature_row.api}")
    print(f" Use: {feature_row.use_case}")
    print(f" Benefit: {feature_row.benefit}")

ELK Stack Setup

=== ELK Configuration ===

docker-compose.yml for ELK

version: "3.8"

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=true
      # Placeholder credential: replace before any non-local use.
      - ELASTIC_PASSWORD=changeme
    ports: ["9200:9200"]
    volumes: ["es-data:/usr/share/elasticsearch/data"]

  kibana:
    image: docker.elastic.co/kibana/kibana:8.12.0
    ports: ["5601:5601"]
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
      - ELASTICSEARCH_USERNAME=kibana_system
      # NOTE(review): ELASTIC_PASSWORD above only sets the `elastic` user's
      # password; confirm the kibana_system password is set to this value in
      # Elasticsearch before Kibana starts.
      - ELASTICSEARCH_PASSWORD=changeme

  filebeat:
    image: docker.elastic.co/beats/filebeat:8.12.0
    volumes:
      - ./filebeat.yml:/usr/share/filebeat/filebeat.yml
      - /var/log/training:/var/log/training
      # filebeat.yml also reads /var/log/inference/*.log, so that directory
      # has to be visible inside the container as well.
      - /var/log/inference:/var/log/inference
    depends_on: [elasticsearch]

# Named volumes referenced by a service must be declared at the top level.
volumes:
  es-data:

filebeat.yml for ML training logs

filebeat.inputs:
  # Training logs: each line is parsed as JSON and its keys are placed at the
  # top level of the event.
  - type: log
    paths: ["/var/log/training/*.log"]
    json.keys_under_root: true
    json.add_error_key: true
    fields:
      log_type: ml_training

  # Inference logs: shipped without JSON decoding settings.
  - type: log
    paths: ["/var/log/inference/*.log"]
    fields:
      log_type: ml_inference

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  username: "elastic"
  # Placeholder credential: replace before any non-local use.
  password: "changeme"

@dataclass
class ELKComponent:
    """One component of the ELK stack and how it is used for ML workloads."""

    component: str  # component name, e.g. "Elasticsearch"
    role: str  # the component's job within the stack
    config: str  # short deployment / configuration note
    ml_use: str  # what it is used for in the ML pipeline

# Catalogue of the ELK stack components covered in this article.
components = [
    ELKComponent(
        "Elasticsearch",
        "Search Engine + Storage",
        "Single node dev, 3+ nodes production, ILM policy",
        "เก็บ Training Log, Inference Log, System Metrics",
    ),
    ELKComponent(
        "Kibana",
        "Visualization + Dashboard",
        "Port 5601, connect to Elasticsearch",
        "Dashboard: GPU Usage, Training Status, Error Rate",
    ),
    ELKComponent(
        "Filebeat",
        "Log Shipper",
        "ติดตั้งบน Training Server, อ่าน Log File",
        "ส่ง Training Log, GPU Log ไป Elasticsearch",
    ),
    ELKComponent(
        "Logstash",
        "Log Processing Pipeline",
        "Parse JSON Log, Enrich with W&B metadata",
        "Transform Log, เพิ่ม Run ID, Project Name",
    ),
    ELKComponent(
        "ElastAlert",
        "Alerting",
        "Rule-based alerts on Elasticsearch queries",
        "Alert เมื่อ Training Failed, OOM, GPU Error",
    ),
]

# Print a three-line summary for every component.
print("=== ELK Components ===")
for part in components:
    print(f" [{part.component}] {part.role}")
    print(f" Config: {part.config}")
    print(f" ML Use: {part.ml_use}")

Integration Pipeline

# === W&B + ELK Integration ===

# Python logging to ELK
# import logging
# from pythonjsonlogger import jsonlogger
#
# logger = logging.getLogger("ml_training")
# handler = logging.FileHandler("/var/log/training/train.log")
# formatter = jsonlogger.JsonFormatter(
#     "%(asctime)s %(name)s %(levelname)s %(message)s"
# )
# handler.setFormatter(formatter)
# logger.addHandler(handler)
#
# # Log training events (picked up by Filebeat → Elasticsearch)
# logger.info("Training started", extra={
#     "wandb_run_id": run.id,
#     "wandb_project": "image-classifier",
#     "gpu_name": torch.cuda.get_device_name(),
#     "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory,
# })

@dataclass
class IntegrationPoint:
    """A single data flow between two parts of the W&B + ELK pipeline."""

    source: str  # where the data originates
    destination: str  # where the data is delivered
    data: str  # what is transferred
    method: str  # mechanism used for the transfer
    frequency: str  # how often the transfer happens

# Every data flow in the combined pipeline, spelled out with keyword
# arguments so each value is labelled at the call site.
integrations = [
    IntegrationPoint(
        source="Training Script",
        destination="W&B",
        data="Metrics, Hyperparams, Artifacts",
        method="wandb.log() SDK",
        frequency="Every epoch/step",
    ),
    IntegrationPoint(
        source="Training Script",
        destination="ELK (via Filebeat)",
        data="Training logs, errors, events",
        method="Python JSON logging → File → Filebeat",
        frequency="Every log event",
    ),
    IntegrationPoint(
        source="GPU Server",
        destination="ELK (via Metricbeat)",
        data="GPU util, memory, temp, power",
        method="Metricbeat nvidia module",
        frequency="Every 10 seconds",
    ),
    IntegrationPoint(
        source="W&B API",
        destination="Elasticsearch",
        data="Run metadata, final metrics",
        method="Python script: wandb API → ES bulk index",
        frequency="After each run completes",
    ),
    IntegrationPoint(
        source="Elasticsearch",
        destination="Kibana Dashboard",
        data="Aggregated views, alerts",
        method="Kibana saved objects, ElastAlert rules",
        frequency="Real-time",
    ),
]

# Emit a header, then a four-line summary per integration point.
print("=== Integration Points ===")
for point in integrations:
    report_lines = (
        f"  [{point.source}] → [{point.destination}]",
        f"    Data: {point.data}",
        f"    Method: {point.method}",
        f"    Freq: {point.frequency}",
    )
    print(*report_lines, sep="\n")

เคล็ดลับ

  • wandb.log: Log ทุก Metric ที่สำคัญ Loss Accuracy LR GPU Memory ทุก Epoch
  • Artifacts: ใช้ W&B Artifacts จัดการ Model Dataset Version ไม่เก็บใน Git
  • Filebeat: ใช้ Filebeat ส่ง Log อย่า Write ตรงไป Elasticsearch
  • ILM: ตั้ง Index Lifecycle Management ใน Elasticsearch ลบ Log เก่า
  • Alert: ตั้ง Alert ใน Kibana เมื่อ Training Failed GPU Error OOM

Weights & Biases คืออะไร

MLOps Platform Experiment Track Metrics Hyperparameters Artifacts Sweeps Tables Reports Model Registry Python SDK PyTorch TensorFlow