SiamCafe.net Blog
Cybersecurity

Weights Biases Log Management ELK

weights biases log management elk
Weights Biases Log Management ELK | SiamCafe Blog
2025-07-04· อ. บอม — SiamCafe.net· 11,523 คำ

W&B + ELK Stack

Weights Biases W&B MLOps ELK Elasticsearch Logstash Kibana Log Management Experiment Tracking Monitoring Production

| Aspect    | W&B (wandb)                    | ELK Stack                 | Together                     |
|-----------|--------------------------------|---------------------------|------------------------------|
| Purpose   | ML Experiment Tracking         | Log Management            | Full ML Observability        |
| Tracks    | Metrics, Hyperparams, Models   | System Logs, App Logs     | End-to-end ML Pipeline       |
| Dashboard | W&B Dashboard (cloud)          | Kibana (self-hosted)      | Correlated views             |
| Alert     | W&B Alerts (basic)             | ElastAlert / Kibana Alerts| ML + System alerts           |
| Storage   | W&B Cloud / S3                 | Elasticsearch indices     | Metrics in W&B, Logs in ELK  |

W&B Experiment Tracking

# === W&B Setup ===

# pip install wandb
# wandb login  # API key from wandb.ai

# import wandb
# import torch
#
# # Initialize run
# run = wandb.init(
#     project="image-classifier",
#     config={
#         "learning_rate": 0.001,
#         "epochs": 50,
#         "batch_size": 32,
#         "model": "resnet50",
#         "optimizer": "adam",
#         "dataset": "cifar10",
#     },
#     tags=["production", "v2", "gpu-a100"],
# )
#
# # Training loop
# for epoch in range(config.epochs):
#     train_loss, train_acc = train_one_epoch(model, train_loader)
#     val_loss, val_acc = validate(model, val_loader)
#     
#     # Log metrics
#     wandb.log({
#         "epoch": epoch,
#         "train/loss": train_loss,
#         "train/accuracy": train_acc,
#         "val/loss": val_loss,
#         "val/accuracy": val_acc,
#         "learning_rate": scheduler.get_last_lr()[0],
#         "gpu/memory_used": torch.cuda.memory_allocated() / 1e9,
#         "gpu/utilization": get_gpu_utilization(),
#     })
#
# # Save model as artifact
# artifact = wandb.Artifact("model-v2", type="model")
# artifact.add_file("model_best.pth")
# run.log_artifact(artifact)
# run.finish()

from dataclasses import dataclass

@dataclass
class WBFeature:
    feature: str
    api: str
    use_case: str
    benefit: str

features = [
    WBFeature("Experiment Tracking",
        "wandb.init() + wandb.log()",
        "Track Loss Accuracy LR GPU ทุก Epoch",
        "เปรียบเทียบ Run ทั้งหมด หา Config ที่ดีที่สุด"),
    WBFeature("Artifacts",
        "wandb.Artifact() + run.log_artifact()",
        "Version Dataset Model Checkpoint",
        "Reproducibility ทุก Run ใช้ Dataset Model Version เดิมได้"),
    WBFeature("Sweeps",
        "wandb.sweep() + wandb.agent()",
        "Hyperparameter Tuning อัตโนมัติ Bayesian Grid Random",
        "หา Hyperparameter ที่ดีที่สุดโดยอัตโนมัติ"),
    WBFeature("Tables",
        "wandb.Table()",
        "Log Predictions Confusion Matrix Sample Images",
        "วิเคราะห์ Error Pattern ดู Predictions ผิด"),
    WBFeature("Reports",
        "W&B UI",
        "สร้างรายงาน แชร์กับทีม Document Findings",
        "Communication ทีมเห็นผลลัพธ์เดียวกัน"),
]

print("=== W&B Features ===")
for f in features:
    print(f"  [{f.feature}] API: {f.api}")
    print(f"    Use: {f.use_case}")
    print(f"    Benefit: {f.benefit}")

ELK Stack Setup

# === ELK Configuration ===

# docker-compose.yml for ELK
# version: "3.8"
# services:
#   elasticsearch:
#     image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0
#     environment:
#       - discovery.type=single-node
#       - xpack.security.enabled=true
#       - ELASTIC_PASSWORD=changeme
#     ports: ["9200:9200"]
#     volumes: ["es-data:/usr/share/elasticsearch/data"]
#   
#   kibana:
#     image: docker.elastic.co/kibana/kibana:8.12.0
#     ports: ["5601:5601"]
#     environment:
#       - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
#       - ELASTICSEARCH_USERNAME=kibana_system
#       - ELASTICSEARCH_PASSWORD=changeme
#   
#   filebeat:
#     image: docker.elastic.co/beats/filebeat:8.12.0
#     volumes:
#       - ./filebeat.yml:/usr/share/filebeat/filebeat.yml
#       - /var/log/training:/var/log/training
#     depends_on: [elasticsearch]

# filebeat.yml for ML training logs
# filebeat.inputs:
#   - type: log
#     paths: ["/var/log/training/*.log"]
#     json.keys_under_root: true
#     json.add_error_key: true
#     fields:
#       log_type: ml_training
#   - type: log
#     paths: ["/var/log/inference/*.log"]
#     fields:
#       log_type: ml_inference
# output.elasticsearch:
#   hosts: ["elasticsearch:9200"]
#   username: "elastic"
#   password: "changeme"

@dataclass
class ELKComponent:
    component: str
    role: str
    config: str
    ml_use: str

components = [
    ELKComponent("Elasticsearch",
        "Search Engine + Storage",
        "Single node dev, 3+ nodes production, ILM policy",
        "เก็บ Training Log, Inference Log, System Metrics"),
    ELKComponent("Kibana",
        "Visualization + Dashboard",
        "Port 5601, connect to Elasticsearch",
        "Dashboard: GPU Usage, Training Status, Error Rate"),
    ELKComponent("Filebeat",
        "Log Shipper",
        "ติดตั้งบน Training Server, อ่าน Log File",
        "ส่ง Training Log, GPU Log ไป Elasticsearch"),
    ELKComponent("Logstash",
        "Log Processing Pipeline",
        "Parse JSON Log, Enrich with W&B metadata",
        "Transform Log, เพิ่ม Run ID, Project Name"),
    ELKComponent("ElastAlert",
        "Alerting",
        "Rule-based alerts on Elasticsearch queries",
        "Alert เมื่อ Training Failed, OOM, GPU Error"),
]

print("=== ELK Components ===")
for c in components:
    print(f"  [{c.component}] {c.role}")
    print(f"    Config: {c.config}")
    print(f"    ML Use: {c.ml_use}")

Integration Pipeline

# === W&B + ELK Integration ===

# Python logging to ELK
# import logging
# from pythonjsonlogger import jsonlogger
#
# logger = logging.getLogger("ml_training")
# handler = logging.FileHandler("/var/log/training/train.log")
# formatter = jsonlogger.JsonFormatter(
#     "%(asctime)s %(name)s %(levelname)s %(message)s"
# )
# handler.setFormatter(formatter)
# logger.addHandler(handler)
#
# # Log training events (picked up by Filebeat → Elasticsearch)
# logger.info("Training started", extra={
#     "wandb_run_id": run.id,
#     "wandb_project": "image-classifier",
#     "gpu_name": torch.cuda.get_device_name(),
#     "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory,
# })

@dataclass
class IntegrationPoint:
    source: str
    destination: str
    data: str
    method: str
    frequency: str

integrations = [
    IntegrationPoint("Training Script",
        "W&B", "Metrics, Hyperparams, Artifacts",
        "wandb.log() SDK", "Every epoch/step"),
    IntegrationPoint("Training Script",
        "ELK (via Filebeat)", "Training logs, errors, events",
        "Python JSON logging → File → Filebeat",
        "Every log event"),
    IntegrationPoint("GPU Server",
        "ELK (via Metricbeat)", "GPU util, memory, temp, power",
        "Metricbeat nvidia module",
        "Every 10 seconds"),
    IntegrationPoint("W&B API",
        "Elasticsearch", "Run metadata, final metrics",
        "Python script: wandb API → ES bulk index",
        "After each run completes"),
    IntegrationPoint("Elasticsearch",
        "Kibana Dashboard", "Aggregated views, alerts",
        "Kibana saved objects, ElastAlert rules",
        "Real-time"),
]

print("=== Integration Points ===")
for i in integrations:
    print(f"  [{i.source}] → [{i.destination}]")
    print(f"    Data: {i.data}")
    print(f"    Method: {i.method}")
    print(f"    Freq: {i.frequency}")

เคล็ดลับ

Weights & Biases คืออะไร

MLOps Platform Experiment Track Metrics Hyperparameters Artifacts Sweeps Tables Reports Model Registry Python SDK PyTorch TensorFlow

ELK Stack คืออะไร

Elasticsearch Logstash Kibana Beats Filebeat Log Management Search Dashboard Alert ILM Open Source Application System Infrastructure

รวมกันอย่างไร

W&B Track Metrics ELK System Log GPU Filebeat Python Logging Elasticsearch Kibana Dashboard Correlation Training Error Alert Monitoring

Production Best Practices มีอะไร

Track ทุก Run Tag Project Team Artifacts Model Version Sweeps Log 30 วัน ILM Dashboard GPU Alert Training Failed Filebeat TLS Auth

สรุป

Weights Biases W&B ELK Stack Elasticsearch Kibana Experiment Tracking Log Management MLOps GPU Monitoring Alert Production Integration

📖 บทความที่เกี่ยวข้อง

Weights Biases DNS Managementอ่านบทความ → Weights Biases DevSecOps Integrationอ่านบทความ → Semgrep SAST Log Management ELKอ่านบทความ → ModSecurity WAF Log Management ELKอ่านบทความ → Tekton Pipeline Log Management ELKอ่านบทความ →

📚 ดูบทความทั้งหมด →