Weights & Biases Backup & Recovery Strategy

W&B Backup & Recovery

Weights & Biases (W&B) · Backup & Recovery · MLOps · Experiment Tracking · Artifacts · Model Registry · API Export · S3 · Git · DVC · Automation

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง Qwik Resumability DevSecOps Integration

Data Type	Location	Backup Method	Frequency
Run Metrics/Config	W&B Cloud	API Export → JSON/Parquet	Daily
Model Checkpoints	W&B Artifacts	Download → S3/GCS	Per Training Run
Datasets	W&B Artifacts / DVC	DVC Push → S3/GCS	Per Version Change
Training Scripts	Git Repository	Git Push (Already Backed Up)	Per Commit
Sweep Results	W&B Cloud	API Export → JSON	Per Sweep Completion
W&B Server DB	Self-hosted MySQL/PG	pg_dump / mysqldump → S3	Daily

Backup Script

# === W&B Backup Script ===

# pip install wandb boto3 pandas

import wandb
import json
import os
from dataclasses import dataclass, asdict
from datetime import datetime

# wandb.login(key="YOUR_API_KEY")
# api = wandb.Api()

@dataclass
class BackupConfig:
    """Settings consumed by the W&B backup helpers in this script."""

    # Combined as "<entity>/<project>" to address the W&B project.
    entity: str
    project: str
    # Local directory that receives the JSON run export and artifact downloads.
    backup_dir: str
    # Not read by the functions visible in this file; presumably the S3
    # upload target for a later step -- TODO confirm.
    s3_bucket: str
    s3_prefix: str
    # Presumably an upper bound on the number of runs to export -- TODO confirm.
    max_runs: int

# Module-level settings shared by the backup helpers and echoed by the
# config printout. The values look like placeholders to be replaced per
# deployment -- TODO confirm.
config = BackupConfig(
    entity="my-team",
    project="my-project",
    backup_dir="/tmp/wandb_backup",
    s3_bucket="my-backup-bucket",
    s3_prefix="wandb-backup",
    max_runs=1000
)

def backup_runs(api, config):
    """Export run metadata from a W&B project to a timestamped JSON file.

    Runs are requested newest-first (``order="-created_at"``) and at most
    ``config.max_runs`` of them are exported, so the configured cap is
    honoured instead of being silently ignored.

    Args:
        api: Object exposing ``runs(path, per_page=..., order=...)``;
            presumably a ``wandb.Api()`` instance, per the commented-out
            setup near the top of this file.
        config: Settings object providing ``entity``, ``project`` and
            ``backup_dir``. ``max_runs`` is optional; ``None`` or a missing
            attribute means "no limit".

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If ``config.max_runs`` is negative (raised by ``islice``).
    """
    # Local import keeps this block self-contained; the file's import
    # header does not bring in itertools.
    from itertools import islice

    limit = getattr(config, "max_runs", None)
    runs = api.runs(f"{config.entity}/{config.project}",
                    per_page=50, order="-created_at")

    # islice stops pulling from the (possibly paginated) iterator once the
    # cap is reached; a limit of None passes every run through.
    backup_data = [
        {
            "id": run.id,
            "name": run.name,
            "state": run.state,
            "config": dict(run.config),
            "summary": dict(run.summary._json_dict),
            "created_at": run.created_at,
            "tags": run.tags,
            "notes": run.notes,
        }
        for run in islice(runs, limit)
    ]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"runs_backup_{timestamp}.json"
    os.makedirs(config.backup_dir, exist_ok=True)
    filepath = os.path.join(config.backup_dir, filename)
    # Explicit encoding so the output does not depend on the platform locale.
    with open(filepath, 'w', encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(backup_data, f, indent=2, default=str)
    print(f"Backed up {len(backup_data)} runs to {filepath}")
    return filepath

def backup_artifacts(api, config, artifact_name, version="latest"):
    """Download one artifact version into the local backup directory.

    Args:
        api: Object exposing ``artifact(ref)`` whose result has a
            ``download(root=...)`` method; presumably a ``wandb.Api()``
            instance, per the commented-out setup near the top of this file.
        config: Settings object providing ``entity``, ``project`` and
            ``backup_dir``.
        artifact_name: Name of the artifact inside the project.
        version: Version or alias appended after the colon in the artifact
            reference. Defaults to ``"latest"``.

    Returns:
        The directory the artifact was downloaded into, mirroring how
        ``backup_runs`` returns the path it wrote.
    """
    artifact_ref = f"{config.entity}/{config.project}/{artifact_name}:{version}"
    # NOTE: the target directory depends only on the artifact name, so
    # different versions of the same artifact land in the same directory.
    download_dir = os.path.join(config.backup_dir, "artifacts", artifact_name)
    artifact = api.artifact(artifact_ref)
    artifact.download(root=download_dir)
    print(f"Downloaded {artifact_name}:{version} to {download_dir}")
    return download_dir

# Echo the active backup settings, one "key: value" line per field.
print("=== W&B Backup Config ===")
for field_name, field_value in asdict(config).items():
    print(f"  {field_name}: {field_value}")

Recovery Procedures

# === Recovery Procedures ===

@dataclass
class RecoveryProcedure:
    """One recovery playbook entry: a failure scenario and how to recover."""

    # Short name of the failure scenario; printed in brackets as the heading.
    scenario: str
    # Free-text values printed under the "RTO" and "RPO" labels.
    rto: str
    rpo: str
    # Numbered recovery steps, stored as a single string.
    steps: str
    # Check to perform afterwards; printed under the "Verify" label.
    verification: str

# Recovery playbook: one entry per failure scenario, spelled out with
# keyword arguments so each value is labelled at the call site.
procedures = [
    RecoveryProcedure(
        scenario="Training Interrupted",
        rto="5-10 นาที",
        rpo="Last Checkpoint (ทุก Epoch)",
        steps=(
            "1. ดึง Checkpoint จาก W&B Artifacts "
            "2. wandb.init(resume=True) "
            "3. Load Checkpoint ต่อ Training"
        ),
        verification="ตรวจ Loss ต่อจากจุดเดิม ไม่ Reset",
    ),
    RecoveryProcedure(
        scenario="Model Version Rollback",
        rto="1-5 นาที",
        rpo="Exact Version (W&B Registry)",
        steps=(
            "1. wandb.Api().artifact('model:v3').download() "
            "2. Load Model v3 "
            "3. Deploy Model v3 แทน v4"
        ),
        verification="ตรวจ Inference ผลลัพธ์ตรงกับ v3 Original",
    ),
    RecoveryProcedure(
        scenario="Experiment Reproduction",
        rto="30-60 นาที (Re-train)",
        rpo="Exact Config (W&B Run Config)",
        steps=(
            "1. ดึง Config จาก W&B API "
            "2. ดึง Dataset Version จาก DVC/Artifacts "
            "3. Run Training ด้วย Config + Seed เดิม"
        ),
        verification="ตรวจ Metrics ใกล้เคียง Original Run",
    ),
    RecoveryProcedure(
        scenario="W&B Server Down (Self-hosted)",
        rto="1-4 ชั่วโมง",
        rpo="Last Daily Backup",
        steps=(
            "1. Restore Database จาก pg_dump "
            "2. Restore Object Storage จาก S3 "
            "3. Restart W&B Server Pods "
            "4. Verify Data Integrity"
        ),
        verification="ตรวจ Run Count Artifact Count ตรงกับ Backup",
    ),
    RecoveryProcedure(
        scenario="Accidental Run/Artifact Delete",
        rto="10-30 นาที",
        rpo="Last Backup Timestamp",
        steps=(
            "1. ดึง Run Data จาก JSON Backup "
            "2. Re-upload Artifacts จาก S3 Backup "
            "3. สร้าง Run ใหม่ด้วย wandb.init() + Log Data"
        ),
        verification="ตรวจ Run Metrics Config ตรงกับ Backup",
    ),
]

# Print each playbook entry as a three-line summary, separated from the
# previous entry by a blank line.
print("=== Recovery Procedures ===")
for procedure in procedures:
    heading = f"\n  [{procedure.scenario}] RTO: {procedure.rto} | RPO: {procedure.rpo}"
    steps_line = f"    Steps: {procedure.steps}"
    verify_line = f"    Verify: {procedure.verification}"
    print(heading, steps_line, verify_line, sep="\n")

Automation & Monitoring

# === Backup Automation ===

# Cron Job (crontab -e)
# 0 2 * * * /usr/bin/python3 /opt/wandb-backup/backup.py >> /var/log/wandb-backup.log 2>&1

# GitHub Actions (Weekly)
# name: W&B Backup
# on:
#   schedule:
#     - cron: '0 2 * * 0'  # Every Sunday 2 AM
# jobs:
#   backup:
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install wandb boto3
#       - run: python backup.py
#         env:
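#           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
#           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
#           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}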

@dataclass
class RetentionPolicy:
    """One storage tier of the backup retention schedule."""

    # Tier name; printed in brackets as the heading.
    tier: str
    # How often backups are taken or rolled into this tier.
    frequency: str
    # How long backups in this tier are kept.
    retention: str
    # Storage class used for this tier.
    storage: str
    # Free-text cost note printed under the "Cost" label.
    cost: str

# Retention schedule, one entry per storage tier, spelled out with
# keyword arguments so each value is labelled at the call site.
policies = [
    RetentionPolicy(
        tier="Hot (Recent)",
        frequency="Daily Backup",
        retention="30 วัน",
        storage="S3 Standard",
        cost="~$0.023/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Warm (Monthly)",
        frequency="Weekly Backup → Monthly Archive",
        retention="12 เดือน",
        storage="S3 Infrequent Access",
        cost="~$0.0125/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Cold (Annual)",
        frequency="Monthly Backup → Annual Archive",
        retention="5 ปี",
        storage="S3 Glacier",
        cost="~$0.004/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Critical Models",
        frequency="Per Release Version",
        retention="ตลอดไป",
        storage="S3 Standard + Cross-region",
        cost="~$0.046/GB/เดือน (2 regions)",
    ),
]

# Print each retention tier as a three-line summary.
print("=== Retention Policy ===")
for policy in policies:
    heading = f"  [{policy.tier}] Freq: {policy.frequency}"
    retention_line = f"    Retention: {policy.retention}"
    storage_line = f"    Storage: {policy.storage} | Cost: {policy.cost}"
    print(heading, retention_line, storage_line, sep="\n")