W&B Backup & Recovery
Weights & Biases (W&B) Backup & Recovery สำหรับ MLOps: Experiment Tracking, Artifacts, Model Registry, API Export, S3, Git, DVC และ Automation
| Data Type | Location | Backup Method | Frequency |
|---|---|---|---|
| Run Metrics/Config | W&B Cloud | API Export → JSON/Parquet | Daily |
| Model Checkpoints | W&B Artifacts | Download → S3/GCS | Per Training Run |
| Datasets | W&B Artifacts / DVC | DVC Push → S3/GCS | Per Version Change |
| Training Scripts | Git Repository | Git Push (Already Backed Up) | Per Commit |
| Sweep Results | W&B Cloud | API Export → JSON | Per Sweep Completion |
| W&B Server DB | Self-hosted MySQL/PG | pg_dump / mysqldump → S3 | Daily |
Backup Script
# === W&B Backup Script ===
# pip install wandb boto3 pandas
import wandb
import json
import os
from dataclasses import dataclass, asdict
from datetime import datetime
# wandb.login(key="YOUR_API_KEY")
# api = wandb.Api()
@dataclass
class BackupConfig:
    """Settings describing one W&B project backup job.

    ``entity`` and ``project`` are combined into W&B paths, and
    ``backup_dir`` is the local root under which backup files are written.
    """

    entity: str  # W&B entity that owns the project (first path segment)
    project: str  # W&B project name (second path segment)
    backup_dir: str  # local directory that receives the backup files
    s3_bucket: str  # S3 bucket name; presumably used by a later upload step
    s3_prefix: str  # S3 key prefix; presumably used by a later upload step
    max_runs: int  # intended cap on exported runs; verify that it is enforced
# Example settings: the names below are placeholders, so replace them with
# your own W&B entity/project and S3 bucket before running the script.
config = BackupConfig(
    entity="my-team",
    project="my-project",
    backup_dir="/tmp/wandb_backup",
    s3_bucket="my-backup-bucket",
    s3_prefix="wandb-backup",
    max_runs=1000,
)
def backup_runs(api, config):
    """Export run metadata from a W&B project to a timestamped JSON file.

    At most ``config.max_runs`` runs are exported, in the order returned by
    the API (newest first, because of ``order="-created_at"``).

    Args:
        api: Object exposing ``runs(path, per_page=..., order=...)`` that
            returns an iterable of run objects (e.g. ``wandb.Api()``).
        config: Settings object providing ``entity``, ``project``,
            ``backup_dir`` and, optionally, ``max_runs``. A ``max_runs`` of
            ``None`` (or a missing attribute) means "no limit".

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If ``max_runs`` is negative (raised by ``islice``).
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    runs = api.runs(
        f"{config.entity}/{config.project}",
        per_page=50,
        order="-created_at",
    )

    # ``max_runs`` used to be ignored, so every run of the project was
    # exported. ``islice`` stops pulling from the paginated iterator as soon
    # as the cap is reached.
    limit = getattr(config, "max_runs", None)
    backup_data = [
        {
            "id": run.id,
            "name": run.name,
            "state": run.state,
            "config": dict(run.config),
            "summary": dict(run.summary._json_dict),
            "created_at": run.created_at,
            "tags": run.tags,
            "notes": run.notes,
        }
        for run in islice(runs, limit)
    ]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(config.backup_dir, f"runs_backup_{timestamp}.json")
    os.makedirs(config.backup_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        # default=str stringifies values json cannot encode natively
        # (e.g. datetime objects inside config or summary).
        json.dump(backup_data, f, indent=2, default=str)

    print(f"Backed up {len(backup_data)} runs to {filepath}")
    return filepath
def backup_artifacts(api, config, artifact_name, version="latest"):
    """Download one artifact version from W&B into the local backup directory.

    Args:
        api: Object exposing ``artifact(path)`` whose result has a
            ``download(root=...)`` method (e.g. ``wandb.Api()``).
        config: Settings object providing ``entity``, ``project`` and
            ``backup_dir``.
        artifact_name: Artifact name inside the project.
        version: Version or alias appended after the colon in the artifact
            path. Defaults to ``"latest"``.

    Returns:
        Local directory the artifact was downloaded into, so the caller can
        hand it to a later step (mirrors ``backup_runs`` returning its path).
    """
    artifact = api.artifact(
        f"{config.entity}/{config.project}/{artifact_name}:{version}"
    )
    # NOTE: the directory is keyed by artifact name only, so different
    # versions of the same artifact are downloaded into the same folder.
    download_dir = os.path.join(config.backup_dir, "artifacts", artifact_name)
    artifact.download(root=download_dir)
    print(f"Downloaded {artifact_name}:{version} to {download_dir}")
    return download_dir
# Dump the active backup settings, one "name: value" line per field.
print("=== W&B Backup Config ===")
for field_name, field_value in asdict(config).items():
    print(f" {field_name}: {field_value}")
Recovery Procedures
# === Recovery Procedures ===
@dataclass
class RecoveryProcedure:
    """One entry of the recovery runbook; every field is free-form text."""

    scenario: str  # short name of the failure scenario
    rto: str  # text printed after the "RTO:" label
    rpo: str  # text printed after the "RPO:" label
    steps: str  # numbered recovery steps joined into a single string
    verification: str  # how to confirm that the recovery succeeded
# Recovery runbook: one entry per failure scenario. Each ``steps`` value is
# built with implicit string-literal concatenation, so it is a single line.
procedures = [
    RecoveryProcedure(
        scenario="Training Interrupted",
        rto="5-10 นาที",
        rpo="Last Checkpoint (ทุก Epoch)",
        steps=(
            "1. ดึง Checkpoint จาก W&B Artifacts "
            "2. wandb.init(resume=True) "
            "3. Load Checkpoint ต่อ Training"
        ),
        verification="ตรวจ Loss ต่อจากจุดเดิม ไม่ Reset",
    ),
    RecoveryProcedure(
        scenario="Model Version Rollback",
        rto="1-5 นาที",
        rpo="Exact Version (W&B Registry)",
        steps=(
            "1. wandb.Api().artifact('model:v3').download() "
            "2. Load Model v3 "
            "3. Deploy Model v3 แทน v4"
        ),
        verification="ตรวจ Inference ผลลัพธ์ตรงกับ v3 Original",
    ),
    RecoveryProcedure(
        scenario="Experiment Reproduction",
        rto="30-60 นาที (Re-train)",
        rpo="Exact Config (W&B Run Config)",
        steps=(
            "1. ดึง Config จาก W&B API "
            "2. ดึง Dataset Version จาก DVC/Artifacts "
            "3. Run Training ด้วย Config + Seed เดิม"
        ),
        verification="ตรวจ Metrics ใกล้เคียง Original Run",
    ),
    RecoveryProcedure(
        scenario="W&B Server Down (Self-hosted)",
        rto="1-4 ชั่วโมง",
        rpo="Last Daily Backup",
        steps=(
            "1. Restore Database จาก pg_dump "
            "2. Restore Object Storage จาก S3 "
            "3. Restart W&B Server Pods "
            "4. Verify Data Integrity"
        ),
        verification="ตรวจ Run Count Artifact Count ตรงกับ Backup",
    ),
    RecoveryProcedure(
        scenario="Accidental Run/Artifact Delete",
        rto="10-30 นาที",
        rpo="Last Backup Timestamp",
        steps=(
            "1. ดึง Run Data จาก JSON Backup "
            "2. Re-upload Artifacts จาก S3 Backup "
            "3. สร้าง Run ใหม่ด้วย wandb.init() + Log Data"
        ),
        verification="ตรวจ Run Metrics Config ตรงกับ Backup",
    ),
]

# Print the runbook: a header line per scenario, then its steps and check.
print("=== Recovery Procedures ===")
for procedure in procedures:
    print(f"\n [{procedure.scenario}] RTO: {procedure.rto} | RPO: {procedure.rpo}")
    print(f" Steps: {procedure.steps}")
    print(f" Verify: {procedure.verification}")
Automation & Monitoring
# === Backup Automation ===
# Cron Job (crontab -e)
# 0 2 * * * /usr/bin/python3 /opt/wandb-backup/backup.py >> /var/log/wandb-backup.log 2>&1
# GitHub Actions (Weekly)
# name: W&B Backup
# on:
#   schedule:
#     - cron: '0 2 * * 0'  # Every Sunday 02:00 (UTC)
# jobs:
#   backup:
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install wandb boto3
#       - run: python backup.py
#         env:
#           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
#           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
#           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@dataclass
class RetentionPolicy:
    """One storage tier of the backup retention plan; all fields are text."""

    tier: str  # tier label shown in brackets when printed
    frequency: str  # how often backups enter this tier
    retention: str  # how long backups are kept in this tier
    storage: str  # storage class or location used for this tier
    cost: str  # approximate storage cost, as display text
# Retention plan: one entry per storage tier.
policies = [
    RetentionPolicy(
        tier="Hot (Recent)",
        frequency="Daily Backup",
        retention="30 วัน",
        storage="S3 Standard",
        cost="~$0.023/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Warm (Monthly)",
        frequency="Weekly Backup → Monthly Archive",
        retention="12 เดือน",
        storage="S3 Infrequent Access",
        cost="~$0.0125/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Cold (Annual)",
        frequency="Monthly Backup → Annual Archive",
        retention="5 ปี",
        storage="S3 Glacier",
        cost="~$0.004/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Critical Models",
        frequency="Per Release Version",
        retention="ตลอดไป",
        storage="S3 Standard + Cross-region",
        cost="~$0.046/GB/เดือน (2 regions)",
    ),
]

# Print the plan: three lines per tier.
print("=== Retention Policy ===")
for policy in policies:
    print(f" [{policy.tier}] Freq: {policy.frequency}")
    print(f" Retention: {policy.retention}")
    print(f" Storage: {policy.storage} | Cost: {policy.cost}")
เคล็ดลับ
- Checkpoint: Log Checkpoint เป็น W&B Artifact ทุก Epoch
- resume: ใช้ wandb.init(resume=True) ต่อ Training ได้
- API: ใช้ wandb API Export ข้อมูลทุกวัน
- DVC: ใช้ DVC สำหรับ Dataset Version Control
- Test: ทดสอบ Recovery ทุก Quarter อย่ารอจนเกิดปัญหา
Weights & Biases คืออะไร
Weights & Biases (W&B) คือ MLOps Platform สำหรับ Experiment Tracking, Model Registry, Artifacts, Sweeps, Dashboard และ Reports ใช้งานผ่าน wandb API ด้วย Python มีแผนให้เลือกทั้ง Free (Individual), Team และ Enterprise
Backup Strategy ทำอย่างไร
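Export ข้อมูล Run ผ่าน API เป็น JSON/Parquet, Download Artifacts ไปเก็บที่ S3/GCS, ใช้ Git และ DVC สำหรับ Code และ Dataset, ใช้ pg_dump สำหรับ Self-hosted Server แล้วตั้ง Cron Job ให้ Backup อัตโนมัติทุกวัน พร้อมกำหนด Retention Policy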
Recovery ทำอย่างไร
ใช้ resume=True ร่วมกับ Checkpoint เพื่อต่อ Training, Download Artifact จาก Registry เพื่อ Rollback Model, ใช้ Config เดิมเพื่อ Reproduce Experiment, กำหนด RTO/RPO ของแต่ละกรณี และ Restore Database แล้ว Verify Integrity
Automation ทำอย่างไร
ใช้ Python กับ wandb API รันผ่าน Cron Job ทุกวัน หรือ GitHub Actions ทุกสัปดาห์, จัดเก็บตาม Retention แบบ Hot/Warm/Cold (S3 ถึง Glacier), ส่ง Alert ไปที่ Slack และทดสอบ Recovery ทุก Quarter
สรุป
Weights & Biases (W&B) Backup & Recovery สำหรับ MLOps: ใช้ API Export, Artifacts, Checkpoint, DVC และ S3 ร่วมกับ Automation ด้วย Cron และ Retention Policy เพื่อให้ Recovery ได้จริงใน Production
