ai
Weights & Biases Backup & Recovery Strategy
W&B Backup & Recovery

Weights Biases W&B Backup Recovery MLOps Experiment Tracking Artifacts Model Registry API Export S3 Git DVC Automation
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง Qwik Resumability DevSecOps Integration
| Data Type | Location | Backup Method | Frequency |
|---|---|---|---|
| Run Metrics/Config | W&B Cloud | API Export → JSON/Parquet | Daily |
| Model Checkpoints | W&B Artifacts | Download → S3/GCS | Per Training Run |
| Datasets | W&B Artifacts / DVC | DVC Push → S3/GCS | Per Version Change |
| Training Scripts | Git Repository | Git Push (Already Backed Up) | Per Commit |
| Sweep Results | W&B Cloud | API Export → JSON | Per Sweep Completion |
| W&B Server DB | Self-hosted MySQL/PG | pg_dump / mysqldump → S3 | Daily |
Backup Script
# === W&B Backup Script ===
# pip install wandb boto3 pandas
import wandb
import json
import os
from dataclasses import dataclass, asdict
from datetime import datetime
# wandb.login(key="YOUR_API_KEY")
# api = wandb.Api()
@dataclass
class BackupConfig:
    """Settings describing one W&B project backup job.

    The backup helpers in this file read ``entity`` and ``project`` to build
    the W&B path ``"<entity>/<project>"`` and write their output under
    ``backup_dir``.
    """

    entity: str  # W&B entity that owns the project (first half of the path)
    project: str  # W&B project name (second half of the path)
    backup_dir: str  # local directory that receives JSON exports and artifacts
    s3_bucket: str  # NOTE(review): presumably the upload target bucket — not read by the visible code
    s3_prefix: str  # NOTE(review): presumably the key prefix inside s3_bucket — not read by the visible code
    max_runs: int  # intended upper bound on the number of runs exported per backup
# Example settings; every value below is a placeholder to be replaced.
config = BackupConfig(
    **{
        "entity": "my-team",
        "project": "my-project",
        "backup_dir": "/tmp/wandb_backup",
        "s3_bucket": "my-backup-bucket",
        "s3_prefix": "wandb-backup",
        "max_runs": 1000,
    }
)
def backup_runs(api, config):
    """Export run metadata from a W&B project to a timestamped JSON file.

    Args:
        api: Object exposing ``runs(path, per_page=..., order=...)`` that
            yields run objects (e.g. ``wandb.Api()``).
        config: Object with ``entity``, ``project`` and ``backup_dir``
            attributes. If it also has a ``max_runs`` attribute that is not
            ``None``, at most that many runs (newest first) are exported.

    Returns:
        Path of the JSON file that was written inside ``config.backup_dir``.
    """
    from itertools import islice

    runs = api.runs(
        f"{config.entity}/{config.project}",
        per_page=50,
        order="-created_at",
    )

    # Honour the configured cap; without it the export walks every run in
    # the project regardless of ``max_runs``.
    limit = getattr(config, "max_runs", None)
    if limit is not None:
        runs = islice(runs, max(0, limit))

    backup_data = [
        {
            "id": run.id,
            "name": run.name,
            "state": run.state,
            "config": dict(run.config),
            # Reads the summary object's private ``_json_dict`` attribute,
            # exactly as the original snippet did.
            "summary": dict(run.summary._json_dict),
            "created_at": run.created_at,
            "tags": run.tags,
            "notes": run.notes,
        }
        for run in runs
    ]

    os.makedirs(config.backup_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(config.backup_dir, f"runs_backup_{timestamp}.json")

    # default=str stringifies any value json cannot encode natively.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(backup_data, f, indent=2, default=str)

    print(f"Backed up {len(backup_data)} runs to {filepath}")
    return filepath
def backup_artifacts(api, config, artifact_name, version="latest"):
    """Download one artifact version from W&B into the local backup directory.

    Args:
        api: Object exposing ``artifact(name)`` whose result has a
            ``download(root=...)`` method (e.g. ``wandb.Api()``).
        config: Object with ``entity``, ``project`` and ``backup_dir``
            attributes.
        artifact_name: Artifact name, without the version suffix.
        version: Version or alias appended after the colon; defaults to
            ``"latest"``.

    Returns:
        The directory the artifact was downloaded into,
        ``<backup_dir>/artifacts/<artifact_name>``.
    """
    artifact = api.artifact(
        f"{config.entity}/{config.project}/{artifact_name}:{version}"
    )
    download_dir = os.path.join(config.backup_dir, "artifacts", artifact_name)
    artifact.download(root=download_dir)
    print(f"Downloaded {artifact_name}:{version} to {download_dir}")
    # Hand the location back so callers can upload or verify it, mirroring
    # backup_runs(), which returns the path it wrote.
    return download_dir
# Print every field of the backup configuration, one "name: value" per line.
print("=== W&B Backup Config ===")
for k, v in asdict(config).items():
    print(f" {k}: {v}")
Recovery Procedures

# === Recovery Procedures ===
@dataclass
class RecoveryProcedure:
    """One row of the recovery runbook: a failure scenario and its response.

    All fields are free-form display strings; nothing in this file parses
    them.
    """

    scenario: str  # short name of the failure scenario
    rto: str  # recovery time objective, as human-readable text
    rpo: str  # recovery point objective, as human-readable text
    steps: str  # numbered recovery steps joined into a single string
    verification: str  # how to confirm the recovery succeeded
# Recovery runbook entries, one per failure scenario, in display order.
procedures = [
    RecoveryProcedure(
        scenario="Training Interrupted",
        rto="5-10 นาที",
        rpo="Last Checkpoint (ทุก Epoch)",
        steps=(
            "1. ดึง Checkpoint จาก W&B Artifacts "
            "2. wandb.init(resume=True) "
            "3. Load Checkpoint ต่อ Training"
        ),
        verification="ตรวจ Loss ต่อจากจุดเดิม ไม่ Reset",
    ),
    RecoveryProcedure(
        scenario="Model Version Rollback",
        rto="1-5 นาที",
        rpo="Exact Version (W&B Registry)",
        steps=(
            "1. wandb.Api().artifact('model:v3').download() "
            "2. Load Model v3 "
            "3. Deploy Model v3 แทน v4"
        ),
        verification="ตรวจ Inference ผลลัพธ์ตรงกับ v3 Original",
    ),
    RecoveryProcedure(
        scenario="Experiment Reproduction",
        rto="30-60 นาที (Re-train)",
        rpo="Exact Config (W&B Run Config)",
        steps=(
            "1. ดึง Config จาก W&B API "
            "2. ดึง Dataset Version จาก DVC/Artifacts "
            "3. Run Training ด้วย Config + Seed เดิม"
        ),
        verification="ตรวจ Metrics ใกล้เคียง Original Run",
    ),
    RecoveryProcedure(
        scenario="W&B Server Down (Self-hosted)",
        rto="1-4 ชั่วโมง",
        rpo="Last Daily Backup",
        steps=(
            "1. Restore Database จาก pg_dump "
            "2. Restore Object Storage จาก S3 "
            "3. Restart W&B Server Pods "
            "4. Verify Data Integrity"
        ),
        verification="ตรวจ Run Count Artifact Count ตรงกับ Backup",
    ),
    RecoveryProcedure(
        scenario="Accidental Run/Artifact Delete",
        rto="10-30 นาที",
        rpo="Last Backup Timestamp",
        steps=(
            "1. ดึง Run Data จาก JSON Backup "
            "2. Re-upload Artifacts จาก S3 Backup "
            "3. สร้าง Run ใหม่ด้วย wandb.init() + Log Data"
        ),
        verification="ตรวจ Run Metrics Config ตรงกับ Backup",
    ),
]
# Print each recovery procedure as a three-line summary.
print("=== Recovery Procedures ===")
for p in procedures:
    print(f"\n [{p.scenario}] RTO: {p.rto} | RPO: {p.rpo}")
    print(f" Steps: {p.steps}")
    print(f" Verify: {p.verification}")
Automation & Monitoring
# === Backup Automation ===
# Cron Job (crontab -e)
# 0 2 * * * /usr/bin/python3 /opt/wandb-backup/backup.py >> /var/log/wandb-backup.log 2>&1
# GitHub Actions (Weekly)
# name: W&B Backup
# on:
#   schedule:
#     - cron: '0 2 * * 0'  # Every Sunday 02:00 UTC
# jobs:
#   backup:
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install wandb boto3
#       - run: python backup.py
#         env:
#           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
#           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
#           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@dataclass
class RetentionPolicy:
    """One backup retention tier and its storage characteristics.

    All fields are free-form display strings; nothing in this file parses
    them.
    """

    tier: str  # tier label shown in the report
    frequency: str  # how often backups are taken or archived for this tier
    retention: str  # how long backups in this tier are kept
    storage: str  # storage class used for this tier
    cost: str  # approximate storage cost, as human-readable text
# Retention tiers, listed in display order.
policies = [
    RetentionPolicy(
        tier="Hot (Recent)",
        frequency="Daily Backup",
        retention="30 วัน",
        storage="S3 Standard",
        cost="~$0.023/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Warm (Monthly)",
        frequency="Weekly Backup → Monthly Archive",
        retention="12 เดือน",
        storage="S3 Infrequent Access",
        cost="~$0.0125/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Cold (Annual)",
        frequency="Monthly Backup → Annual Archive",
        retention="5 ปี",
        storage="S3 Glacier",
        cost="~$0.004/GB/เดือน",
    ),
    RetentionPolicy(
        tier="Critical Models",
        frequency="Per Release Version",
        retention="ตลอดไป",
        storage="S3 Standard + Cross-region",
        cost="~$0.046/GB/เดือน (2 regions)",
    ),
]
# Print each retention tier as a three-line summary.
print("=== Retention Policy ===")
for p in policies:
    print(f" [{p.tier}] Freq: {p.frequency}")
    print(f" Retention: {p.retention}")
    print(f" Storage: {p.storage} | Cost: {p.cost}")
เคล็ดลับ
- Checkpoint: Log Checkpoint เป็น W&B Artifact ทุก Epoch
- resume: ใช้ wandb.init(resume=True) ต่อ Training ได้
- API: ใช้ wandb API Export ข้อมูลทุกวัน
- DVC: ใช้ DVC สำหรับ Dataset Version Control
- Test: ทดสอบ Recovery ทุก Quarter อย่ารอจนเกิดปัญหา
Weights & Biases คืออะไร
MLOps Platform Experiment Tracking Model Registry Artifacts Sweeps Dashboard Reports wandb API Python Free Individual Team Enterprise
แนะนำเพิ่มเติม — iCafeForex
เนื้อหาเกี่ยวข้อง — สแกมเมอรคือ — คู่มือฉบับสมบูรณ์ 2026
เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง DuckDB Analytics Compliance Automation





