Multus CNI Backup Recovery Strategy

Multus CNI Backup

Multus CNI Backup Recovery NetworkAttachmentDefinition IPAM SR-IOV Macvlan Disaster Recovery Configuration Export Git Production Operations

เนื้อหาเกี่ยวข้อง — LXC vs Docker เลือก Container Technology อะไรดี

Backup Item	Location	Method	Frequency	Priority
NAD Resources	Kubernetes API	kubectl get -o yaml / Velero	Every change + daily	Critical
CNI Config Files	/etc/cni/net.d/	Ansible collect / rsync	Daily	High
IPAM Database	Whereabouts CRD	kubectl get -o yaml	Hourly	High
SR-IOV Config	ConfigMap + Node	kubectl + Ansible	Every change	High
Helm Values	Git repository	Git commit	Every change	Medium
Pod Annotations	Pod specs	kubectl get -o yaml	With deployment	Medium

Backup Procedures

# === Multus CNI Backup Script ===

# Export all NetworkAttachmentDefinitions
# kubectl get network-attachment-definitions -A -o yaml > backup/nad-all.yaml

# Export per namespace
# for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}'); do
#   kubectl get nad -n $ns -o yaml > backup/nad-$ns.yaml 2>/dev/null
# done

# Backup CNI config from all nodes
# for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
#   ssh $node "cat /etc/cni/net.d/00-multus.conf" > backup/multus-$node.conf
#   ssh $node "ls /etc/cni/net.d/" > backup/cni-files-$node.txt
# done

# Backup Whereabouts IPAM
# kubectl get ippools.whereabouts.cni.cncf.io -A -o yaml > backup/whereabouts-pools.yaml
# kubectl get overlappingrangeipreservations.whereabouts.cni.cncf.io -A -o yaml > backup/whereabouts-reservations.yaml

# Backup SR-IOV config
# kubectl get sriovnetworknodepolicies.sriovnetwork.openshift.io -A -o yaml > backup/sriov-policies.yaml
# kubectl get sriovnetworks.sriovnetwork.openshift.io -A -o yaml > backup/sriov-networks.yaml

# Velero backup with Multus resources
# velero backup create multus-backup \
#   --include-resources networkattachmentdefinitions,ippools,sriovnetworks \
#   --include-namespaces default,production,telco

from dataclasses import dataclass

@dataclass
class BackupJob:
    """One row of the backup schedule: what is backed up, how, when and where.

    A plain record of display strings. In the visible script the fields are
    only printed; nothing here executes ``command``.
    """

    name: str       # short label shown in the report, e.g. "NAD Export"
    command: str    # command (or tool description) that produces the backup
    schedule: str   # human-readable cadence, e.g. "Daily 02:00" or "On change"
    retention: str  # how long copies are kept, e.g. "30 days" or "Forever"
    storage: str    # destination(s) of the backup, e.g. "Git + S3"

# Backup schedule rendered by the report loop below. The command strings are
# operator documentation: this script prints them and never runs them.
jobs = [
    BackupJob("NAD Export", "kubectl get nad -A -o yaml", "Daily 02:00", "30 days", "Git + S3"),
    # Fully qualified CRD name, matching the export in the backup script
    # earlier in this file: a bare "ippools" is ambiguous on clusters where
    # another CNI (e.g. Calico) also ships an IPPool CRD.
    BackupJob("IPAM State", "kubectl get ippools.whereabouts.cni.cncf.io -A -o yaml",
              "Hourly", "7 days", "S3"),
    BackupJob("CNI Config", "Ansible collect /etc/cni/net.d/", "Daily 03:00", "30 days", "Git + S3"),
    # -A added: the policies are namespaced, so without it only the current
    # namespace would be exported and the backup could silently be empty.
    BackupJob("SR-IOV Policy",
              "kubectl get sriovnetworknodepolicies.sriovnetwork.openshift.io -A -o yaml",
              "On change", "30 days", "Git"),
    BackupJob("Velero Full", "velero backup create", "Daily 04:00", "14 days", "S3"),
    BackupJob("Helm Values", "git commit values.yaml", "On change", "Forever", "Git"),
]

# Three-line summary per job: name + schedule, then command, then
# retention and storage.
print("=== Backup Jobs ===")
for j in jobs:
    print(f"  [{j.name}] Schedule: {j.schedule}")
    print(f"    Command: {j.command}")
    print(f"    Retention: {j.retention} | Storage: {j.storage}")

Recovery Procedures

# === Disaster Recovery Runbook ===

@dataclass
class RecoveryStep:
    """One step of the disaster-recovery runbook.

    A plain record of display strings. In the visible script the fields are
    only printed; the commands are not executed from here.
    """

    order: int     # position of the step in the runbook (1, 2, 3, ...)
    step: str      # short description of what the step achieves
    command: str   # command the operator runs to perform the step
    verify: str    # command or manual check that confirms the step worked
    rollback: str  # command or action that reverts the step

# Ordered disaster-recovery runbook. Each entry pairs the action with a
# verification and a rollback; the strings are printed for the operator and
# are never executed by this script.
steps = [
    # NOTE(review): `helm install` normally takes NAME and CHART; no chart
    # reference is given here - confirm the intended chart before use.
    RecoveryStep(1, "Install Multus CNI",
        "helm install multus -f values.yaml",
        "kubectl get pods -n kube-system | grep multus",
        "helm uninstall multus"),
    RecoveryStep(2, "Restore NAD resources",
        "kubectl apply -f backup/nad-all.yaml",
        "kubectl get nad -A",
        "kubectl delete -f backup/nad-all.yaml"),
    # Rollback removes exactly what the restore applied, like steps 2 and 4.
    # The previous "kubectl delete ippools -A" is rejected by kubectl (no
    # name, selector or --all) and would have targeted every pool anyway.
    # The resource name is fully qualified because a bare "ippools" is
    # ambiguous when another CNI also defines an IPPool CRD.
    RecoveryStep(3, "Restore IPAM state",
        "kubectl apply -f backup/whereabouts-pools.yaml",
        "kubectl get ippools.whereabouts.cni.cncf.io -A",
        "kubectl delete -f backup/whereabouts-pools.yaml"),
    RecoveryStep(4, "Restore SR-IOV config",
        "kubectl apply -f backup/sriov-policies.yaml",
        "kubectl get sriovnetworknodepolicies -A",
        "kubectl delete -f backup/sriov-policies.yaml"),
    RecoveryStep(5, "Verify CNI config on nodes",
        "ansible all -m shell -a 'cat /etc/cni/net.d/00-multus.conf'",
        "Compare with backup config",
        "Ansible push backup config"),
    RecoveryStep(6, "Restart affected pods",
        "kubectl rollout restart deployment -n production",
        "kubectl get pods -n production -o wide (check IPs)",
        "kubectl rollout undo deployment -n production"),
    RecoveryStep(7, "Network connectivity test",
        "kubectl exec test-pod -- ping -c 3 10.0.1.1",
        "All pings succeed, no packet loss",
        "Check NAD config, IPAM, routes"),
]

# Four lines per step, in list order: title, command, verification, rollback.
print("=== Recovery Runbook ===")
for s in steps:
    print(f"  Step {s.order}: {s.step}")
    print(f"    Command: {s.command}")
    print(f"    Verify: {s.verify}")
    print(f"    Rollback: {s.rollback}")

Automation and Testing

# === Automated DR Testing ===

# CronJob for automated backup verification
# apiVersion: batch/v1
# kind: CronJob
# metadata:
#   name: multus-backup-verify
# spec:
#   schedule: "0 6 * * 1"  # Every Monday 6:00
#   jobTemplate:
#     spec:
#       template:
#         spec:
#           restartPolicy: OnFailure  # Job pods must use OnFailure or Never
#           containers:
#             - name: verify
#               image: bitnami/kubectl
#               command:
#                 - /bin/bash
#                 - -c
#                 - |
#                   # Count NADs
#                   CURRENT=$(kubectl get nad -A --no-headers | wc -l)
#                   BACKUP=$(grep "kind: NetworkAttachmentDefinition" /backup/nad-all.yaml | wc -l)
#                   if [ "$CURRENT" != "$BACKUP" ]; then
#                     echo "WARNING: NAD count mismatch current=$CURRENT backup=$BACKUP"
#                     # Send alert
#                   fi
#                   # Verify IPAM
#                   kubectl get ippools -A --no-headers | wc -l

@dataclass
class DRTest:
    """One recurring disaster-recovery test and its most recent outcome.

    A plain record of display strings; in the visible script the fields are
    only printed.
    """

    test: str           # name of the test, e.g. "Full DR drill"
    frequency: str      # how often it runs, e.g. "Weekly", "Quarterly"
    method: str         # short description of how the test is carried out
    pass_criteria: str  # condition that must hold for the test to pass
    last_result: str    # outcome of the latest run, e.g. "PASS"

# Recurring DR tests, listed from the most frequent checks to the full drill.
tests = [
    DRTest(
        test="NAD backup completeness",
        frequency="Weekly",
        method="Compare NAD count backup vs live",
        pass_criteria="Count matches, all NADs in backup",
        last_result="PASS",
    ),
    DRTest(
        test="IPAM state consistency",
        frequency="Daily",
        method="Compare IP assignments backup vs live",
        pass_criteria="No orphaned IPs, no conflicts",
        last_result="PASS",
    ),
    DRTest(
        test="Restore to test cluster",
        frequency="Monthly",
        method="Apply backup to test cluster, deploy pods",
        pass_criteria="Pods get correct IPs, traffic flows",
        last_result="PASS",
    ),
    DRTest(
        test="SR-IOV recovery",
        frequency="Quarterly",
        method="Restore SR-IOV config, verify VF binding",
        pass_criteria="VFs allocated, pod network works",
        last_result="PASS",
    ),
    DRTest(
        test="Full DR drill",
        frequency="Quarterly",
        method="Simulate cluster loss, full recovery",
        pass_criteria="All networks restored < 2 hours",
        last_result="PASS",
    ),
]

# Each test is rendered as three lines; a single print with a newline
# separator emits them together.
print("DR Test Schedule:")
for t in tests:
    print(
        f"  [{t.test}] Frequency: {t.frequency}",
        f"    Method: {t.method}",
        f"    Pass: {t.pass_criteria} | Last: {t.last_result}",
        sep="\n",
    )

# Recovery-time / recovery-point objectives per component, as display text.
rto_rpo = {
    "NAD Resources": "RTO: 15 min | RPO: Last change (Git)",
    "IPAM State": "RTO: 30 min | RPO: 1 hour",
    "SR-IOV Config": "RTO: 1 hour | RPO: Last change (Git)",
    "Full Network Recovery": "RTO: 2 hours | RPO: 1 hour",
    "Pod Network Connectivity": "RTO: 30 min (after infra) | RPO: N/A",
}

# Two leading newlines separate this section from the test schedule above.
print("\n\nRTO/RPO Targets:")
for k, v in rto_rpo.items():
    print(f"  [{k}]: {v}")

เคล็ดลับ

GitOps: เก็บ NAD ทั้งหมดใน Git Repository ทุกการเปลี่ยนแปลงต้อง Commit
Velero: ใช้ Velero Backup Kubernetes Resources รวม Multus CRDs
IPAM: Backup IPAM State บ่อยกว่า NAD เพราะเปลี่ยนตลอด
Test: ทดสอบ Recovery บน Test Cluster ทุกเดือน
Label: ใส่ Label บน NAD สำหรับ Filter Backup ตาม Environment

Multus CNI คืออะไร

Meta CNI Plugin Multi-homed Pod หลาย Network Interface Calico Flannel SR-IOV Macvlan NAD Telco NFV Storage Management Network Traffic

แนะนำเพิ่มเติม — XM Signal

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: indexof javascript คือ

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Nuclei Scanner Edge Deployment