SiamCafe · Blog
Multus CNI Backup Recovery Strategy —
บทความ

Multus CNI Backup Recovery Strategy —

เผยแพร่ 28 พฤษภาคม 2569

Multus CNI Backup

Multus CNI Backup Recovery Strategy —

Multus CNI Backup Recovery NetworkAttachmentDefinition IPAM SR-IOV Macvlan Disaster Recovery Configuration Export Git Production Operations

| Backup Item | Location | Method | Frequency | Priority |
|---|---|---|---|---|
| NAD Resources | Kubernetes API | kubectl export / Velero | Every change + daily | Critical |
| CNI Config Files | /etc/cni/net.d/ | Ansible collect / rsync | Daily | High |
| IPAM Database | Whereabouts CRD | kubectl export | Hourly | High |
| SR-IOV Config | ConfigMap + Node | kubectl + Ansible | Every change | High |
| Helm Values | Git repository | Git commit | Every change | Medium |
| Pod Annotations | Pod specs | kubectl export | With deployment | Medium |

Backup Procedures

# === Multus CNI Backup Script ===

# Export all NetworkAttachmentDefinitions
# kubectl get network-attachment-definitions -A -o yaml > backup/nad-all.yaml

# Export per namespace
# for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}'); do
#   kubectl get net-attach-def -n "$ns" -o yaml > "backup/nad-$ns.yaml" 2>/dev/null
# done

# Backup CNI config from all nodes
# for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
#   ssh $node "cat /etc/cni/net.d/00-multus.conf" > backup/multus-$node.conf
#   ssh $node "ls /etc/cni/net.d/" > backup/cni-files-$node.txt
# done

# Backup Whereabouts IPAM
# kubectl get ippools.whereabouts.cni.cncf.io -A -o yaml > backup/whereabouts-pools.yaml
# kubectl get overlappingrangeipreservations.whereabouts.cni.cncf.io -A -o yaml > backup/whereabouts-reservations.yaml

# Backup SR-IOV config
# kubectl get sriovnetworknodepolicies.sriovnetwork.openshift.io -A -o yaml > backup/sriov-policies.yaml
# kubectl get sriovnetworks.sriovnetwork.openshift.io -A -o yaml > backup/sriov-networks.yaml

# Velero backup with Multus resources
# velero backup create multus-backup \
#   --include-resources network-attachment-definitions.k8s.cni.cncf.io,ippools.whereabouts.cni.cncf.io,sriovnetworks.sriovnetwork.openshift.io \
#   --include-namespaces default,production,telco

from dataclasses import dataclass

@dataclass
class BackupJob:
    """One row of the backup-job listing: what to run, when, and where it is kept."""

    name: str       # short label shown in brackets in the listing
    command: str    # command (or tool invocation) that produces the backup
    schedule: str   # human-readable schedule, e.g. "Hourly" or "Daily 02:00"
    retention: str  # how long the backup is kept, e.g. "30 days"
    storage: str    # where the backup is stored, e.g. "Git + S3"

# Catalogue of recurring backup jobs, printed below as a quick reference.
#
# The kubectl commands are spelled so they work as written:
#   * `net-attach-def` is the short name registered by the upstream Multus CRD
#     (`nad` is not registered there).
#   * The Whereabouts pool resource is fully qualified, matching the backup
#     script, so it cannot be confused with another CRD also named `ippools`.
#   * `-A` is passed for SR-IOV node policies, matching the backup script and
#     the recovery runbook; without it only the current namespace is exported.
jobs = [
    BackupJob("NAD Export", "kubectl get net-attach-def -A -o yaml", "Daily 02:00", "30 days", "Git + S3"),
    BackupJob("IPAM State", "kubectl get ippools.whereabouts.cni.cncf.io -A -o yaml", "Hourly", "7 days", "S3"),
    BackupJob("CNI Config", "Ansible collect /etc/cni/net.d/", "Daily 03:00", "30 days", "Git + S3"),
    BackupJob("SR-IOV Policy", "kubectl get sriovnetworknodepolicies -A -o yaml", "On change", "30 days", "Git"),
    BackupJob("Velero Full", "velero backup create", "Daily 04:00", "14 days", "S3"),
    BackupJob("Helm Values", "git commit values.yaml", "On change", "Forever", "Git"),
]

print("=== Backup Jobs ===")
for j in jobs:
    print(f"  [{j.name}] Schedule: {j.schedule}")
    print(f"    Command: {j.command}")
    print(f"    Retention: {j.retention} | Storage: {j.storage}")

Recovery Procedures

Multus CNI Backup Recovery Strategy —
# === Disaster Recovery Runbook ===

@dataclass
class RecoveryStep:
    """One numbered step of the recovery runbook, with its check and its undo."""

    order: int     # position of the step in the runbook
    step: str      # short description of what the step does
    command: str   # command to run for this step
    verify: str    # command or criterion used to confirm the step worked
    rollback: str  # command or action to undo the step

# Ordered disaster-recovery runbook, printed below.
#
# Notes on the commands:
#   * `net-attach-def` is the short name registered by the upstream Multus CRD
#     (`nad` is not registered there).
#   * The Whereabouts pool resource is fully qualified, matching the name used
#     when the backup file was exported.
#   * The IPAM rollback deletes exactly what was applied from the backup file,
#     like steps 2 and 4. `kubectl delete <type> -A` with no name, selector or
#     `--all` is rejected by kubectl, and a blanket delete would also remove
#     pools that were never part of the restore.
steps = [
    RecoveryStep(1, "Install Multus CNI",
        "helm install multus -f values.yaml",
        "kubectl get pods -n kube-system | grep multus",
        "helm uninstall multus"),
    RecoveryStep(2, "Restore NAD resources",
        "kubectl apply -f backup/nad-all.yaml",
        "kubectl get net-attach-def -A",
        "kubectl delete -f backup/nad-all.yaml"),
    RecoveryStep(3, "Restore IPAM state",
        "kubectl apply -f backup/whereabouts-pools.yaml",
        "kubectl get ippools.whereabouts.cni.cncf.io -A",
        "kubectl delete -f backup/whereabouts-pools.yaml"),
    RecoveryStep(4, "Restore SR-IOV config",
        "kubectl apply -f backup/sriov-policies.yaml",
        "kubectl get sriovnetworknodepolicies -A",
        "kubectl delete -f backup/sriov-policies.yaml"),
    RecoveryStep(5, "Verify CNI config on nodes",
        "ansible all -m shell -a 'cat /etc/cni/net.d/00-multus.conf'",
        "Compare with backup config",
        "Ansible push backup config"),
    RecoveryStep(6, "Restart affected pods",
        "kubectl rollout restart deployment -n production",
        "kubectl get pods -n production -o wide (check IPs)",
        "kubectl rollout undo deployment -n production"),
    RecoveryStep(7, "Network connectivity test",
        "kubectl exec test-pod -- ping -c 3 10.0.1.1",
        "All pings succeed, no packet loss",
        "Check NAD config, IPAM, routes"),
]

print("=== Recovery Runbook ===")
for s in steps:
    print(f"  Step {s.order}: {s.step}")
    print(f"    Command: {s.command}")
    print(f"    Verify: {s.verify}")
    print(f"    Rollback: {s.rollback}")

Automation and Testing

# === Automated DR Testing ===

# CronJob for automated backup verification
# apiVersion: batch/v1
# kind: CronJob
# metadata:
#   name: multus-backup-verify
# spec:
#   schedule: "0 6 * * 1"  # Every Monday 6:00
#   jobTemplate:
#     spec:
#       template:
#         spec:
#           restartPolicy: OnFailure  # Job pods must use Never or OnFailure
#           containers:
#             - name: verify
#               image: bitnami/kubectl
#               command:
#                 - /bin/bash
#                 - -c
#                 - |
#                   # Count NADs
#                   CURRENT=$(kubectl get net-attach-def -A --no-headers | wc -l)
#                   BACKUP=$(grep "kind: NetworkAttachmentDefinition" /backup/nad-all.yaml | wc -l)
#                   if [ "$CURRENT" != "$BACKUP" ]; then
#                     echo "WARNING: NAD count mismatch current=$CURRENT backup=$BACKUP"
#                     # Send alert
#                   fi
#                   # Verify IPAM
#                   kubectl get ippools.whereabouts.cni.cncf.io -A --no-headers | wc -l

@dataclass
class DRTest:
    """One entry of the disaster-recovery test schedule."""

    test: str           # name of the test
    frequency: str      # how often it runs, e.g. "Weekly"
    method: str         # how the test is carried out
    pass_criteria: str  # what must hold for the test to pass
    last_result: str    # outcome of the most recent run, e.g. "PASS"

# Recurring DR tests, from the cheapest check to the full drill.
tests = [
    DRTest(
        test="NAD backup completeness",
        frequency="Weekly",
        method="Compare NAD count backup vs live",
        pass_criteria="Count matches, all NADs in backup",
        last_result="PASS",
    ),
    DRTest(
        test="IPAM state consistency",
        frequency="Daily",
        method="Compare IP assignments backup vs live",
        pass_criteria="No orphaned IPs, no conflicts",
        last_result="PASS",
    ),
    DRTest(
        test="Restore to test cluster",
        frequency="Monthly",
        method="Apply backup to test cluster, deploy pods",
        pass_criteria="Pods get correct IPs, traffic flows",
        last_result="PASS",
    ),
    DRTest(
        test="SR-IOV recovery",
        frequency="Quarterly",
        method="Restore SR-IOV config, verify VF binding",
        pass_criteria="VFs allocated, pod network works",
        last_result="PASS",
    ),
    DRTest(
        test="Full DR drill",
        frequency="Quarterly",
        method="Simulate cluster loss, full recovery",
        pass_criteria="All networks restored < 2 hours",
        last_result="PASS",
    ),
]

print("DR Test Schedule:")
for t in tests:
    # One print call per entry; sep="\n" yields the same three output lines.
    print(
        f"  [{t.test}] Frequency: {t.frequency}",
        f"    Method: {t.method}",
        f"    Pass: {t.pass_criteria} | Last: {t.last_result}",
        sep="\n",
    )

# Recovery-time (RTO) and recovery-point (RPO) targets, keyed by component.
rto_rpo = {
    "NAD Resources": "RTO: 15 min | RPO: Last change (Git)",
    "IPAM State": "RTO: 30 min | RPO: 1 hour",
    "SR-IOV Config": "RTO: 1 hour | RPO: Last change (Git)",
    "Full Network Recovery": "RTO: 2 hours | RPO: 1 hour",
    "Pod Network Connectivity": "RTO: 30 min (after infra) | RPO: N/A",
}

# Two leading newlines separate this section from the preceding output.
print("\n\nRTO/RPO Targets:")
for k, v in rto_rpo.items():
    print("  [{}]: {}".format(k, v))

เคล็ดลับ

  • GitOps: เก็บ NAD ทั้งหมดใน Git Repository ทุกการเปลี่ยนแปลงต้อง Commit
  • Velero: ใช้ Velero Backup Kubernetes Resources รวม Multus CRDs
  • IPAM: Backup IPAM State บ่อยกว่า NAD เพราะเปลี่ยนตลอด
  • Test: ทดสอบ Recovery บน Test Cluster ทุกเดือน
  • Label: ใส่ Label บน NAD สำหรับ Filter Backup ตาม Environment

Multus CNI คืออะไร

Meta CNI Plugin Multi-homed Pod หลาย Network Interface Calico Flannel SR-IOV Macvlan NAD Telco NFV Storage Management Network Traffic