A/B Testing สำหรับ ML Models
A/B Testing เป็นวิธีมาตรฐานในการเปรียบเทียบ ML Models ก่อน Deploy เต็มรูปแบบ แบ่ง Traffic ส่งไป Model เก่าและใหม่ วัดผลด้วย Statistical Tests ตัดสินใจจากข้อมูลจริง
MLOps Workflow ทำให้กระบวนการ A/B Testing อัตโนมัติ ตั้งแต่ Training, Registry, Deployment, Traffic Splitting, Monitoring ไปจนถึง Automated Rollback
A/B Testing Framework
# ab_testing_ml.py — A/B Testing Framework สำหรับ ML
# pip install scipy numpy pandas
import numpy as np
import pandas as pd
from scipy import stats
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple
from datetime import datetime, timedelta
import json
@dataclass
class ABExperiment:
    """Configuration for one A/B test comparing two ML models."""
    name: str
    model_a: str  # Control model identifier
    model_b: str  # Treatment model identifier
    metric: str  # Primary metric name (e.g. click_through_rate)
    traffic_split: float = 0.1  # Fraction of traffic routed to model B
    min_sample_size: int = 1000
    significance_level: float = 0.05
    power: float = 0.80
    start_date: Optional[str] = None
    status: str = "draft"  # draft, running, completed, rolled_back
@dataclass
class ABResult:
    """Outcome of analyzing one A/B experiment."""
    metric_a: float  # Mean of the primary metric in the control arm
    metric_b: float  # Mean of the primary metric in the treatment arm
    sample_a: int  # Number of observations in the control arm
    sample_b: int  # Number of observations in the treatment arm
    p_value: float  # Two-sample t-test p-value
    confidence_interval: Tuple[float, float]  # CI of the mean difference (B - A)
    is_significant: bool  # True when p_value < the experiment's significance level
    lift: float  # % improvement of B over A
class MLABTestingFramework:
    """A/B testing framework for ML models.

    Stores experiment configurations and their analysis results,
    keyed by experiment name.
    """

    def __init__(self):
        # Forward-reference annotations keep this module importable even if
        # the dataclasses are defined later / elsewhere.
        self.experiments: Dict[str, "ABExperiment"] = {}
        self.results: Dict[str, "ABResult"] = {}

    def create_experiment(self, experiment: "ABExperiment"):
        """Register an experiment and print its configuration."""
        self.experiments[experiment.name] = experiment
        print(f"Created experiment: {experiment.name}")
        print(f" Control: {experiment.model_a}")
        print(f" Treatment: {experiment.model_b}")
        print(f" Traffic Split: {experiment.traffic_split:.0%}")

    def calculate_sample_size(self, baseline_rate, mde, alpha=0.05, power=0.80):
        """Return the required sample size per group.

        Standard two-proportion sample-size formula.

        Args:
            baseline_rate: Conversion rate of the control (p1).
            mde: Minimum detectable effect, relative (0.10 = +10%).
            alpha: Two-sided significance level.
            power: Desired statistical power (1 - beta).
        """
        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_beta = stats.norm.ppf(power)
        p1 = baseline_rate
        p2 = baseline_rate * (1 + mde)
        n = ((z_alpha * np.sqrt(2 * p1 * (1 - p1)) +
              z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2) / \
            ((p2 - p1) ** 2)
        return int(np.ceil(n))

    def analyze(self, name, data_a, data_b):
        """Analyze A/B test results with a two-sample t-test.

        Args:
            name: Name of a previously created experiment.
            data_a: Per-observation metric values, control arm.
            data_b: Per-observation metric values, treatment arm.

        Returns:
            ABResult with means, p-value, CI, significance flag and lift.
        """
        exp = self.experiments[name]
        mean_a = np.mean(data_a)
        mean_b = np.mean(data_b)
        n_a = len(data_a)
        n_b = len(data_b)
        # Two-sample t-test on the raw observations.
        t_stat, p_value = stats.ttest_ind(data_a, data_b)
        # Standard error of the difference in means. ddof=1 gives the sample
        # variance, consistent with the t-test (original used ddof=0).
        se = np.sqrt(np.var(data_a, ddof=1) / n_a + np.var(data_b, ddof=1) / n_b)
        diff = mean_b - mean_a
        # Derive the critical value from the experiment's significance level
        # instead of hardcoding 1.96 (which is exact only for alpha=0.05).
        z = stats.norm.ppf(1 - exp.significance_level / 2)
        ci = (diff - z * se, diff + z * se)
        # Relative lift in percent; undefined (NaN) when the control mean is 0.
        lift = (mean_b - mean_a) / mean_a * 100 if mean_a != 0 else float("nan")
        is_significant = p_value < exp.significance_level
        result = ABResult(
            metric_a=mean_a, metric_b=mean_b,
            sample_a=n_a, sample_b=n_b,
            p_value=p_value, confidence_interval=ci,
            is_significant=is_significant, lift=lift,
        )
        self.results[name] = result
        return result

    def report(self, name):
        """Print a human-readable report with a deploy/keep recommendation."""
        exp = self.experiments[name]
        result = self.results[name]
        print(f"\n{'='*55}")
        print(f"A/B Test Report: {name}")
        print(f"{'='*55}")
        print(f" Control ({exp.model_a}): {result.metric_a:.4f} (n={result.sample_a:,})")
        print(f" Treatment ({exp.model_b}): {result.metric_b:.4f} (n={result.sample_b:,})")
        print(f" Lift: {result.lift:+.2f}%")
        print(f" P-value: {result.p_value:.4f}")
        # Label the interval with the actual confidence level (was fixed "95%").
        print(f" {1 - exp.significance_level:.0%} CI: [{result.confidence_interval[0]:.4f}, {result.confidence_interval[1]:.4f}]")
        print(f" Significant: {'YES' if result.is_significant else 'NO'}")
        if result.is_significant and result.lift > 0:
            print(f"\n Recommendation: DEPLOY Model B ({exp.model_b})")
        elif result.is_significant and result.lift < 0:
            print(f"\n Recommendation: KEEP Model A ({exp.model_a})")
        else:
            print(f"\n Recommendation: Continue testing (not significant)")
# === Example usage ===
ab_framework = MLABTestingFramework()

# Define and register the experiment comparing two recommender models.
ctr_experiment = ABExperiment(
    name="recommendation-v2",
    model_a="rec-model-v1.2",
    model_b="rec-model-v2.0",
    metric="click_through_rate",
    traffic_split=0.10,
)
ab_framework.create_experiment(ctr_experiment)

# Required sample size per group for a 10% relative lift over a 5% baseline.
required_n = ab_framework.calculate_sample_size(baseline_rate=0.05, mde=0.10)
print(f"\nRequired sample size per group: {required_n:,}")

# Simulate click data for both arms (fixed seed for reproducibility).
np.random.seed(42)
clicks_control = np.random.binomial(1, 0.050, 5000)    # CTR 5.0%
clicks_treatment = np.random.binomial(1, 0.055, 5000)  # CTR 5.5%

ab_result = ab_framework.analyze("recommendation-v2", clicks_control, clicks_treatment)
ab_framework.report("recommendation-v2")
MLOps Pipeline
# mlops_pipeline.py — MLOps Workflow สำหรับ A/B Testing
# pip install mlflow boto3
from dataclasses import dataclass
from typing import Dict, List, Optional
from enum import Enum
import json
class ModelStage(Enum):
    """Lifecycle stage of a model version in the registry."""
    DEVELOPMENT = "development"
    STAGING = "staging"
    CANARY = "canary"
    PRODUCTION = "production"
    ARCHIVED = "archived"
@dataclass
class ModelVersion:
    """One versioned model entry in the registry."""
    name: str  # Model family name (e.g. "recommender")
    version: str  # Version string (e.g. "1.2")
    stage: ModelStage  # Current lifecycle stage
    metrics: Dict[str, float]  # Evaluation metrics recorded at registration
    artifact_path: str  # Storage location of the model artifact
    created_at: str  # Creation date string
class MLOpsWorkflow:
    """MLOps workflow for the model lifecycle.

    Tracks registered model versions per model name and active
    deployments (A/B tests and canaries).
    """

    def __init__(self):
        # Forward-reference annotations keep this module importable even if
        # the supporting types are defined later / elsewhere.
        self.models: Dict[str, List["ModelVersion"]] = {}
        self.deployments: Dict[str, Dict] = {}

    def register_model(self, model: "ModelVersion"):
        """Add a model version to the registry."""
        if model.name not in self.models:
            self.models[model.name] = []
        self.models[model.name].append(model)
        print(f"Registered: {model.name} v{model.version} ({model.stage.value})")

    def promote(self, name, version, target_stage: "ModelStage"):
        """Promote a registered model version to a new stage.

        Returns:
            True when the version was found and promoted, False otherwise.
        """
        models = self.models.get(name, [])
        for m in models:
            if m.version == version:
                old_stage = m.stage
                m.stage = target_stage
                print(f"Promoted: {name} v{version} "
                      f"{old_stage.value} -> {target_stage.value}")
                return True
        return False

    def deploy_ab(self, name, version_a, version_b, split=0.10):
        """Record an A/B deployment routing `split` of traffic to version B."""
        self.deployments[name] = {
            "type": "ab_test",
            "control": version_a,
            "treatment": version_b,
            "split": split,
            "status": "running",
        }
        print(f"\nA/B Deployment: {name}")
        print(f" Control: v{version_a} ({1-split:.0%} traffic)")
        print(f" Treatment: v{version_b} ({split:.0%} traffic)")

    def canary_deploy(self, name, version, initial_pct=5):
        """Record a canary deployment starting at `initial_pct` percent traffic."""
        self.deployments[name] = {
            "type": "canary",
            "version": version,
            "traffic_pct": initial_pct,
            "status": "running",
        }
        print(f"\nCanary Deployment: {name} v{version}")
        print(f" Initial traffic: {initial_pct}%")

    def rollback(self, name):
        """Mark a deployment as rolled back (silently ignores unknown names)."""
        if name in self.deployments:
            self.deployments[name]["status"] = "rolled_back"
            print(f"Rolled back: {name}")

    def pipeline_status(self):
        """Print all registered model versions and active deployments."""
        print(f"\n{'='*55}")
        print(f"MLOps Pipeline Status")
        print(f"{'='*55}")
        for name, versions in self.models.items():
            print(f"\n Model: {name}")
            for ver in versions:
                # Distinct names for the version object and the metric values;
                # the original reused `v` for both, which is confusing even
                # though the genexp scope made it technically safe.
                metrics_str = ", ".join(
                    f"{metric}={value:.3f}" for metric, value in ver.metrics.items()
                )
                print(f" v{ver.version} [{ver.stage.value}] {metrics_str}")
        if self.deployments:
            print(f"\n Active Deployments:")
            for name, deploy in self.deployments.items():
                print(f" {name}: {deploy['type']} ({deploy['status']})")
# === Example usage ===
mlops = MLOpsWorkflow()

# Register the current production model and its staged successor.
mlops.register_model(ModelVersion(
    name="recommender",
    version="1.2",
    stage=ModelStage.PRODUCTION,
    metrics={"accuracy": 0.85, "latency_p95": 45.0},
    artifact_path="s3://models/rec/1.2",
    created_at="2024-01-01",
))
mlops.register_model(ModelVersion(
    name="recommender",
    version="2.0",
    stage=ModelStage.STAGING,
    metrics={"accuracy": 0.88, "latency_p95": 42.0},
    artifact_path="s3://models/rec/2.0",
    created_at="2024-02-01",
))

# Route 10% of traffic to v2.0 as the treatment arm.
mlops.deploy_ab("recommender", "1.2", "2.0", split=0.10)

# Show the registry and active deployments.
mlops.pipeline_status()
Kubernetes Deployment สำหรับ A/B Testing
# === Kubernetes + Istio A/B Testing ===
# 1. Model Deployments
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ml-model-v1
# labels:
# app: ml-model
# version: v1
# spec:
# replicas: 3
# selector:
# matchLabels:
# app: ml-model
# version: v1
# template:
# metadata:
# labels:
# app: ml-model
# version: v1
# spec:
# containers:
# - name: model
# image: ml-model:v1.2
# ports:
# - containerPort: 8080
# resources:
# requests:
# cpu: "500m"
# memory: "1Gi"
# nvidia.com/gpu: "1"
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ml-model-v2
# labels:
# app: ml-model
# version: v2
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: ml-model
# version: v2
# template:
# metadata:
# labels:
# app: ml-model
# version: v2
# spec:
# containers:
# - name: model
# image: ml-model:v2.0
# ports:
# - containerPort: 8080
# 2. Istio VirtualService — Traffic Splitting
# apiVersion: networking.istio.io/v1beta1
# kind: VirtualService
# metadata:
# name: ml-model-vs
# spec:
# hosts:
# - ml-model
# http:
# - route:
# - destination:
# host: ml-model
# subset: v1
# weight: 90
# - destination:
# host: ml-model
# subset: v2
# weight: 10
# 3. DestinationRule
# apiVersion: networking.istio.io/v1beta1
# kind: DestinationRule
# metadata:
# name: ml-model-dr
# spec:
# host: ml-model
# subsets:
# - name: v1
# labels:
# version: v1
# - name: v2
# labels:
# version: v2
# 4. Progressive Traffic Increase
# kubectl patch virtualservice ml-model-vs --type=json \
# -p='[{"op":"replace","path":"/spec/http/0/route/0/weight","value":80},
# {"op":"replace","path":"/spec/http/0/route/1/weight","value":20}]'
# 5. Full Rollout
# kubectl patch virtualservice ml-model-vs --type=json \
# -p='[{"op":"replace","path":"/spec/http/0/route/0/weight","value":0},
# {"op":"replace","path":"/spec/http/0/route/1/weight","value":100}]'
echo "A/B Testing Deployment:"
echo " Model v1: 90% traffic (Control)"
echo " Model v2: 10% traffic (Treatment)"
echo " Istio VirtualService: Traffic splitting"
echo " Progressive: 90/10 -> 80/20 -> 50/50 -> 0/100"
Best Practices
- Sample Size: คำนวณ Sample Size ก่อนเริ่ม อย่าหยุด Test ก่อนถึงจำนวน
- One Metric: กำหนด Primary Metric เดียว ป้องกัน Multiple Testing Problem
- Guardrail Metrics: ตั้ง Guardrail Metrics (Latency, Error Rate) ถ้าเกิน Rollback ทันที
- Progressive Rollout: เริ่มจาก 5-10% ค่อยเพิ่มทีละ Step ไม่ Switch 100% ทีเดียว
- Automated Rollback: ตั้ง Automated Rollback ถ้า Error Rate เกิน Threshold
- Model Registry: เก็บทุก Model Version ใน Registry พร้อม Metrics สำหรับ Comparison
A/B Testing สำหรับ ML คืออะไร
เปรียบเทียบ Model ใหม่กับเก่า โดยแบ่ง Traffic ไปยังกลุ่ม Control และ Treatment วัด Accuracy, Latency และ Business KPIs แล้วใช้ Statistical Tests ตัดสินว่า Model ใหม่ดีกว่าจริงหรือไม่
MLOps คืออะไร
Practices รวม ML กับ DevOps ครอบคลุม Data Pipeline Training Registry Deployment Monitoring Retraining ทำให้ Models ไป Production เร็วเชื่อถือได้ Automation ลดงาน Manual
Traffic Splitting ทำอย่างไร
ใช้ Istio Service Mesh, API Gateway, Load Balancer หรือ Feature Flags แบ่ง Traffic เช่น 90/10 แล้วค่อยเพิ่มสัดส่วนถ้าผลดี รองรับ Canary Deployment รวมถึง Header-based และ Cookie-based Routing
วิธีวัดผล A/B Test สำหรับ ML ทำอย่างไร
กำหนด Primary Metric เช่น CTR, Conversion, Revenue และ Guardrail Metrics เช่น Latency, Error Rate ใช้ t-test หรือ Chi-square โดยคำนวณ Sample Size ล่วงหน้า ที่ Significance Level 0.05 และ Power 80%
สรุป
A/B Testing สำหรับ ML Models ร่วมกับ MLOps Workflow ให้กระบวนการ Deploy Model ที่เชื่อถือได้ คำนวณ Sample Size ก่อน กำหนด Primary Metric เดียว Progressive Rollout จาก 10% Guardrail Metrics สำหรับ Automated Rollback Model Registry เก็บทุก Version Istio Traffic Splitting
