A/B Testing สำหรับ ML Models
A/B Testing เป็นวิธีมาตรฐานในการเปรียบเทียบ ML Models ก่อน Deploy เต็มรูปแบบ แบ่ง Traffic ส่งไป Model เก่าและใหม่ วัดผลด้วย Statistical Tests ตัดสินใจจากข้อมูลจริง
MLOps Workflow ทำให้กระบวนการ A/B Testing อัตโนมัติ ตั้งแต่ Training, Registry, Deployment, Traffic Splitting, Monitoring ไปจนถึง Automated Rollback
A/B Testing Framework
# ab_testing_ml.py — A/B Testing Framework สำหรับ ML
# pip install scipy numpy pandas
import numpy as np
import pandas as pd
from scipy import stats
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple
from datetime import datetime, timedelta
import json
@dataclass
class ABExperiment:
    """Configuration for one A/B test comparing two ML models."""
    name: str
    model_a: str  # Control model identifier
    model_b: str  # Treatment model identifier
    metric: str  # Primary metric name (e.g. click_through_rate)
    traffic_split: float = 0.1  # Fraction of traffic routed to model B
    min_sample_size: int = 1000
    significance_level: float = 0.05
    power: float = 0.80
    start_date: Optional[str] = None
    status: str = "draft"  # draft, running, completed, rolled_back
@dataclass
class ABResult:
    """Outcome of analyzing one A/B experiment."""
    metric_a: float  # Mean of the primary metric in the control arm
    metric_b: float  # Mean of the primary metric in the treatment arm
    sample_a: int  # Number of observations in the control arm
    sample_b: int  # Number of observations in the treatment arm
    p_value: float  # Two-sample t-test p-value
    confidence_interval: Tuple[float, float]  # CI of the mean difference (B - A)
    is_significant: bool  # True when p_value < the experiment's significance level
    lift: float  # % improvement of B over A
class MLABTestingFramework:
    """A/B testing framework for ML models.

    Stores experiment configurations and their analysis results,
    keyed by experiment name.
    """

    def __init__(self):
        # Forward-reference annotations keep this module importable even if
        # the dataclasses are defined later / elsewhere.
        self.experiments: Dict[str, "ABExperiment"] = {}
        self.results: Dict[str, "ABResult"] = {}

    def create_experiment(self, experiment: "ABExperiment"):
        """Register an experiment and print its configuration."""
        self.experiments[experiment.name] = experiment
        print(f"Created experiment: {experiment.name}")
        print(f" Control: {experiment.model_a}")
        print(f" Treatment: {experiment.model_b}")
        print(f" Traffic Split: {experiment.traffic_split:.0%}")

    def calculate_sample_size(self, baseline_rate, mde, alpha=0.05, power=0.80):
        """Return the required sample size per group.

        Standard two-proportion sample-size formula.

        Args:
            baseline_rate: Conversion rate of the control (p1).
            mde: Minimum detectable effect, relative (0.10 = +10%).
            alpha: Two-sided significance level.
            power: Desired statistical power (1 - beta).
        """
        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_beta = stats.norm.ppf(power)
        p1 = baseline_rate
        p2 = baseline_rate * (1 + mde)
        n = ((z_alpha * np.sqrt(2 * p1 * (1 - p1)) +
              z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2) / \
            ((p2 - p1) ** 2)
        return int(np.ceil(n))

    def analyze(self, name, data_a, data_b):
        """Analyze A/B test results with a two-sample t-test.

        Args:
            name: Name of a previously created experiment.
            data_a: Per-observation metric values, control arm.
            data_b: Per-observation metric values, treatment arm.

        Returns:
            ABResult with means, p-value, CI, significance flag and lift.
        """
        exp = self.experiments[name]
        mean_a = np.mean(data_a)
        mean_b = np.mean(data_b)
        n_a = len(data_a)
        n_b = len(data_b)
        # Two-sample t-test on the raw observations.
        t_stat, p_value = stats.ttest_ind(data_a, data_b)
        # Standard error of the difference in means. ddof=1 gives the sample
        # variance, consistent with the t-test (original used ddof=0).
        se = np.sqrt(np.var(data_a, ddof=1) / n_a + np.var(data_b, ddof=1) / n_b)
        diff = mean_b - mean_a
        # Derive the critical value from the experiment's significance level
        # instead of hardcoding 1.96 (which is exact only for alpha=0.05).
        z = stats.norm.ppf(1 - exp.significance_level / 2)
        ci = (diff - z * se, diff + z * se)
        # Relative lift in percent; undefined (NaN) when the control mean is 0.
        lift = (mean_b - mean_a) / mean_a * 100 if mean_a != 0 else float("nan")
        is_significant = p_value < exp.significance_level
        result = ABResult(
            metric_a=mean_a, metric_b=mean_b,
            sample_a=n_a, sample_b=n_b,
            p_value=p_value, confidence_interval=ci,
            is_significant=is_significant, lift=lift,
        )
        self.results[name] = result
        return result

    def report(self, name):
        """Print a human-readable report with a deploy/keep recommendation."""
        exp = self.experiments[name]
        result = self.results[name]
        print(f"\n{'='*55}")
        print(f"A/B Test Report: {name}")
        print(f"{'='*55}")
        print(f" Control ({exp.model_a}): {result.metric_a:.4f} (n={result.sample_a:,})")
        print(f" Treatment ({exp.model_b}): {result.metric_b:.4f} (n={result.sample_b:,})")
        print(f" Lift: {result.lift:+.2f}%")
        print(f" P-value: {result.p_value:.4f}")
        # Label the interval with the actual confidence level (was fixed "95%").
        print(f" {1 - exp.significance_level:.0%} CI: [{result.confidence_interval[0]:.4f}, {result.confidence_interval[1]:.4f}]")
        print(f" Significant: {'YES' if result.is_significant else 'NO'}")
        if result.is_significant and result.lift > 0:
            print(f"\n Recommendation: DEPLOY Model B ({exp.model_b})")
        elif result.is_significant and result.lift < 0:
            print(f"\n Recommendation: KEEP Model A ({exp.model_a})")
        else:
            print(f"\n Recommendation: Continue testing (not significant)")
# === Example usage ===
ab_framework = MLABTestingFramework()

# Define and register the experiment comparing two recommender models.
ctr_experiment = ABExperiment(
    name="recommendation-v2",
    model_a="rec-model-v1.2",
    model_b="rec-model-v2.0",
    metric="click_through_rate",
    traffic_split=0.10,
)
ab_framework.create_experiment(ctr_experiment)

# Required sample size per group for a 10% relative lift over a 5% baseline.
required_n = ab_framework.calculate_sample_size(baseline_rate=0.05, mde=0.10)
print(f"\nRequired sample size per group: {required_n:,}")

# Simulate click data for both arms (fixed seed for reproducibility).
np.random.seed(42)
clicks_control = np.random.binomial(1, 0.050, 5000)    # CTR 5.0%
clicks_treatment = np.random.binomial(1, 0.055, 5000)  # CTR 5.5%

ab_result = ab_framework.analyze("recommendation-v2", clicks_control, clicks_treatment)
ab_framework.report("recommendation-v2")
MLOps Pipeline
# mlops_pipeline.py — MLOps Workflow สำหรับ A/B Testing
# pip install mlflow boto3
from dataclasses import dataclass
from typing import Dict, List, Optional
from enum import Enum
import json
class ModelStage(Enum):
    """Lifecycle stage of a model version in the registry."""
    DEVELOPMENT = "development"
    STAGING = "staging"
    CANARY = "canary"
    PRODUCTION = "production"
    ARCHIVED = "archived"
@dataclass
class ModelVersion:
    """One versioned model entry in the registry."""
    name: str  # Model family name (e.g. "recommender")
    version: str  # Version string (e.g. "1.2")
    stage: ModelStage  # Current lifecycle stage
    metrics: Dict[str, float]  # Evaluation metrics recorded at registration
    artifact_path: str  # Storage location of the model artifact
    created_at: str  # Creation date string
class MLOpsWorkflow:
    """MLOps workflow for the model lifecycle.

    Tracks registered model versions per model name and active
    deployments (A/B tests and canaries).
    """

    def __init__(self):
        # Forward-reference annotations keep this module importable even if
        # the supporting types are defined later / elsewhere.
        self.models: Dict[str, List["ModelVersion"]] = {}
        self.deployments: Dict[str, Dict] = {}

    def register_model(self, model: "ModelVersion"):
        """Add a model version to the registry."""
        if model.name not in self.models:
            self.models[model.name] = []
        self.models[model.name].append(model)
        print(f"Registered: {model.name} v{model.version} ({model.stage.value})")

    def promote(self, name, version, target_stage: "ModelStage"):
        """Promote a registered model version to a new stage.

        Returns:
            True when the version was found and promoted, False otherwise.
        """
        models = self.models.get(name, [])
        for m in models:
            if m.version == version:
                old_stage = m.stage
                m.stage = target_stage
                print(f"Promoted: {name} v{version} "
                      f"{old_stage.value} -> {target_stage.value}")
                return True
        return False

    def deploy_ab(self, name, version_a, version_b, split=0.10):
        """Record an A/B deployment routing `split` of traffic to version B."""
        self.deployments[name] = {
            "type": "ab_test",
            "control": version_a,
            "treatment": version_b,
            "split": split,
            "status": "running",
        }
        print(f"\nA/B Deployment: {name}")
        print(f" Control: v{version_a} ({1-split:.0%} traffic)")
        print(f" Treatment: v{version_b} ({split:.0%} traffic)")

    def canary_deploy(self, name, version, initial_pct=5):
        """Record a canary deployment starting at `initial_pct` percent traffic."""
        self.deployments[name] = {
            "type": "canary",
            "version": version,
            "traffic_pct": initial_pct,
            "status": "running",
        }
        print(f"\nCanary Deployment: {name} v{version}")
        print(f" Initial traffic: {initial_pct}%")

    def rollback(self, name):
        """Mark a deployment as rolled back (silently ignores unknown names)."""
        if name in self.deployments:
            self.deployments[name]["status"] = "rolled_back"
            print(f"Rolled back: {name}")

    def pipeline_status(self):
        """Print all registered model versions and active deployments."""
        print(f"\n{'='*55}")
        print(f"MLOps Pipeline Status")
        print(f"{'='*55}")
        for name, versions in self.models.items():
            print(f"\n Model: {name}")
            for ver in versions:
                # Distinct names for the version object and the metric values;
                # the original reused `v` for both, which is confusing even
                # though the genexp scope made it technically safe.
                metrics_str = ", ".join(
                    f"{metric}={value:.3f}" for metric, value in ver.metrics.items()
                )
                print(f" v{ver.version} [{ver.stage.value}] {metrics_str}")
        if self.deployments:
            print(f"\n Active Deployments:")
            for name, deploy in self.deployments.items():
                print(f" {name}: {deploy['type']} ({deploy['status']})")
# === Example usage ===
mlops = MLOpsWorkflow()

# Register the current production model and its staged successor.
mlops.register_model(ModelVersion(
    name="recommender",
    version="1.2",
    stage=ModelStage.PRODUCTION,
    metrics={"accuracy": 0.85, "latency_p95": 45.0},
    artifact_path="s3://models/rec/1.2",
    created_at="2024-01-01",
))
mlops.register_model(ModelVersion(
    name="recommender",
    version="2.0",
    stage=ModelStage.STAGING,
    metrics={"accuracy": 0.88, "latency_p95": 42.0},
    artifact_path="s3://models/rec/2.0",
    created_at="2024-02-01",
))

# Route 10% of traffic to v2.0 as the treatment arm.
mlops.deploy_ab("recommender", "1.2", "2.0", split=0.10)

# Show the registry and active deployments.
mlops.pipeline_status()
Kubernetes Deployment สำหรับ A/B Testing
# === Kubernetes + Istio A/B Testing ===
# 1. Model Deployments
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ml-model-v1
# labels:
# app: ml-model
# version: v1
# spec:
# replicas: 3
# selector:
# matchLabels:
# app: ml-model
# version: v1
# template:
# metadata:
# labels:
# app: ml-model
# version: v1
# spec:
# containers:
# - name: model
# image: ml-model:v1.2
# ports:
# - containerPort: 8080
# resources:
# requests:
# cpu: "500m"
# memory: "1Gi"
# nvidia.com/gpu: "1"
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: ml-model-v2
# labels:
# app: ml-model
# version: v2
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: ml-model
# version: v2
# template:
# metadata:
# labels:
# app: ml-model
# version: v2
# spec:
# containers:
# - name: model
# image: ml-model:v2.0
# ports:
# - containerPort: 8080
# 2. Istio VirtualService — Traffic Splitting
# apiVersion: networking.istio.io/v1beta1
# kind: VirtualService
# metadata:
# name: ml-model-vs
# spec:
# hosts:
# - ml-model
# http:
# - route:
# - destination:
# host: ml-model
# subset: v1
# weight: 90
# - destination:
# host: ml-model
# subset: v2
# weight: 10
# 3. DestinationRule
# apiVersion: networking.istio.io/v1beta1
# kind: DestinationRule
# metadata:
# name: ml-model-dr
# spec:
# host: ml-model
# subsets:
# - name: v1
# labels:
# version: v1
# - name: v2
# labels:
# version: v2
# 4. Progressive Traffic Increase
# kubectl patch virtualservice ml-model-vs --type=json \
# -p='[{"op":"replace","path":"/spec/http/0/route/0/weight","value":80},
# {"op":"replace","path":"/spec/http/0/route/1/weight","value":20}]'
# 5. Full Rollout
# kubectl patch virtualservice ml-model-vs --type=json \
# -p='[{"op":"replace","path":"/spec/http/0/route/0/weight","value":0},
# {"op":"replace","path":"/spec/http/0/route/1/weight","value":100}]'
echo "A/B Testing Deployment:"
echo " Model v1: 90% traffic (Control)"
echo " Model v2: 10% traffic (Treatment)"
echo " Istio VirtualService: Traffic splitting"
echo " Progressive: 90/10 -> 80/20 -> 50/50 -> 0/100"
Best Practices
- Sample Size: คำนวณ Sample Size ก่อนเริ่ม อย่าหยุด Test ก่อนถึงจำนวน
- One Metric: กำหนด Primary Metric เดียว ป้องกัน Multiple Testing Problem
- Guardrail Metrics: ตั้ง Guardrail Metrics (Latency, Error Rate) ถ้าเกิน Rollback ทันที
- Progressive Rollout: เริ่มจาก 5-10% ค่อยเพิ่มทีละ Step ไม่ Switch 100% ทีเดียว
- Automated Rollback: ตั้ง Automated Rollback ถ้า Error Rate เกิน Threshold
- Model Registry: เก็บทุก Model Version ใน Registry พร้อม Metrics สำหรับ Comparison
A/B Testing สำหรับ ML คืออะไร
เปรียบเทียบ Model ใหม่กับเก่า โดยแบ่ง Traffic ไปยังกลุ่ม Control และ Treatment วัด Accuracy, Latency และ Business KPIs แล้วใช้ Statistical Tests ตัดสินว่า Model ใหม่ดีกว่าจริงหรือไม่
MLOps คืออะไร
Practices รวม ML กับ DevOps ครอบคลุม Data Pipeline Training Registry Deployment Monitoring Retraining ทำให้ Models ไป Production เร็วเชื่อถือได้ Automation ลดงาน Manual
Traffic Splitting ทำอย่างไร
ใช้ Istio Service Mesh, API Gateway, Load Balancer หรือ Feature Flags แบ่ง Traffic เช่น 90/10 แล้วค่อยเพิ่มสัดส่วนถ้าผลดี รองรับ Canary Deployment รวมถึง Header-based และ Cookie-based Routing
วิธีวัดผล A/B Test สำหรับ ML ทำอย่างไร
กำหนด Primary Metric เช่น CTR, Conversion, Revenue และ Guardrail Metrics เช่น Latency, Error Rate ใช้ t-test หรือ Chi-square โดยคำนวณ Sample Size ล่วงหน้า ที่ Significance Level 0.05 และ Power 80%
สรุป
A/B Testing สำหรับ ML Models ร่วมกับ MLOps Workflow ให้กระบวนการ Deploy Model ที่เชื่อถือได้ คำนวณ Sample Size ก่อน กำหนด Primary Metric เดียว Progressive Rollout จาก 10% Guardrail Metrics สำหรับ Automated Rollback Model Registry เก็บทุก Version Istio Traffic Splitting
