TensorFlow Serving Production Setup Guide

TensorFlow Serving Production

TensorFlow Serving: deploying ML models to production with Docker, gRPC and REST APIs, monitoring, auto-scaling on Kubernetes, and GPU inference.

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: oVirt Virtualization Technical Debt Management

Feature	TF Serving	TorchServe	Triton (NVIDIA)
Framework	TensorFlow only	PyTorch only	TF + PyTorch + ONNX
API	gRPC + REST	gRPC + REST	gRPC + REST
Batching	Built-in	Built-in	Built-in (advanced)
GPU	CUDA support	CUDA support	CUDA + TensorRT
Versioning	Auto version	Manual	Auto version
Kubernetes	Works well	Works well	Works well

Model Export & Docker

# === TensorFlow Serving Setup ===

# Export SavedModel
# import tensorflow as tf
#
# model = tf.keras.models.load_model('my_model.h5')
#
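# # Save as SavedModel (version 1); TF Serving loads numbered version subdirectories.
# # With Keras 3 (TF 2.16+), use model.export('/models/my_model/1') instead of save().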
# model.save('/models/my_model/1')
#
# # Verify SavedModel
# # saved_model_cli show --dir /models/my_model/1 --all
# # Output:
# # signature_def['serving_default']:
# #   inputs['input_1']:  dtype: DT_FLOAT  shape: (-1, 224, 224, 3)
# #   outputs['dense']:   dtype: DT_FLOAT  shape: (-1, 1000)
#
# # Docker run (CPU)
# # docker run -d --name tf-serving \
# #   -p 8501:8501 -p 8500:8500 \
# #   --mount type=bind,source=/models/my_model,target=/models/my_model \
# #   -e MODEL_NAME=my_model \
# #   tensorflow/serving
#
# # Docker run (GPU)
# # docker run -d --name tf-serving-gpu \
# #   --gpus all \
# #   -p 8501:8501 -p 8500:8500 \
# #   --mount type=bind,source=/models/my_model,target=/models/my_model \
# #   -e MODEL_NAME=my_model \
# #   tensorflow/serving:latest-gpu
#
# # REST API call
# # curl -X POST http://localhost:8501/v1/models/my_model:predict \
# #   -H "Content-Type: application/json" \
# #   -d '{"instances": [{"input_1": [[0.1, 0.2, ...]]}]}'

from dataclasses import dataclass

@dataclass
class DeployOption:
    """One TensorFlow Serving deployment option, as shown in the comparison report.

    All fields are free-form display strings; the report loop prints them
    verbatim and nothing in this file parses or executes them.
    """
    option: str   # display name of the deployment option, e.g. "Docker (CPU)"
    command: str  # example command line shown for this option (display only)
    when: str     # situations in which this option is a good fit
    pros: str     # advantages of the option
    cons: str     # drawbacks of the option

# Catalogue of deployment options, ordered from the simplest setup to the
# most production-oriented one. Fields are named explicitly so each entry
# can be read without consulting the DeployOption field order.
options = [
    DeployOption(
        option="Docker (CPU)",
        command="docker run tensorflow/serving",
        when="Development, Small traffic, No GPU needed",
        pros="ง่าย เร็ว ไม่ต้อง Config มาก",
        cons="ไม่มี Auto-scaling Manual restart",
    ),
    DeployOption(
        option="Docker (GPU)",
        command="docker run --gpus all tensorflow/serving:latest-gpu",
        when="GPU Inference, Low latency needed",
        pros="GPU Acceleration ลด Latency 5-10x",
        cons="ต้องมี NVIDIA Driver Docker GPU Support",
    ),
    DeployOption(
        option="Docker Compose",
        command="docker-compose up -d",
        when="Multi-model, Development/Staging",
        pros="จัดการหลาย Model Container ง่าย",
        cons="ไม่มี Auto-scaling เหมือน Kubernetes",
    ),
    DeployOption(
        option="Kubernetes + HPA",
        command="kubectl apply -f tf-serving.yaml",
        when="Production, Auto-scaling, High Availability",
        pros="Auto-scale, Rolling Update, HA, Monitoring",
        cons="ซับซ้อน ต้องมี K8s Cluster",
    ),
    DeployOption(
        option="Kubernetes + GPU",
        command="kubectl apply -f tf-serving-gpu.yaml",
        when="Production GPU Inference at scale",
        pros="GPU Auto-scaling, Multi-model, HA",
        cons="แพง ต้อง GPU Node Pool NVIDIA Plugin",
    ),
]

# Print a four-line summary per option under a section banner.
print("=== Deployment Options ===")
for choice in options:
    print("\n".join((
        f"  [{choice.option}] When: {choice.when}",
        f"    Command: {choice.command}",
        f"    Pros: {choice.pros}",
        f"    Cons: {choice.cons}",
    )))

Kubernetes Deployment

# === Kubernetes Config ===

# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: tf-serving
# spec:
#   replicas: 3
#   selector:
#     matchLabels:
#       app: tf-serving
#   template:
#     metadata:
#       labels:
#         app: tf-serving
#     spec:
#       containers:
#       - name: tf-serving
#         image: tensorflow/serving:latest
#         ports:
#         - containerPort: 8501  # REST
#         - containerPort: 8500  # gRPC
#         env:
#         - name: MODEL_NAME
#           value: "my_model"
#         volumeMounts:
#         - name: model-volume
#           mountPath: /models/my_model
#         resources:
#           requests:
#             cpu: "500m"
#             memory: "1Gi"
#           limits:
#             cpu: "2"
#             memory: "4Gi"
#         readinessProbe:
#           httpGet:
#             path: /v1/models/my_model
#             port: 8501
#           initialDelaySeconds: 30
#           periodSeconds: 10
#       volumes:
#       - name: model-volume
#         persistentVolumeClaim:
#           claimName: model-pvc
# ---
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
#   name: tf-serving-hpa
# spec:
#   scaleTargetRef:
#     apiVersion: apps/v1
#     kind: Deployment
#     name: tf-serving
#   minReplicas: 2
#   maxReplicas: 20
#   metrics:
#   - type: Resource
#     resource:
#       name: cpu
#       target:
#         type: Utilization
#         averageUtilization: 70

@dataclass
class K8sConfig:
    """One Kubernetes configuration item, as shown in the configuration report.

    All fields are free-form display strings that the report loop prints
    verbatim.
    """
    component: str         # name of the configuration area, e.g. "Replicas"
    purpose: str           # what the setting is for
    key_settings: str      # manifest field(s) that control it
    production_value: str  # suggested value or range for production

# Recommended Kubernetes settings for a TF Serving deployment; the values
# mirror the example manifest shown in the comments above.
configs = [
    K8sConfig(
        component="Replicas",
        purpose="จำนวน Pod สำหรับ HA",
        key_settings="spec.replicas",
        production_value="3+ สำหรับ Production",
    ),
    K8sConfig(
        component="Resource Requests",
        purpose="Resource ขั้นต่ำที่ต้องการ",
        key_settings="resources.requests.cpu/memory",
        production_value="CPU: 500m-2, Memory: 1-4Gi",
    ),
    K8sConfig(
        component="Resource Limits",
        purpose="Resource สูงสุดที่ใช้ได้",
        key_settings="resources.limits.cpu/memory",
        production_value="CPU: 2-4, Memory: 4-8Gi",
    ),
    K8sConfig(
        component="Readiness Probe",
        purpose="ตรวจว่า Model Load เสร็จก่อนรับ Traffic",
        key_settings="readinessProbe.httpGet /v1/models/MODEL",
        production_value="initialDelay: 30s, period: 10s",
    ),
    K8sConfig(
        component="HPA",
        purpose="Auto-scale ตาม CPU/Custom Metrics",
        key_settings="minReplicas, maxReplicas, targetUtilization",
        production_value="min: 2, max: 20, CPU: 70%",
    ),
]

# Print a three-line summary per configuration item under a section banner.
print("=== K8s Configuration ===")
for item in configs:
    print("\n".join((
        f"  [{item.component}] {item.purpose}",
        f"    Setting: {item.key_settings}",
        f"    Production: {item.production_value}",
    )))

Monitoring

# === Monitoring Setup ===

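# Prometheus scrape config
# (TF Serving only exposes this endpoint when started with --monitoring_config_file
#  pointing at a config that sets prometheus_config { enable: true })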
# scrape_configs:
#   - job_name: 'tf-serving'
#     metrics_path: '/monitoring/prometheus/metrics'
#     static_configs:
#       - targets: ['tf-serving:8501']

# Key metrics to monitor:
# :tensorflow:serving:request_count       - Total requests
# :tensorflow:serving:request_latency     - Request latency histogram
# :tensorflow:core:saved_model_load_count - Model load events

@dataclass
class Metric:
    """One monitoring metric, as shown in the monitoring report.

    All fields are free-form display strings that the report loop prints
    verbatim.
    """
    metric: str     # display name of the metric, e.g. "Error Rate"
    type_: str      # metric kind label (e.g. "Histogram"); trailing underscore avoids shadowing the builtin `type`
    threshold: str  # target or healthy range for the metric
    alert: str      # condition under which an alert should fire
    dashboard: str  # suggested dashboard panel for the metric

# Metrics worth tracking for a TF Serving deployment, with their target
# thresholds, alert conditions, and suggested Grafana panels.
metrics = [
    Metric(
        metric="Request Latency p99",
        type_="Histogram",
        threshold="< 100ms (CPU), < 20ms (GPU)",
        alert="Alert เมื่อ > 200ms ต่อเนื่อง 5 นาที",
        dashboard="Grafana: Latency percentiles over time",
    ),
    Metric(
        metric="QPS (Queries per Second)",
        type_="Counter",
        threshold="ดู Capacity ไม่ให้เกิน 80%",
        alert="Alert เมื่อ QPS > 80% ของ Max tested",
        dashboard="Grafana: QPS line chart",
    ),
    Metric(
        metric="Error Rate",
        type_="Counter",
        threshold="< 0.1%",
        alert="Alert เมื่อ > 1% ต่อเนื่อง 2 นาที",
        dashboard="Grafana: Error rate percentage",
    ),
    Metric(
        metric="CPU/GPU Utilization",
        type_="Gauge",
        threshold="< 70% average",
        alert="Alert เมื่อ > 80% ต่อเนื่อง 5 นาที HPA Scale",
        dashboard="Grafana: CPU/GPU utilization gauge",
    ),
    Metric(
        metric="Model Version",
        type_="Info",
        threshold="Latest version loaded",
        alert="Alert เมื่อ Model Load Failed",
        dashboard="Grafana: Current model version text",
    ),
]

# Print a four-line summary per metric under a section banner.
print("=== Monitoring Metrics ===")
for entry in metrics:
    print("\n".join((
        f"  [{entry.metric}] Type: {entry.type_}",
        f"    Threshold: {entry.threshold}",
        f"    Alert: {entry.alert}",
        f"    Dashboard: {entry.dashboard}",
    )))