SiamCafe · Blog
TensorFlow Serving Troubleshooting แก้ปัญหา
บทความ

TensorFlow Serving Troubleshooting แก้ปัญหา

เผยแพร่ 28 พฤษภาคม 2569

TensorFlow Serving Troubleshooting แก้ปัญหา

TensorFlow Serving เป็น production-grade serving system สำหรับ deploy machine learning models ที่ Google พัฒนา รองรับ gRPC และ REST API สำหรับ inference requests ในการใช้งานจริงมักเจอปัญหาหลายอย่าง เช่น model loading errors, performance bottlenecks, memory leaks, version conflicts และ scaling issues บทความนี้รวบรวมปัญหาที่พบบ่อยของ TensorFlow Serving พร้อมวิธีแก้ไข debugging techniques และ monitoring best practices

ปัญหาที่พบบ่อยและวิธีแก้

# common_issues.py — Common TF Serving issues
import json

class CommonIssues:
    """Catalogue of frequent TensorFlow Serving failures and their remedies."""

    # Maps a short issue key to a dict with three entries:
    #   "error" - the message as it shows up in the server log / API response
    #   "cause" - a one-line explanation of the usual root cause
    #   "fix"   - ordered list of remediation steps
    ISSUES = {
        "model_not_found": {
            "error": "NOT_FOUND: Could not find model",
            "cause": "Model path ผิด หรือ model format ไม่ถูกต้อง",
            "fix": [
                "ตรวจสอบ model path: /models/my_model/1/saved_model.pb",
                "Directory structure: /models/{model_name}/{version}/saved_model.pb",
                "ตรวจสอบ --model_base_path flag ตรงกับ path จริง",
                "version directory ต้องเป็นตัวเลข (1, 2, 3...)",
            ],
        },
        "signature_mismatch": {
            "error": "INVALID_ARGUMENT: input tensor alias not found in signature",
            "cause": "Input/output tensor names ไม่ตรงกับ model signature",
            "fix": [
                "ตรวจสอบ signature: saved_model_cli show --dir /model/1 --all",
                "ใช้ชื่อ input/output ที่ตรงกับ signature_def",
                "ตรวจสอบ dtype และ shape ของ input tensor",
            ],
        },
        "oom_error": {
            "error": "ResourceExhaustedError: OOM when allocating tensor",
            "cause": "GPU/CPU memory ไม่พอสำหรับ model + inference batch",
            "fix": [
                "ลด batch size",
                # The model server flag is --per_process_gpu_memory_fraction;
                # there is no --tensorflow_gpu_memory_fraction flag.
                "ตั้ง per_process_gpu_memory_fraction: --per_process_gpu_memory_fraction=0.5",
                "ใช้ TF_FORCE_GPU_ALLOW_GROWTH=true",
                "ลด max_batch_size ใน batching config",
                "ใช้ model optimization (TFLite, TensorRT, quantization)",
            ],
        },
        "slow_startup": {
            "error": "Model takes too long to load (> 60 seconds)",
            "cause": "Model ใหญ่เกินไป หรือ warmup ไม่ถูกต้อง",
            "fix": [
                "เพิ่ม warmup requests: assets.extra/tf_serving_warmup_requests",
                "ใช้ --model_config_file_poll_wait_seconds สำหรับ config reload",
                "Pre-load models ก่อน serve traffic",
                "ใช้ SavedModel optimization (strip unused ops)",
            ],
        },
        "version_conflict": {
            "error": "Model version not compatible with TF Serving version",
            "cause": "Model saved ด้วย TF version ที่ไม่ compatible กับ Serving version",
            "fix": [
                "ใช้ TF Serving version ที่ตรงกับ TF training version",
                "Re-export model ด้วย compatible TF version",
                "ตรวจสอบ: saved_model_cli show --dir /model/1 --tag_set serve",
            ],
        },
    }

    def show_issues(self, max_fixes=3, max_error_len=60):
        """Print every catalogued issue with its cause and fixes.

        Args:
            max_fixes: Maximum number of fixes printed per issue. ``None``
                prints all of them. Defaults to 3.
            max_error_len: Error messages longer than this are cut and
                suffixed with ``...``. ``None`` disables truncation.
                Defaults to 60.
        """
        print("=== Common Issues ===\n")
        for issue in self.ISSUES.values():
            error = issue["error"]
            # Only add the ellipsis when something was actually cut off.
            if max_error_len is not None and len(error) > max_error_len:
                error = f"{error[:max_error_len]}..."
            print(f"[{error}]")
            print(f"  Cause: {issue['cause']}")
            print("  Fix:")
            for fix in issue["fix"][:max_fixes]:
                print(f"    • {fix}")
            print()

# Demo driver: build the catalogue and print it to stdout.
issues = CommonIssues()
issues.show_issues()

Debugging Tools & Commands

# debugging.py — Debugging TF Serving
import json

class DebuggingTools:
    """Cheat sheet of shell commands for inspecting a TF Serving deployment."""

    # Maps a section key to {"name": heading, "commands": [lines]}.
    # Entries in "commands" are printed verbatim: "#..." lines are shell
    # comments, "" is a blank separator, and a trailing backslash continues
    # the command on the next entry.
    COMMANDS = {
        "inspect_model": {
            "name": "Inspect SavedModel",
            "commands": [
                "# ดู model signature",
                "saved_model_cli show --dir /path/to/model/1 --all",
                "",
                "# ดู tag sets",
                "saved_model_cli show --dir /path/to/model/1 --tag_set serve",
                "",
                "# ทดสอบ run model",
                "saved_model_cli run --dir /path/to/model/1 --tag_set serve \\",
                "  --signature_def serving_default \\",
                "  --input_exprs 'input_1=np.ones((1,224,224,3))'",
            ],
        },
        "health_check": {
            "name": "Health Check APIs",
            "commands": [
                "# REST API health",
                "curl http://localhost:8501/v1/models/my_model",
                "",
                "# Model metadata",
                "curl http://localhost:8501/v1/models/my_model/metadata",
                "",
                "# Model status (versions)",
                "curl http://localhost:8501/v1/models/my_model/versions/1",
                "",
                "# gRPC health (grpcurl)",
                "grpcurl -plaintext localhost:8500 tensorflow.serving.PredictionService/GetModelMetadata",
            ],
        },
        "docker_debug": {
            "name": "Docker Debugging",
            "commands": [
                "# ดู logs",
                "docker logs tf-serving --tail 100",
                "",
                "# เข้า container",
                "docker exec -it tf-serving bash",
                "",
                "# ดู resource usage",
                "docker stats tf-serving",
                "",
                "# ตรวจสอบ model files ใน container",
                "docker exec tf-serving ls -la /models/my_model/",
            ],
        },
    }

    def show_commands(self, max_lines=5):
        """Print each command section under its heading.

        Args:
            max_lines: Maximum number of entries printed per section
                (comments and blank separators count). ``None`` prints the
                complete section. Defaults to 5, which is a preview only.
        """
        print("=== Debugging Commands ===\n")
        for section in self.COMMANDS.values():
            print(f"[{section['name']}]")
            for cmd in section["commands"][:max_lines]:
                print(f"  {cmd}")
            print()

# Demo driver: print the debugging command sections to stdout.
debug = DebuggingTools()
debug.show_commands()

Performance Optimization

# performance.py — TF Serving performance optimization
import json

class PerformanceOptimization:
    """Sample configuration files and tuning tips for TF Serving throughput."""

    # Text-format batching parameters, passed via --batching_parameters_file.
    BATCHING_CONFIG = """
# batching_config.txt — Request batching configuration
max_batch_size { value: 32 }
batch_timeout_micros { value: 5000 }
num_batch_threads { value: 4 }
max_enqueued_batches { value: 100 }
pad_variable_length_inputs: true
"""

    # Text-format model server config, passed via --model_config_file.
    MODEL_CONFIG = """
# model_config.txt — Multi-model serving
model_config_list {
  config {
    name: "image_classifier"
    base_path: "/models/image_classifier"
    model_platform: "tensorflow"
    model_version_policy {
      specific { versions: 1 versions: 2 }
    }
  }
  config {
    name: "text_model"
    base_path: "/models/text_model"
    model_platform: "tensorflow"
  }
}
"""

    # Compose file that wires the two configs above into the container.
    # The healthcheck probes "image_classifier" because the server is started
    # with --model_config_file, so only models named in MODEL_CONFIG exist.
    DOCKER_OPTIMIZED = """
# docker-compose.yml — Optimized TF Serving
version: '3.8'
services:
  tf-serving:
    image: tensorflow/serving:latest-gpu
    ports:
      - "8500:8500"  # gRPC
      - "8501:8501"  # REST
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      MODEL_NAME: my_model
      TF_CPP_MIN_LOG_LEVEL: "1"
      TF_FORCE_GPU_ALLOW_GROWTH: "true"
    command:
      - --model_config_file=/config/model_config.txt
      - --batching_parameters_file=/config/batching_config.txt
      - --enable_batching=true
      - --rest_api_num_threads=16
      - --tensorflow_inter_op_parallelism=4
      - --tensorflow_intra_op_parallelism=8
      - --file_system_poll_wait_seconds=30
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          devices:
            - capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8501/v1/models/image_classifier"]
      interval: 30s
      timeout: 10s
      retries: 3
"""

    # Short tuning recommendations keyed by topic.
    TUNING_TIPS = {
        "batching": "Enable batching — รวม requests เป็น batch → GPU utilization สูงขึ้น",
        "grpc": "ใช้ gRPC แทน REST — เร็วกว่า 2-5x (binary protocol, HTTP/2)",
        "quantization": "Quantize model (INT8/FP16) — เร็วขึ้น 2-4x, memory ลด 50-75%",
        "tensorrt": "ใช้ TensorRT optimization — NVIDIA GPU inference เร็วขึ้น 2-6x",
        "warmup": "Add warmup requests — ลด cold start latency",
        "threads": "ปรับ inter/intra op parallelism — ตาม CPU cores",
    }

    def show_batching(self):
        """Print the batching parameters file."""
        print("=== Batching Config ===")
        print(self.BATCHING_CONFIG)

    def show_docker(self, max_chars=500):
        """Print the docker-compose sample.

        Args:
            max_chars: Number of leading characters to print. ``None`` prints
                the complete file. Defaults to 500, which is a preview only:
                the output stops mid-file and is not valid YAML on its own.
        """
        print("=== Optimized Docker ===")
        print(self.DOCKER_OPTIMIZED[:max_chars])

    def show_tips(self):
        """Print every tuning tip as ``[key] text``."""
        print("\n=== Tuning Tips ===")
        for key, tip in self.TUNING_TIPS.items():
            print(f"  [{key}] {tip}")

# Demo driver: print the batching config and the tuning tips.
# show_docker() is not called here.
perf = PerformanceOptimization()
perf.show_batching()
perf.show_tips()

Python Client & Monitoring

# monitoring.py — TF Serving client and monitoring
import json
import random

class TFServingMonitoring:
    """Sample REST client for TF Serving plus a simulated metrics dashboard."""

    # Source of a standalone client script (needs `requests` and `numpy`).
    # It is only printed by this class, never executed.
    CLIENT_CODE = """
# tf_serving_client.py — Python client
import requests
import numpy as np
import time

class TFServingClient:
    def __init__(self, base_url='http://localhost:8501'):
        self.base_url = base_url

    def predict(self, model_name, input_data, version=None):
        url = f"{self.base_url}/v1/models/{model_name}"
        if version:
            url += f"/versions/{version}"
        url += ":predict"

        payload = {"instances": input_data.tolist()}

        start = time.perf_counter()
        try:
            resp = requests.post(url, json=payload, timeout=30)
        except requests.RequestException as e:
            # Timeouts / connection failures are reported like HTTP errors
            # so that callers (e.g. benchmark) can count them.
            return {"error": str(e), "status": None}
        latency = (time.perf_counter() - start) * 1000

        if resp.status_code != 200:
            return {"error": resp.text, "status": resp.status_code}

        return {
            "predictions": resp.json()["predictions"],
            "latency_ms": round(latency, 1),
            "status": 200,
        }

    def health(self, model_name):
        url = f"{self.base_url}/v1/models/{model_name}"
        try:
            resp = requests.get(url, timeout=5)
            return resp.json()
        except (requests.RequestException, ValueError) as e:
            return {"error": str(e)}

    def metadata(self, model_name):
        url = f"{self.base_url}/v1/models/{model_name}/metadata"
        resp = requests.get(url, timeout=5)
        return resp.json()

# Client-side health checks and latency benchmarking
class TFServingMonitor:
    def __init__(self, serving_url):
        self.client = TFServingClient(serving_url)
        self.metrics = {"requests": 0, "errors": 0, "latencies": []}

    def check_health(self, model_name):
        result = self.client.health(model_name)
        # A missing or empty status list counts as "not available".
        statuses = result.get("model_version_status") or [{}]
        return statuses[0].get("state", "UNKNOWN") == "AVAILABLE"

    def benchmark(self, model_name, input_data, n_requests=100):
        latencies = []
        errors = 0
        for _ in range(n_requests):
            result = self.client.predict(model_name, input_data)
            if "error" in result:
                errors += 1
            else:
                latencies.append(result["latency_ms"])

        stats = {
            "total": n_requests,
            "errors": errors,
            "p50": None,
            "p95": None,
            "p99": None,
            "avg": None,
        }
        # Percentiles are undefined when every request failed.
        if latencies:
            stats.update(
                p50=float(np.percentile(latencies, 50)),
                p95=float(np.percentile(latencies, 95)),
                p99=float(np.percentile(latencies, 99)),
                avg=float(np.mean(latencies)),
            )
        return stats

client = TFServingClient()
# result = client.predict("my_model", np.random.randn(1, 224, 224, 3))
"""

    def show_client(self, max_chars=600):
        """Print the sample client source.

        Args:
            max_chars: Number of leading characters to print. ``None`` prints
                the complete script. Defaults to 600, which is a preview only.
        """
        print("=== TF Serving Client ===")
        print(self.CLIENT_CODE[:max_chars])

    def dashboard(self):
        """Print a mock dashboard.

        Every value is drawn from ``random``; nothing is read from a running
        server. The output only illustrates which metrics are worth tracking.
        """
        print("\n=== Monitoring Dashboard ===")
        metrics = {
            "Requests/sec": random.randint(100, 1000),
            "Latency P50": f"{random.uniform(5, 20):.1f}ms",
            "Latency P99": f"{random.uniform(50, 200):.1f}ms",
            "Error rate": f"{random.uniform(0, 1):.2f}%",
            "GPU utilization": f"{random.randint(40, 95)}%",
            "Memory usage": f"{random.uniform(2, 8):.1f}GB",
            "Active models": random.randint(1, 5),
            "Model versions": random.randint(2, 8),
        }
        for m, v in metrics.items():
            print(f"  {m}: {v}")

# Demo driver: print the client sample, then the dashboard.
mon = TFServingMonitoring()
mon.show_client()
mon.dashboard()

Kubernetes Deployment

# k8s.py — Kubernetes deployment for TF Serving
import json


class K8sDeployment:
    """Sample Kubernetes manifests and kubectl checks for TF Serving."""

    # Two YAML documents: a Deployment running the model server and a
    # HorizontalPodAutoscaler that scales it on CPU utilization.
    DEPLOYMENT = """
# tf-serving-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tf-serving
  labels:
    app: tf-serving
spec:
  replicas: 3
  selector:
    matchLabels:
      app: tf-serving
  template:
    metadata:
      labels:
        app: tf-serving
    spec:
      containers:
        - name: tf-serving
          image: tensorflow/serving:latest
          ports:
            - containerPort: 8500
              name: grpc
            - containerPort: 8501
              name: rest
          args:
            - --model_config_file=/config/model_config.txt
            - --enable_batching=true
            - --batching_parameters_file=/config/batching_config.txt
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
            limits:
              cpu: "4"
              memory: 8Gi
          readinessProbe:
            httpGet:
              path: /v1/models/my_model
              port: 8501
            initialDelaySeconds: 30
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /v1/models/my_model
              port: 8501
            initialDelaySeconds: 60
            periodSeconds: 30
          volumeMounts:
            - name: model-volume
              mountPath: /models
            - name: config-volume
              mountPath: /config
      volumes:
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
        - name: config-volume
          configMap:
            name: tf-serving-config
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tf-serving-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tf-serving
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
"""

    def show_deployment(self, max_chars=600):
        """Print the manifest.

        Args:
            max_chars: Number of leading characters to print. ``None`` prints
                both YAML documents in full. Defaults to 600, which is a
                preview only and stops mid-manifest.
        """
        print("=== K8s Deployment ===")
        print(self.DEPLOYMENT[:max_chars])

    def troubleshoot_k8s(self):
        """Print kubectl commands for diagnosing the deployment.

        ``<pod-name>`` is a placeholder for a pod name taken from the
        ``kubectl get pods`` output.
        """
        print("\n=== K8s Troubleshooting ===")
        checks = [
            "kubectl get pods -l app=tf-serving (ดู pod status)",
            "kubectl logs <pod-name> --tail=100 (ดู logs)",
            "kubectl describe pod <pod-name> (ดู events + errors)",
            "kubectl top pods -l app=tf-serving (ดู resource usage)",
            "kubectl exec -it <pod-name> -- curl localhost:8501/v1/models/my_model (health check)",
        ]
        for check in checks:
            print(f"  $ {check}")


# Demo driver: print the manifest preview, then the kubectl checklist.
k8s = K8sDeployment()
k8s.show_deployment()
k8s.troubleshoot_k8s()

FAQ - คำถามที่พบบ่อย

Q: TF Serving กับ TorchServe อันไหนดี?

A: ตัวเลือกหลักมีดังนี้
- TF Serving: สำหรับ TensorFlow/Keras models, mature, Google-backed
- TorchServe: สำหรับ PyTorch models, AWS-backed, flexible
- Triton Inference Server: รองรับทุก framework (TF, PyTorch, ONNX) — แนะนำสำหรับ multi-framework
เลือกตาม framework ที่ใช้ — ถ้าใช้ TF เลือก TF Serving, ถ้าใช้ PyTorch เลือก TorchServe

Q: REST กับ gRPC ใช้อันไหน?

A:
- gRPC: เร็วกว่า 2-5x, binary protocol, HTTP/2, streaming — แนะนำสำหรับ production
- REST: ง่ายกว่า, debug ง่าย, ใช้ curl ได้ — เหมาะกับ development/testing
สรุป: Internal services ใช้ gRPC, External/browser ใช้ REST

Q: Model loading ช้ามาก ทำอย่างไร?

A:
1) Add warmup requests (assets.extra/tf_serving_warmup_requests)
2) ใช้ model optimization (quantization, pruning)
3) Pre-load models ก่อน route traffic
4) ใช้ PVC ที่เร็ว (SSD, NVMe) สำหรับ model storage
5) ลด model size (SavedModel strip unused ops)

Q: Monitor TF Serving อย่างไร?

A:
- Built-in: /monitoring/prometheus/metrics endpoint (ต้องเปิดใช้งานผ่าน --monitoring_config_file ก่อน)
- Prometheus: scrape metrics → Grafana dashboard
- Key metrics: request_count, request_latency, model_load_latency, batch_size
- Alerting: latency P99 > threshold, error rate > 1%, GPU memory > 90%