TensorFlow Serving Troubleshooting — รวมปัญหาที่พบบ่อยและวิธีแก้ไข
TensorFlow Serving เป็น production-grade serving system สำหรับ deploy machine learning models ที่ Google พัฒนา รองรับ gRPC และ REST API สำหรับ inference requests ในการใช้งานจริงมักเจอปัญหาหลายอย่าง เช่น model loading errors, performance bottlenecks, memory leaks, version conflicts และ scaling issues บทความนี้รวบรวมปัญหาที่พบบ่อยของ TensorFlow Serving พร้อมวิธีแก้ไข debugging techniques และ monitoring best practices
ปัญหาที่พบบ่อยและวิธีแก้
# common_issues.py — Common TF Serving issues
import json
class CommonIssues:
    """Reference catalog of frequent TensorFlow Serving failures.

    Each ISSUES entry maps a short key to the serving error signature,
    its root cause, and an ordered list of suggested fixes.
    """

    ISSUES = {
        "model_not_found": {
            "error": "NOT_FOUND: Could not find model",
            "cause": "Model path ผิด หรือ model format ไม่ถูกต้อง",
            "fix": [
                "ตรวจสอบ model path: /models/my_model/1/saved_model.pb",
                "Directory structure: /models/{model_name}/{version}/saved_model.pb",
                "ตรวจสอบ --model_base_path flag ตรงกับ path จริง",
                "version directory ต้องเป็นตัวเลข (1, 2, 3...)",
            ],
        },
        "signature_mismatch": {
            "error": "INVALID_ARGUMENT: input tensor alias not found in signature",
            "cause": "Input/output tensor names ไม่ตรงกับ model signature",
            "fix": [
                "ตรวจสอบ signature: saved_model_cli show --dir /model/1 --all",
                "ใช้ชื่อ input/output ที่ตรงกับ signature_def",
                "ตรวจสอบ dtype และ shape ของ input tensor",
            ],
        },
        "oom_error": {
            "error": "ResourceExhaustedError: OOM when allocating tensor",
            "cause": "GPU/CPU memory ไม่พอสำหรับ model + inference batch",
            "fix": [
                "ลด batch size",
                "ตั้ง per_process_gpu_memory_fraction: --tensorflow_gpu_memory_fraction=0.5",
                "ใช้ TF_FORCE_GPU_ALLOW_GROWTH=true",
                "ลด max_batch_size ใน batching config",
                "ใช้ model optimization (TFLite, TensorRT, quantization)",
            ],
        },
        "slow_startup": {
            "error": "Model takes too long to load (> 60 seconds)",
            "cause": "Model ใหญ่เกินไป หรือ warmup ไม่ถูกต้อง",
            "fix": [
                "เพิ่ม warmup requests: assets.extra/tf_serving_warmup_requests",
                "ใช้ --model_config_file_poll_wait_seconds สำหรับ config reload",
                "Pre-load models ก่อน serve traffic",
                "ใช้ SavedModel optimization (strip unused ops)",
            ],
        },
        "version_conflict": {
            "error": "Model version not compatible with TF Serving version",
            "cause": "Model saved ด้วย TF version ที่ไม่ compatible กับ Serving version",
            "fix": [
                "ใช้ TF Serving version ที่ตรงกับ TF training version",
                "Re-export model ด้วย compatible TF version",
                "ตรวจสอบ: saved_model_cli show --dir /model/1 --tag_set serve",
            ],
        },
    }

    def show_issues(self):
        """Print the error signature, cause, and first three fixes per issue."""
        print("=== Common Issues ===\n")
        # The dict keys are only lookup handles; iterate the payloads directly.
        for details in self.ISSUES.values():
            print(f"[{details['error'][:60]}...]")
            print(f" Cause: {details['cause']}")
            print(" Fix:")
            for step in details["fix"][:3]:
                print(f" • {step}")
            print()
# Demo: dump the troubleshooting catalog when the script is executed.
issues = CommonIssues()
issues.show_issues()
Debugging Tools & Commands
# debugging.py — Debugging TF Serving
import json
class DebuggingTools:
    """Shell-command cheat sheet for inspecting and debugging TF Serving."""

    COMMANDS = {
        "inspect_model": {
            "name": "Inspect SavedModel",
            "commands": [
                "# ดู model signature",
                "saved_model_cli show --dir /path/to/model/1 --all",
                "",
                "# ดู tag sets",
                "saved_model_cli show --dir /path/to/model/1 --tag_set serve",
                "",
                "# ทดสอบ run model",
                "saved_model_cli run --dir /path/to/model/1 --tag_set serve \\",
                " --signature_def serving_default \\",
                " --input_exprs 'input_1=np.ones((1,224,224,3))'",
            ],
        },
        "health_check": {
            "name": "Health Check APIs",
            "commands": [
                "# REST API health",
                "curl http://localhost:8501/v1/models/my_model",
                "",
                "# Model metadata",
                "curl http://localhost:8501/v1/models/my_model/metadata",
                "",
                "# Model status (versions)",
                "curl http://localhost:8501/v1/models/my_model/versions/1",
                "",
                "# gRPC health (grpcurl)",
                "grpcurl -plaintext localhost:8500 tensorflow.serving.PredictionService/GetModelMetadata",
            ],
        },
        "docker_debug": {
            "name": "Docker Debugging",
            "commands": [
                "# ดู logs",
                "docker logs tf-serving --tail 100",
                "",
                "# เข้า container",
                "docker exec -it tf-serving bash",
                "",
                "# ดู resource usage",
                "docker stats tf-serving",
                "",
                "# ตรวจสอบ model files ใน container",
                "docker exec tf-serving ls -la /models/my_model/",
            ],
        },
    }

    def show_commands(self):
        """Print each section header followed by up to five of its commands."""
        print("=== Debugging Commands ===\n")
        # Only the section payloads are needed, not the dict keys.
        for section in self.COMMANDS.values():
            print(f"[{section['name']}]")
            for shell_line in section["commands"][:5]:
                print(f" {shell_line}")
            print()
# Demo: dump the debugging command cheat sheet.
debug = DebuggingTools()
debug.show_commands()
Performance Optimization
# performance.py — TF Serving performance optimization
import json
class PerformanceOptimization:
    """Config snippets and tuning tips for TF Serving throughput/latency."""

    # Server-side request batching parameters (--batching_parameters_file).
    BATCHING_CONFIG = """
# batching_config.txt — Request batching configuration
max_batch_size { value: 32 }
batch_timeout_micros { value: 5000 }
num_batch_threads { value: 4 }
max_enqueued_batches { value: 100 }
pad_variable_length_inputs: true
"""

    # Multi-model serving configuration (--model_config_file).
    MODEL_CONFIG = """
# model_config.txt — Multi-model serving
model_config_list {
config {
name: "image_classifier"
base_path: "/models/image_classifier"
model_platform: "tensorflow"
model_version_policy {
specific { versions: 1 versions: 2 }
}
}
config {
name: "text_model"
base_path: "/models/text_model"
model_platform: "tensorflow"
}
}
"""

    # docker-compose service definition with GPU + batching flags enabled.
    DOCKER_OPTIMIZED = """
# docker-compose.yml — Optimized TF Serving
version: '3.8'
services:
tf-serving:
image: tensorflow/serving:latest-gpu
ports:
- "8500:8500" # gRPC
- "8501:8501" # REST
volumes:
- ./models:/models
- ./config:/config
environment:
MODEL_NAME: my_model
TF_CPP_MIN_LOG_LEVEL: "1"
TF_FORCE_GPU_ALLOW_GROWTH: "true"
command:
- --model_config_file=/config/model_config.txt
- --batching_parameters_file=/config/batching_config.txt
- --enable_batching=true
- --rest_api_num_threads=16
- --tensorflow_inter_op_parallelism=4
- --tensorflow_intra_op_parallelism=8
- --file_system_poll_wait_seconds=30
deploy:
resources:
limits:
cpus: '4'
memory: 8G
reservations:
devices:
- capabilities: [gpu]
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8501/v1/models/my_model"]
interval: 30s
timeout: 10s
retries: 3
"""

    # One-line tuning recommendations keyed by topic.
    TUNING_TIPS = {
        "batching": "Enable batching — รวม requests เป็น batch → GPU utilization สูงขึ้น",
        "grpc": "ใช้ gRPC แทน REST — เร็วกว่า 2-5x (binary protocol, HTTP/2)",
        "quantization": "Quantize model (INT8/FP16) — เร็วขึ้น 2-4x, memory ลด 50-75%",
        "tensorrt": "ใช้ TensorRT optimization — NVIDIA GPU inference เร็วขึ้น 2-6x",
        "warmup": "Add warmup requests — ลด cold start latency",
        "threads": "ปรับ inter/intra op parallelism — ตาม CPU cores",
    }

    def show_batching(self):
        """Print the request-batching configuration snippet."""
        print("=== Batching Config ===")
        print(self.BATCHING_CONFIG)

    def show_docker(self):
        """Print a 500-character preview of the docker-compose snippet."""
        print("=== Optimized Docker ===")
        print(self.DOCKER_OPTIMIZED[:500])

    def show_tips(self):
        """Print each tuning tip, keyed by its topic."""
        print("\n=== Tuning Tips ===")
        for topic, advice in self.TUNING_TIPS.items():
            print(f" [{topic}] {advice}")
# Demo: print the batching config followed by the tuning tips.
perf = PerformanceOptimization()
perf.show_batching()
perf.show_tips()
Python Client & Monitoring
# monitoring.py — TF Serving client and monitoring
import json
import random
class TFServingMonitoring:
    """Sample Python client snippet plus a simulated monitoring dashboard."""

    # Reference client implementation, stored as text for display only.
    CLIENT_CODE = """
# tf_serving_client.py — Python client
import requests
import numpy as np
import time
class TFServingClient:
def __init__(self, base_url='http://localhost:8501'):
self.base_url = base_url
def predict(self, model_name, input_data, version=None):
url = f"{self.base_url}/v1/models/{model_name}"
if version:
url += f"/versions/{version}"
url += ":predict"
payload = {"instances": input_data.tolist()}
start = time.time()
resp = requests.post(url, json=payload, timeout=30)
latency = (time.time() - start) * 1000
if resp.status_code != 200:
return {"error": resp.text, "status": resp.status_code}
return {
"predictions": resp.json()["predictions"],
"latency_ms": round(latency, 1),
"status": 200,
}
def health(self, model_name):
url = f"{self.base_url}/v1/models/{model_name}"
try:
resp = requests.get(url, timeout=5)
return resp.json()
except Exception as e:
return {"error": str(e)}
def metadata(self, model_name):
url = f"{self.base_url}/v1/models/{model_name}/metadata"
resp = requests.get(url, timeout=5)
return resp.json()
# Monitoring with Prometheus metrics
class TFServingMonitor:
def __init__(self, serving_url):
self.client = TFServingClient(serving_url)
self.metrics = {"requests": 0, "errors": 0, "latencies": []}
def check_health(self, model_name):
result = self.client.health(model_name)
state = result.get("model_version_status", [{}])[0].get("state", "UNKNOWN")
return state == "AVAILABLE"
def benchmark(self, model_name, input_data, n_requests=100):
latencies = []
errors = 0
for _ in range(n_requests):
result = self.client.predict(model_name, input_data)
if "error" in result:
errors += 1
else:
latencies.append(result["latency_ms"])
return {
"total": n_requests,
"errors": errors,
"p50": np.percentile(latencies, 50),
"p95": np.percentile(latencies, 95),
"p99": np.percentile(latencies, 99),
"avg": np.mean(latencies),
}
client = TFServingClient()
# result = client.predict("my_model", np.random.randn(1, 224, 224, 3))
"""

    def show_client(self):
        """Print a 600-character preview of the sample client code."""
        print("=== TF Serving Client ===")
        print(self.CLIENT_CODE[:600])

    def dashboard(self):
        """Print a simulated metrics snapshot (demo values, randomized)."""
        print("\n=== Monitoring Dashboard ===")
        snapshot = {
            "Requests/sec": random.randint(100, 1000),
            "Latency P50": f"{random.uniform(5, 20):.1f}ms",
            "Latency P99": f"{random.uniform(50, 200):.1f}ms",
            "Error rate": f"{random.uniform(0, 1):.2f}%",
            "GPU utilization": f"{random.randint(40, 95)}%",
            "Memory usage": f"{random.uniform(2, 8):.1f}GB",
            "Active models": random.randint(1, 5),
            "Model versions": random.randint(2, 8),
        }
        for label, value in snapshot.items():
            print(f" {label}: {value}")
# Demo: show the sample client, then a simulated dashboard snapshot.
mon = TFServingMonitoring()
mon.show_client()
mon.dashboard()
Kubernetes Deployment
# k8s.py — Kubernetes deployment for TF Serving
import json
class K8sDeployment:
    """Kubernetes manifest snippet and kubectl checklist for TF Serving."""

    # Deployment + HPA manifest, stored as text for display only.
    DEPLOYMENT = """
# tf-serving-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: tf-serving
labels:
app: tf-serving
spec:
replicas: 3
selector:
matchLabels:
app: tf-serving
template:
metadata:
labels:
app: tf-serving
spec:
containers:
- name: tf-serving
image: tensorflow/serving:latest
ports:
- containerPort: 8500
name: grpc
- containerPort: 8501
name: rest
args:
- --model_config_file=/config/model_config.txt
- --enable_batching=true
- --batching_parameters_file=/config/batching_config.txt
resources:
requests:
cpu: "2"
memory: 4Gi
limits:
cpu: "4"
memory: 8Gi
readinessProbe:
httpGet:
path: /v1/models/my_model
port: 8501
initialDelaySeconds: 30
periodSeconds: 10
livenessProbe:
httpGet:
path: /v1/models/my_model
port: 8501
initialDelaySeconds: 60
periodSeconds: 30
volumeMounts:
- name: model-volume
mountPath: /models
- name: config-volume
mountPath: /config
volumes:
- name: model-volume
persistentVolumeClaim:
claimName: model-pvc
- name: config-volume
configMap:
name: tf-serving-config
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: tf-serving-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: tf-serving
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
"""

    def show_deployment(self):
        """Print a 600-character preview of the deployment manifest."""
        print("=== K8s Deployment ===")
        print(self.DEPLOYMENT[:600])

    def troubleshoot_k8s(self):
        """Print a kubectl checklist for diagnosing serving pods."""
        print("\n=== K8s Troubleshooting ===")
        steps = [
            "kubectl get pods -l app=tf-serving (ดู pod status)",
            "kubectl logs --tail=100 (ดู logs)",
            "kubectl describe pod (ดู events + errors)",
            "kubectl top pods -l app=tf-serving (ดู resource usage)",
            "kubectl exec -it -- curl localhost:8501/v1/models/my_model (health check)",
        ]
        for step in steps:
            print(f" $ {step}")
# Demo: print the manifest preview and the kubectl troubleshooting steps.
k8s = K8sDeployment()
k8s.show_deployment()
k8s.troubleshoot_k8s()
FAQ - คำถามที่พบบ่อย
Q: TF Serving กับ TorchServe อันไหนดี?
A: TF Serving: สำหรับ TensorFlow/Keras models, mature, Google-backed · TorchServe: สำหรับ PyTorch models, AWS-backed, flexible · Triton Inference Server: รองรับทุก framework (TF, PyTorch, ONNX) — แนะนำสำหรับงาน multi-framework · สรุป: เลือกตาม framework ที่ใช้ — ถ้าใช้ TF เลือก TF Serving, ถ้าใช้ PyTorch เลือก TorchServe
Q: REST กับ gRPC ใช้อันไหน?
A: gRPC: เร็วกว่า 2-5x, binary protocol, HTTP/2, รองรับ streaming — แนะนำสำหรับ production · REST: ง่ายกว่า, debug ง่าย, ใช้ curl ได้ — เหมาะกับ development/testing · สรุป: internal services ใช้ gRPC ส่วน external/browser ใช้ REST
Q: Model loading ช้ามาก ทำอย่างไร?
A: 1) Add warmup requests (assets.extra/tf_serving_warmup_requests) 2) ใช้ model optimization (quantization, pruning) 3) Pre-load models ก่อน route traffic 4) ใช้ PVC ที่เร็ว (SSD, NVMe) สำหรับ model storage 5) ลด model size (SavedModel strip unused ops)
Q: Monitor TF Serving อย่างไร?
A: Built-in: endpoint /monitoring/prometheus/metrics · Prometheus: scrape metrics แล้วแสดงผลผ่าน Grafana dashboard · Key metrics: request_count, request_latency, model_load_latency, batch_size · Alerting: latency P99 > threshold, error rate > 1%, GPU memory > 90%
อ่านเพิ่มเติม: สอนเทรด Forex | XM Signal | IT Hardware | อาชีพ IT
