SiamCafe · Blog
TensorFlow Serving Edge Computing — การ Deploy
บทความ

TensorFlow Serving Edge Computing — การ Deploy

เผยแพร่ 28 พฤษภาคม 2569

TensorFlow Serving Edge Computing

TensorFlow Serving Production ML Serving gRPC REST API Model Versioning Batching GPU Edge Computing ประมวลผลใกล้แหล่งข้อมูล ลด Latency IoT CCTV หุ่นยนต์ โรงงาน

| Platform | Target | Performance | Use Case |
|---|---|---|---|
| TF Serving | Server/Cloud | สูงมาก (GPU) | Production API |
| TF Lite | Mobile/Edge | ปานกลาง | Android iOS RPi |
| TF.js | Browser/Node | ปานกลาง | Web Application |
| TF Micro | Microcontroller | ต่ำ | TinyML IoT Sensor |
| NVIDIA Triton | Server/Edge | สูงมาก | Multi-Model Serving |

TensorFlow Serving Setup

=== TensorFlow Serving Setup ===

อ่านเพิ่ม: Docker Compose ตัวอย่าง Config สำหรับ Self-hosted Apps · อ่านเพิ่ม: Helm Chart Template IoT Gateway — คู่มือฉบับสมบูรณ์ 2026 | S · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog

1. Docker (แนะนำ)

docker pull tensorflow/serving:latest-gpu

# Publish the gRPC port (8500) and the REST port (8501), and bind-mount the
# model directory into the container.
# - The --mount value must be ONE comma-separated token with no spaces;
#   otherwise the shell splits it into separate arguments.
# - --gpus all exposes the host GPUs to the container (needed by the -gpu image).
# - Continuation lines must directly follow the trailing backslash.
docker run --gpus all -p 8501:8501 -p 8500:8500 \
  --mount type=bind,source=/models/my_model,target=/models/my_model \
  -e MODEL_NAME=my_model \
  -t tensorflow/serving:latest-gpu

2. SavedModel Format

import tensorflow as tf

# Build MobileNetV2 with the 'imagenet' weights.
model = tf.keras.applications.MobileNetV2(weights='imagenet')

# Export as a SavedModel into the "1" subdirectory (version 1) of the model's base path.
tf.saved_model.save(model, '/models/my_model/1/')

# Directory structure:
# /models/my_model/
#   1/                  (version 1)
#     saved_model.pb
#     variables/
#   2/                  (version 2)
#     saved_model.pb
#     variables/

3. REST API

# Query the model endpoint on the REST port.
curl http://localhost:8501/v1/models/my_model

# Send a prediction request. The URL must sit on the line directly after the
# trailing backslash, otherwise the shell treats it as a separate command.
curl -d '{"instances": [[1.0, 2.0, 3.0]]}' \
  http://localhost:8501/v1/models/my_model:predict

4. gRPC (faster)

pip install tensorflow-serving-api

import grpc

from tensorflow_serving.apis import predict_pb2

from tensorflow_serving.apis import prediction_service_pb2_grpc

# Open an insecure (non-TLS) gRPC channel to localhost port 8500.
channel = grpc.insecure_channel('localhost:8500')

# Client stub for the PredictionService, bound to that channel.
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

# Create an empty PredictRequest and set the target model name.
request = predict_pb2.PredictRequest()

request.model_spec.name = 'my_model'
# NOTE(review): the snippet stops here; no inputs are populated and the
# request is never sent through the stub.

5. Model Config (multiple models)

model_config.config:

# Two models served side by side, each from its own base path.
model_config_list {
  config {
    name: 'classifier'
    base_path: '/models/classifier'
    model_platform: 'tensorflow'
  }
  config {
    name: 'detector'
    base_path: '/models/detector'
    model_platform: 'tensorflow'
  }
}

docker-compose.yml

version: '3.8'

services:
  tf-serving:
    image: tensorflow/serving:latest-gpu
    ports:
      - "8500:8500"   # gRPC
      - "8501:8501"   # REST
    volumes:
      - ./models:/models
      # The model config file is mounted at /config and passed to the server below.
      - ./model_config.config:/config
    command: --model_config_file=/config
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]

from dataclasses import dataclass

from typing import List, Dict

@dataclass
class ServingConfig:
    """Settings describing one model in the serving summary.

    Fields are positional in the order declared below.
    """

    model_name: str  # name printed for the model
    version: int     # version number, printed as "v<version>"
    platform: str    # platform label, e.g. "tensorflow"
    batch_size: int  # printed as "batch=<batch_size>"
    gpu: bool        # True is reported as "GPU", False as "CPU"

# Example inventory: (model_name, version, platform, batch_size, gpu).
configs = [
    ServingConfig("image_classifier", 3, "tensorflow", 32, True),
    ServingConfig("object_detector", 2, "tensorflow", 16, True),
    ServingConfig("text_classifier", 1, "tensorflow", 64, False),
]

# Print one summary line per configured model.
print("TensorFlow Serving Models:")
for c in configs:
    gpu_str = "GPU" if c.gpu else "CPU"
    print(f" {c.model_name} v{c.version} | {c.platform} | "
          f"batch={c.batch_size} | {gpu_str}")

TensorFlow Lite Edge

=== TensorFlow Lite for Edge Devices ===

1. Convert Model to TFLite

import tensorflow as tf

model = tf.keras.applications.MobileNetV2(weights='imagenet')


def representative_data_gen():
    """Yield calibration batches for post-training INT8 quantization.

    Placeholder so the snippet runs end to end: random tensors only let the
    converter calibrate *something*. Replace with around 100 real,
    preprocessed input samples to get usable accuracy.
    """
    for _ in range(100):
        # One batch of 224x224 3-channel input, matching the 224x224 resize
        # used by the inference example.
        yield [tf.random.uniform((1, 224, 224, 3), dtype=tf.float32)]


# Standard (float) conversion
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

# Quantized (INT8) — about 4x smaller and faster
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_INT8
]
# Model inputs and outputs become uint8 as well.
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8
quantized_model = converter.convert()
with open('model_quant.tflite', 'wb') as f:
    f.write(quantized_model)

2. Run on Raspberry Pi

pip install tflite-runtime

import tflite_runtime.interpreter as tflite

import numpy as np

from PIL import Image

# Load the TFLite model file and allocate its tensors before use.
interpreter = tflite.Interpreter(model_path='model.tflite')

interpreter.allocate_tensors()

# Input/output tensor details; their 'index' entries are used below.
input_details = interpreter.get_input_details()

output_details = interpreter.get_output_details()

# Resize the image to 224x224, convert to float32 divided by 255.0,
# and add a leading axis of size 1.
# NOTE(review): assumes test.jpg decodes to the channel layout the model expects — confirm.
img = Image.open('test.jpg').resize((224, 224))

input_data = np.expand_dims(np.array(img, dtype=np.float32) / 255.0, 0)

# Set the first input tensor, run the interpreter, then read the first output tensor.
interpreter.set_tensor(input_details[0]['index'], input_data)

interpreter.invoke()

output = interpreter.get_tensor(output_details[0]['index'])

3. Run with Coral Edge TPU

pip install pycoral

from pycoral.utils.edgetpu import make_interpreter

# Create an interpreter for 'model_edgetpu.tflite' through pycoral's helper,
# then allocate its tensors.
# NOTE(review): presumably the file was produced by the Edge TPU Compiler — confirm.
interpreter = make_interpreter('model_edgetpu.tflite')

interpreter.allocate_tensors()

# 10x faster than CPU on Raspberry Pi

4. NVIDIA Jetson

pip install jetson-inference

import jetson.inference
import jetson.utils  # videoSource below is accessed as jetson.utils.videoSource

# Detection network with a 0.5 confidence threshold, reading from /dev/video0.
net = jetson.inference.detectNet("ssd-mobilenet-v2", threshold=0.5)
camera = jetson.utils.videoSource("/dev/video0")

# Capture a frame and run detection on it, repeatedly; the loop has no exit
# condition and runs until the process is interrupted.
while True:
    img = camera.Capture()
    detections = net.Detect(img)

# Comparison of edge hardware; every entry carries the same five keys.
edge_devices = {
    "Raspberry Pi 5": {
        "cpu": "ARM Cortex-A76 2.4GHz",
        "ram": "4-8GB",
        "price": "$60-80",
        "tflite_fps": "5-15 FPS",
        "use": "Prototype, Camera, Sensor",
    },
    "Coral Edge TPU": {
        "cpu": "Edge TPU Coprocessor",
        "ram": "N/A (USB Accelerator)",
        "price": "$60",
        "tflite_fps": "50-100 FPS",
        "use": "TFLite Acceleration",
    },
    "NVIDIA Jetson Orin Nano": {
        "cpu": "ARM A78AE + 1024 CUDA",
        "ram": "8GB",
        "price": "$500",
        "tflite_fps": "100+ FPS",
        "use": "AI Robot, Drone, CCTV",
    },
    "Intel NUC": {
        "cpu": "Intel i5/i7",
        "ram": "8-32GB",
        "price": "$300-600",
        "tflite_fps": "30-60 FPS",
        "use": "Edge Server, Gateway",
    },
}

# Print a bracketed heading per device followed by its "key: value" lines.
print("\nEdge Devices for TensorFlow:")
for device, info in edge_devices.items():
    print(f"\n [{device}]")
    for k, v in info.items():
        print(f" {k}: {v}")

Kubernetes Edge

# === Kubernetes Edge Deployment ===

# K3s — Lightweight Kubernetes for Edge
# curl -sfL https://get.k3s.io | sh -
# kubectl get nodes

# Edge Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: edge-inference
# spec:
#   replicas: 2
#   selector:
#     matchLabels:
#       app: edge-inference
#   template:
#     metadata:
#       labels:
#         app: edge-inference
#     spec:
#       containers:
#       - name: tflite-server
#         image: edge-inference:latest
#         ports:
#         - containerPort: 8080
#         resources:
#           limits:
#             cpu: "2"
#             memory: "2Gi"
#           requests:
#             cpu: "1"
#             memory: "1Gi"
#         volumeMounts:
#         - name: models
#           mountPath: /models
#       volumes:
#       - name: models
#         hostPath:
#           path: /opt/models

# KubeEdge — Kubernetes for Edge Computing
# keadm init --advertise-address=10.0.0.1
# keadm join --cloudcore-ipport=10.0.0.1:10000

# Deployment topologies, each described by the same four attributes
# (desc, latency, bandwidth, tools).
edge_architectures = {
    "Cloud-Edge": dict(
        desc="Train บน Cloud, Inference บน Edge",
        latency="10-50ms",
        bandwidth="ต่ำ (ส่งแค่ผลลัพธ์)",
        tools="TF Serving (Cloud) + TF Lite (Edge)",
    ),
    "Edge-Only": dict(
        desc="ทำทุกอย่างบน Edge ไม่ต้อง Cloud",
        latency="1-10ms",
        bandwidth="ไม่ใช้",
        tools="TF Lite + Coral TPU / Jetson",
    ),
    "Federated": dict(
        desc="Train บน Edge ส่ง Gradient กลับ Cloud",
        latency="10-50ms",
        bandwidth="ปานกลาง (ส่ง Gradient)",
        tools="TF Federated + TF Lite",
    ),
}

# Report: a title line, then for every architecture a bracketed heading
# followed by its indented "key: value" attributes.
print("Edge Computing Architectures:")
for arch_name, attributes in edge_architectures.items():
    heading = f"\n  [{arch_name}]"
    print(heading)
    for field, value in attributes.items():
        print(f"    {field}: {value}")

# Ordered steps that take a cloud-trained model to an edge device.
optimization = [
    "1. Train Full Model (Cloud GPU)",
    "2. Prune — ตัด Neurons ที่ไม่สำคัญ (ลด 50%)",
    "3. Quantize — INT8 แทน FP32 (ลด 4x)",
    "4. Convert — SavedModel to TFLite",
    "5. Compile — Edge TPU Compiler (ถ้าใช้ Coral)",
    "6. Benchmark — วัด Latency/Accuracy บน Edge",
    "7. Deploy — OTA Update ไป Edge Device",
]

# Title preceded by two blank lines, then each step indented by two spaces.
print("\n\nModel Optimization Pipeline:")
for pipeline_step in optimization:
    print("  " + pipeline_step)

เคล็ดลับ

  • MobileNet: ใช้ MobileNetV2/V3 สำหรับ Edge เร็วและเล็ก
  • Quantization: INT8 ลดขนาด 4x เร็วขึ้น 2-3x Accuracy ลดน้อยมาก
  • Coral TPU: เร็วกว่า CPU 10x สำหรับ TFLite Model
  • K3s: ใช้ K3s แทน K8s บน Edge Device ใช้ RAM น้อยกว่า
  • OTA Update: อัปเดต Model ผ่าน Network ไม่ต้องไปเปลี่ยนที่อุปกรณ์

TensorFlow Serving คืออะไร

Production ML Serving Google gRPC REST API Model Versioning Batching GPU Acceleration Production Scale