TensorFlow Serving Edge Computing

TensorFlow Serving เป็นระบบ Production ML Serving ที่รองรับ gRPC และ REST API, Model Versioning, Batching และ GPU ส่วน Edge Computing คือการประมวลผลใกล้แหล่งข้อมูลเพื่อลด Latency เหมาะกับงาน IoT, CCTV, หุ่นยนต์ และโรงงาน

Platform	Target	Performance	Use Case
TF Serving	Server/Cloud	สูงมาก (GPU)	Production API
TF Lite	Mobile/Edge	ปานกลาง	Android iOS RPi
TF.js	Browser/Node	ปานกลาง	Web Application
TF Micro	Microcontroller	ต่ำ	TinyML IoT Sensor
NVIDIA Triton	Server/Edge	สูงมาก	Multi-Model Serving

TensorFlow Serving Setup

=== TensorFlow Serving Setup ===

อ่านเพิ่ม: Docker Compose ตัวอย่าง Config สำหรับ Self-hosted Apps · อ่านเพิ่ม: Helm Chart Template IoT Gateway — คู่มือฉบับสมบูรณ์ 2026 | S · อ่านเพิ่ม: LLM Inference vLLM Pub Sub Architecture | SiamCafe Blog

1. Docker (แนะนำ)

# Pull the GPU build of the TensorFlow Serving image.
docker pull tensorflow/serving:latest-gpu

# Start the server, publishing port 8501 (used by the REST calls in this guide)
# and port 8500 (used by the gRPC client in this guide).
# The --mount value must be a single comma-separated token with no spaces,
# and the continuation lines must follow each backslash directly.
docker run -p 8501:8501 -p 8500:8500 \
  --mount type=bind,source=/models/my_model,target=/models/my_model \
  -e MODEL_NAME=my_model \
  -t tensorflow/serving:latest-gpu

2. SavedModel Format

import tensorflow as tf

# Build MobileNetV2 with ImageNet weights and export it with
# tf.saved_model.save into the "1" sub-directory of /models/my_model
# (shown as "version 1" in the layout below).
model = tf.keras.applications.MobileNetV2(weights='imagenet')

tf.saved_model.save(model, '/models/my_model/1/')

# Directory structure:
# /models/my_model/
#   1/                  (version 1)
#     saved_model.pb
#     variables/
#   2/                  (version 2)
#     saved_model.pb
#     variables/

3. REST API

# GET the model resource for my_model on the REST port.
curl http://localhost:8501/v1/models/my_model

# POST one instance to the :predict endpoint (curl -d implies POST).
# The URL must directly follow the backslash continuation.
curl -d '{"instances": [[1.0, 2.0, 3.0]]}' \
  http://localhost:8501/v1/models/my_model:predict

4. gRPC (faster)

pip install tensorflow-serving-api

import grpc

from tensorflow_serving.apis import predict_pb2

from tensorflow_serving.apis import prediction_service_pb2_grpc

# Open a gRPC channel to localhost:8500 without channel credentials
# (grpc.insecure_channel). Port 8500 is one of the two ports published by
# the docker run command in the setup step.
channel = grpc.insecure_channel('localhost:8500')

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ คู่มือแก้ปัญหา Docusaurus Documentation ทีละขั้นตอน

# Create the PredictionService client stub bound to the channel.
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

# Start a PredictRequest addressed to the model named 'my_model'.
# NOTE(review): the snippet ends here - no input tensors are set and the
# request is never sent through the stub; presumably the rest was truncated.
request = predict_pb2.PredictRequest()

request.model_spec.name = 'my_model'

5. Model Config (multiple models)

model_config.config:

# Multi-model configuration passed to the server via --model_config_file.
# NOTE(review): the name values below are derived from each base_path because
# the original snippet had no name fields; set them to the model names that
# clients will request.
model_config_list {
  config {
    name: 'classifier'
    base_path: '/models/classifier'
    model_platform: 'tensorflow'
  }
  config {
    name: 'detector'
    base_path: '/models/detector'
    model_platform: 'tensorflow'
  }
}

docker-compose.yml

version: '3.8'

services:
  tf-serving:
    image: tensorflow/serving:latest-gpu
    # Host:container port mappings (8500 and 8501).
    ports:
      - "8500:8500"
      - "8501:8501"
    # Mount the model directory and the multi-model config file.
    volumes:
      - ./models:/models
      - ./model_config.config:/config
    # Point the server at the mounted config file.
    command: --model_config_file=/config
    # Reserve an NVIDIA GPU device for the container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]

from dataclasses import dataclass

from typing import List, Dict

@dataclass
class ServingConfig:
    """Describe one served model entry shown in the serving summary.

    Fields are positional in this order: model_name, version, platform,
    batch_size, gpu.
    """

    model_name: str  # name printed for the model
    version: int  # printed as "v<version>"
    platform: str  # platform label, e.g. "tensorflow"
    batch_size: int  # printed as "batch=<batch_size>"
    gpu: bool  # True is reported as "GPU", False as "CPU"

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง จอมอนิเตอร์ 24 นิ้ว

# Example entries: (model_name, version, platform, batch_size, gpu).
configs = [
    ServingConfig("image_classifier", 3, "tensorflow", 32, True),
    ServingConfig("object_detector", 2, "tensorflow", 16, True),
    ServingConfig("text_classifier", 1, "tensorflow", 64, False),
]

# Print a one-line summary per model; the gpu flag is rendered as GPU/CPU.
print("TensorFlow Serving Models:")
for c in configs:
    gpu_str = "GPU" if c.gpu else "CPU"
    print(f" {c.model_name} v{c.version} | {c.platform} | "
          f"batch={c.batch_size} | {gpu_str}")

TensorFlow Lite Edge

=== TensorFlow Lite for Edge Devices ===

1. Convert Model to TFLite

import tensorflow as tf

# Build MobileNetV2 with ImageNet weights as the model to convert.
model = tf.keras.applications.MobileNetV2(weights='imagenet')

# Standard conversion: create a converter from the Keras model and convert
# it before any optimization options are set on the converter.
converter = tf.lite.TFLiteConverter.from_keras_model(model)

tflite_model = converter.convert()

แนะนำเพิ่มเติม — ระบบเทรดของ iCafeForex

# Write the standard conversion result to disk in binary mode.
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

# Quantized (INT8): about 4x smaller, and faster.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# NOTE(review): representative_data_gen is not defined anywhere in this
# snippet; it must be supplied before this line runs or it raises NameError.
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_INT8
]
# Request uint8 tensors at the model's input and output.
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8
quantized_model = converter.convert()

# Write the quantized result to a separate file.
with open('model_quant.tflite', 'wb') as f:
    f.write(quantized_model)

2. Run on Raspberry Pi

pip install tflite-runtime

import tflite_runtime.interpreter as tflite

เนื้อหาเกี่ยวข้อง — อ่านต่อ: CSS Container Queries Real-time Processing

import numpy as np

from PIL import Image

# Load the TFLite model file and allocate its tensors before use.
interpreter = tflite.Interpreter(model_path='model.tflite')

interpreter.allocate_tensors()

# Tensor metadata; only the 'index' entry of the first input and the first
# output is used below.
input_details = interpreter.get_input_details()

output_details = interpreter.get_output_details()

# Resize the image to 224x224, convert to float32 scaled by 1/255, and
# insert a new leading axis (axis 0).
# NOTE(review): assumes test.jpg decodes to the channel layout the model
# expects - confirm.
img = Image.open('test.jpg').resize((224, 224))

input_data = np.expand_dims(np.array(img, dtype=np.float32) / 255.0, 0)

# Feed the input tensor, run inference, and read back the first output tensor.
interpreter.set_tensor(input_details[0]['index'], input_data)

interpreter.invoke()

output = interpreter.get_tensor(output_details[0]['index'])

3. Run with Coral Edge TPU

pip install pycoral

from pycoral.utils.edgetpu import make_interpreter

# Create an interpreter for 'model_edgetpu.tflite' and allocate its tensors.
# NOTE(review): presumably this file is the output of the Edge TPU Compiler
# step listed in the optimization pipeline - confirm.
interpreter = make_interpreter('model_edgetpu.tflite')

interpreter.allocate_tensors()

# 10x faster than CPU on Raspberry Pi

4. NVIDIA Jetson

pip install jetson-inference

import jetson.inference
import jetson.utils  # videoSource lives in jetson.utils, so import it explicitly

# Load the "ssd-mobilenet-v2" detection network with threshold=0.5.
net = jetson.inference.detectNet("ssd-mobilenet-v2", threshold=0.5)

# Open the video source at /dev/video0.
camera = jetson.utils.videoSource("/dev/video0")

# Capture a frame and run detection on it, repeating until the process is
# interrupted (the loop has no exit condition).
while True:
    img = camera.Capture()
    detections = net.Detect(img)

# Comparison table of edge hardware. Every entry carries the same five keys:
# cpu, ram, price, tflite_fps, use. Values are display strings only.
edge_devices = {
    "Raspberry Pi 5": {
        "cpu": "ARM Cortex-A76 2.4GHz",
        "ram": "4-8GB",
        "price": "$60-80",
        "tflite_fps": "5-15 FPS",
        "use": "Prototype, Camera, Sensor",
    },
    "Coral Edge TPU": {
        "cpu": "Edge TPU Coprocessor",
        "ram": "N/A (USB Accelerator)",
        "price": "$60",
        "tflite_fps": "50-100 FPS",
        "use": "TFLite Acceleration",
    },
    "NVIDIA Jetson Orin Nano": {
        "cpu": "ARM A78AE + 1024 CUDA",
        "ram": "8GB",
        "price": "$500",
        "tflite_fps": "100+ FPS",
        "use": "AI Robot, Drone, CCTV",
    },
    "Intel NUC": {
        "cpu": "Intel i5/i7",
        "ram": "8-32GB",
        "price": "$300-600",
        "tflite_fps": "30-60 FPS",
        "use": "Edge Server, Gateway",
    },
}

# Print each device name followed by its attributes, one per line.
print("\nEdge Devices for TensorFlow:")
for device, info in edge_devices.items():
    print(f"\n [{device}]")
    for k, v in info.items():
        print(f" {k}: {v}")

Kubernetes Edge

# === Kubernetes Edge Deployment ===

# K3s — Lightweight Kubernetes for Edge
# curl -sfL https://get.k3s.io | sh -
# kubectl get nodes

# Edge Deployment
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: edge-inference
# spec:
#   replicas: 2
#   selector:
#     matchLabels:
#       app: edge-inference
#   template:
#     metadata:
#       labels:
#         app: edge-inference
#     spec:
#       containers:
#       - name: tflite-server
#         image: edge-inference:latest
#         ports:
#         - containerPort: 8080
#         resources:
#           limits:
#             cpu: "2"
#             memory: "2Gi"
#           requests:
#             cpu: "1"
#             memory: "1Gi"
#         volumeMounts:
#         - name: models
#           mountPath: /models
#       volumes:
#       - name: models
#         hostPath:
#           path: /opt/models

# KubeEdge — Kubernetes for Edge Computing
# keadm init --advertise-address=10.0.0.1
# keadm join --cloudcore-ipport=10.0.0.1:10000

# Three deployment architectures, each described by the same four fields:
# desc, latency, bandwidth, tools. Values are display strings only.
edge_architectures = {
    "Cloud-Edge": dict(
        desc="Train บน Cloud, Inference บน Edge",
        latency="10-50ms",
        bandwidth="ต่ำ (ส่งแค่ผลลัพธ์)",
        tools="TF Serving (Cloud) + TF Lite (Edge)",
    ),
    "Edge-Only": dict(
        desc="ทำทุกอย่างบน Edge ไม่ต้อง Cloud",
        latency="1-10ms",
        bandwidth="ไม่ใช้",
        tools="TF Lite + Coral TPU / Jetson",
    ),
    "Federated": dict(
        desc="Train บน Edge ส่ง Gradient กลับ Cloud",
        latency="10-50ms",
        bandwidth="ปานกลาง (ส่ง Gradient)",
        tools="TF Federated + TF Lite",
    ),
}

# Emit a header, then one section per architecture: a blank line, the
# bracketed name, and the indented field lines.
print("Edge Computing Architectures:")
for arch, info in edge_architectures.items():
    section = [f"\n  [{arch}]"]
    section.extend(f"    {k}: {v}" for k, v in info.items())
    print("\n".join(section))

# Model Optimization Pipeline: ordered steps, each string already carrying
# its own step number.
optimization = [
    "1. Train Full Model (Cloud GPU)",
    "2. Prune — ตัด Neurons ที่ไม่สำคัญ (ลด 50%)",
    "3. Quantize — INT8 แทน FP32 (ลด 4x)",
    "4. Convert — SavedModel to TFLite",
    "5. Compile — Edge TPU Compiler (ถ้าใช้ Coral)",
    "6. Benchmark — วัด Latency/Accuracy บน Edge",
    "7. Deploy — OTA Update ไป Edge Device",
]

# Plain string literal: the header has no placeholders, so no f-prefix.
print("\n\nModel Optimization Pipeline:")
for step in optimization:
    print(f"  {step}")

เคล็ดลับ

MobileNet: ใช้ MobileNetV2/V3 สำหรับ Edge เร็วและเล็ก
Quantization: INT8 ลดขนาด 4x เร็วขึ้น 2-3x Accuracy ลดน้อยมาก
Coral TPU: เร็วกว่า CPU 10x สำหรับ TFLite Model
K3s: ใช้ K3s แทน K8s บน Edge Device ใช้ RAM น้อยกว่า
OTA Update: อัปเดต Model ผ่าน Network ไม่ต้องไปเปลี่ยนที่อุปกรณ์

TensorFlow Serving คืออะไร

TensorFlow Serving เป็นระบบ Production ML Serving จาก Google ที่รองรับ gRPC และ REST API, Model Versioning, Batching และ GPU Acceleration สำหรับการใช้งานระดับ Production Scale

TensorFlow Serving Edge Computing — การ Deploy