RAG Architecture Performance Tuning
RAG Performance Tuning

RAG Retrieval Augmented Generation Performance Tuning Chunking Embedding Vector Search Caching LLM Latency Throughput Production Optimization
| Bottleneck | Latency | Solution | Improvement |
|---|---|---|---|
| Embedding | 50-200ms | Cache, Smaller Model | -80% |
| Vector Search | 10-100ms | Index Tuning, Filter | -50% |
| Context Assembly | 5-20ms | Pre-process, Limit | -60% |
| LLM Generation | 500-5000ms | Smaller Model, Stream | -40% |
| Network | 10-50ms | Co-locate, gRPC | -70% |
Chunking และ Embedding Optimization
=== Chunking & Embedding Optimization ===
# pip install langchain langchain-experimental langchain-openai tiktoken sentence-transformers
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
MarkdownHeaderTextSplitter,
)
# --- Strategy 1: recursive character splitting (default) ---
# Lengths are computed with the built-in len, per length_function below.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=512,
    chunk_overlap=50,
)
chunks = splitter.split_text(document)

# --- Strategy 2: Markdown header splitting ---
# Builds [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")].
headers = [("#" * level, f"Header {level}") for level in (1, 2, 3)]
md_splitter = MarkdownHeaderTextSplitter(headers)
chunks = md_splitter.split_text(markdown_doc)  # overwrites the previous `chunks`

# --- Strategy 3: semantic chunking (advanced) ---
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
semantic_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_amount=90,
    breakpoint_threshold_type="percentile",
)
chunks = semantic_splitter.split_text(document)  # overwrites `chunks` again
from dataclasses import dataclass
from typing import List
# เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง LangChain Agent Capacity Planning
@dataclass
class ChunkingStrategy:
    """One row of the chunking-strategy comparison.

    Instances are built positionally, so the field order below is part of
    the interface and must not change.
    """

    name: str                  # display label shown in brackets
    chunk_size: int            # reported as "Size"
    overlap: int               # reported as "Overlap"
    retrieval_accuracy: float  # fraction (e.g. 0.85); reported via the `.0%` format
    latency_ms: float          # reported with an "ms" suffix
    use_case: str              # reported as "Use"
# Comparison data, one entry per chunking approach. Arguments are positional:
# (name, chunk_size, overlap, retrieval_accuracy, latency_ms, use_case).
strategies = [
    ChunkingStrategy("Fixed 256", 256, 50, 0.78, 12, "Short Q&A"),
    ChunkingStrategy("Fixed 512", 512, 50, 0.85, 15, "General Purpose"),
    ChunkingStrategy("Fixed 1024", 1024, 100, 0.82, 20, "Long Context"),
    ChunkingStrategy("Semantic", 400, 0, 0.91, 25, "High Quality"),
    ChunkingStrategy("Markdown", 500, 0, 0.88, 18, "Documentation"),
    ChunkingStrategy("Sentence", 300, 0, 0.86, 14, "FAQ, Support"),
]

# Print a three-line summary for every strategy.
print("=== Chunking Strategies ===")
for s in strategies:
    print(f" [{s.name}] Size: {s.chunk_size} | Overlap: {s.overlap}")
    print(f" Accuracy: {s.retrieval_accuracy:.0%} | Latency: {s.latency_ms}ms")
    print(f" Use: {s.use_case}")
Caching และ Query Optimization
=== RAG Caching Strategy ===
import redis
import hashlib
import json
import numpy as np
class RAGCache:
    """Redis-backed cache for RAG responses and embedding vectors.

    Responses are stored as JSON under ``rag:response:<sha256(query)>`` with a
    TTL of ``self.ttl`` seconds. Embeddings are stored as raw bytes under
    ``rag:emb:<sha256(text)>`` with a TTL of ``self.ttl * 24`` seconds.

    Args:
        redis_url: Connection URL passed to ``redis.from_url``.
        embedding_dtype: NumPy dtype used for BOTH writing and reading
            embedding bytes. Using one fixed dtype on both sides is what makes
            the round trip correct: raw bytes carry no dtype information, so
            decoding float32 bytes with NumPy's default float64 would yield
            garbage or raise. The default ``"float64"`` keeps the dtype that
            ``get_cached_embedding`` has always returned; pass ``"float32"``
            to halve the stored size.
    """

    def __init__(self, redis_url="redis://localhost:6379", embedding_dtype="float64"):
        self.redis = redis.from_url(redis_url)
        self.ttl = 3600  # 1 hour
        self.embedding_dtype = embedding_dtype

    def _hash_query(self, query: str) -> str:
        """Return the SHA-256 hex digest of *query*, used as the key suffix."""
        return hashlib.sha256(query.encode()).hexdigest()

    def get_cached_response(self, query: str):
        """Return the cached response for *query* (JSON-decoded), or None."""
        key = f"rag:response:{self._hash_query(query)}"
        cached = self.redis.get(key)
        return json.loads(cached) if cached else None

    def cache_response(self, query: str, response: dict):
        """Store *response* as JSON for *query* with a TTL of ``self.ttl``."""
        key = f"rag:response:{self._hash_query(query)}"
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_cached_embedding(self, text: str):
        """Return the cached embedding for *text* as a flat array, or None.

        The array is decoded with ``self.embedding_dtype``, matching what
        ``cache_embedding`` wrote.
        """
        key = f"rag:emb:{self._hash_query(text)}"
        cached = self.redis.get(key)
        if not cached:
            return None
        return np.frombuffer(cached, dtype=self.embedding_dtype)

    def cache_embedding(self, text: str, embedding):
        """Store *embedding* for *text* with a TTL of ``self.ttl * 24``.

        *embedding* may be a NumPy array of any numeric dtype or a plain
        sequence of numbers; it is converted to ``self.embedding_dtype``
        before serialisation so that reads decode it correctly.
        """
        key = f"rag:emb:{self._hash_query(text)}"
        payload = np.asarray(embedding, dtype=self.embedding_dtype).tobytes()
        self.redis.setex(key, self.ttl * 24, payload)
# Semantic cache: look up a previously answered query that is similar enough.
class SemanticCache:
    """Return a cached response when a stored query is close to the new one.

    Args:
        index: Vector index exposing ``query(vector=..., top_k=...,
            namespace=..., include_metadata=...)`` and returning an object
            with a ``matches`` sequence whose items have ``score`` and
            ``metadata`` (this is the Pinecone call shape — NOTE(review):
            other backends need an adapter with the same interface).
        threshold: A match is used only when its score is strictly greater
            than this value.
    """

    def __init__(self, index, threshold=0.95):
        self.index = index  # Pinecone/Qdrant index
        self.threshold = threshold

    def find_similar(self, query_embedding):
        """Return the cached response for the nearest stored query, or None.

        None is returned when there are no matches, when the best score does
        not exceed ``self.threshold``, or when the match has no "response"
        entry in its metadata.
        """
        results = self.index.query(
            vector=query_embedding,
            top_k=1,
            namespace="cache",
            # Without this the match carries no metadata, so the cached
            # response could never be read back.
            include_metadata=True,
        )
        matches = results.matches
        if not matches:
            return None
        best = matches[0]
        if best.score > self.threshold:
            # Guard against a match that still has no metadata attached.
            return (best.metadata or {}).get("response")
        return None
@dataclass
class CacheMetrics:
    """Figures describing one cache layer.

    Instances are built positionally, so the field order below is part of
    the interface and must not change.
    """

    cache_type: str             # display label
    hit_rate: float             # fraction (e.g. 0.35); reported via the `.0%` format
    avg_latency_hit_ms: float   # latency when the cache answers
    avg_latency_miss_ms: float  # latency when the cache misses
    memory_mb: float            # NOTE(review): presumably memory footprint in MB — confirm
    savings_pct: float          # reported with a "%" suffix
# One entry per cache layer. Arguments are positional:
# (cache_type, hit_rate, avg_latency_hit_ms, avg_latency_miss_ms, memory_mb, savings_pct).
caches = [
    CacheMetrics("Response Cache", 0.35, 2, 1500, 256, 35),
    CacheMetrics("Embedding Cache", 0.65, 1, 80, 512, 52),
    CacheMetrics("Semantic Cache", 0.45, 5, 1500, 128, 45),
    CacheMetrics("Search Result Cache", 0.50, 3, 50, 128, 25),
]

print("\n=== Cache Performance ===")
for c in caches:
    # Hit-rate-weighted average of the hit and miss latencies.
    effective_latency = (
        c.hit_rate * c.avg_latency_hit_ms
        + (1 - c.hit_rate) * c.avg_latency_miss_ms
    )
    print(f" [{c.cache_type}]")
    print(f" Hit Rate: {c.hit_rate:.0%} | Hit: {c.avg_latency_hit_ms}ms | Miss: {c.avg_latency_miss_ms}ms")
    print(f" Effective: {effective_latency:.0f}ms | Savings: {c.savings_pct}%")
Production Architecture
=== Production RAG Pipeline ===
Optimized Pipeline
เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: Python Poetry High Availability HA Setup
1. Query comes in
2. Check Semantic Cache -> if hit, return cached response
3. Check Embedding Cache -> if hit, use cached embedding
4. Generate embedding (if cache miss)
5. Parallel: Vector Search + Keyword Search (Hybrid)
6. Re-rank results with Cross-encoder
7. Assemble context (top 3-5 chunks)
8. Stream LLM response to client
9. Cache response asynchronously
FastAPI Production Server
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import asyncio
app = FastAPI()
@app.post("/api/query")
async def query(request: QueryRequest):
    """Handle POST /api/query.

    Returns the cached response when one exists; otherwise embeds the query,
    runs vector and keyword search concurrently, merges and re-ranks the
    hits, and streams the LLM answer as ``text/event-stream``. The helpers
    used here (``cache``, ``get_embedding_cached``, ``vector_search``,
    ``keyword_search``, ``hybrid_merge``, ``cross_encoder_rerank``,
    ``build_context``, ``stream_llm_response``) are defined elsewhere.
    """
    user_query = request.query

    # Step 1: short-circuit on a response-cache hit.
    cache_hit = cache.get_cached_response(user_query)
    if cache_hit:
        return cache_hit

    # Step 2: embedding for the query (itself cached).
    query_vector = await get_embedding_cached(user_query)

    # Step 3: run both searches concurrently.
    searches = (
        vector_search(query_vector, top_k=10),
        keyword_search(user_query, top_k=10),
    )
    vector_hits, keyword_hits = await asyncio.gather(*searches)

    # Step 4: hybrid merge, then re-rank down to the top 5.
    candidates = hybrid_merge(vector_hits, keyword_hits, alpha=0.7)
    top_hits = cross_encoder_rerank(user_query, candidates, top_k=5)

    # Step 5: build the context and stream the LLM answer.
    llm_context = build_context(top_hits)
    token_stream = stream_llm_response(user_query, llm_context)
    return StreamingResponse(token_stream, media_type="text/event-stream")
# เนื้อหาเกี่ยวข้อง — Dom Html คืออะไร — ข้อมูลครบถ้วน 2026
# Headline figures for each phase; all values are pre-formatted display strings.
pipeline_metrics = {
    "Before Optimization": dict(
        p50_latency="2,500ms",
        p99_latency="8,000ms",
        throughput="10 req/s",
        cost_per_1k="$2.50",
        accuracy="82%",
    ),
    "After Optimization": dict(
        p50_latency="800ms",
        p99_latency="2,500ms",
        throughput="50 req/s",
        cost_per_1k="$0.80",
        accuracy="89%",
    ),
}

# Print each phase as a bracketed heading followed by its metrics.
print("RAG Pipeline Metrics:")
for phase, metrics in pipeline_metrics.items():
    print(f"\n [{phase}]")
    for k, v in metrics.items():
        print(f" {k}: {v}")
Optimization Checklist
# Optimization checklist; each entry is printed verbatim as a numbered item.
checklist = [
    "Chunking: 512 tokens + 50 overlap ดีสำหรับเริ่มต้น",
    "Embedding: ใช้ text-embedding-3-small ถ้า Quality พอ",
    "Hybrid Search: Vector + Keyword ดีกว่า Vector อย่างเดียว",
    "Re-ranking: Cross-encoder เพิ่ม Accuracy 5-10%",
    "Caching: Semantic + Embedding Cache ลด 50% Latency",
    "Streaming: Stream Response ให้ User เห็น Token แรกเร็ว",
    "Parallel: Search หลาย Source พร้อมกัน asyncio.gather",
    "Context: จำกัด 3-5 Chunks ไม่ส่งทั้งหมดให้ LLM",
]

# Two blank lines, the heading, then the items numbered from 1.
print("\n\nOptimization Checklist:")
for i, item in enumerate(checklist, start=1):
    print(f" {i}. {item}")
เคล็ดลับ
- Streaming: Stream Response ให้ User เห็น Token แรกเร็ว ลด Perceived Latency
- Hybrid: ใช้ Hybrid Search (Vector + BM25) ผลลัพธ์ดีกว่า Vector อย่างเดียว
- Re-rank: ใช้ Cross-encoder Re-rank ผลลัพธ์ เพิ่ม Accuracy 5-10%
- Cache: Semantic Cache ลด Latency 50-90% สำหรับ Query ซ้ำ
- Monitor: วัด Latency ทุกขั้น หา Bottleneck แก้ทีละจุด
RAG Architecture คืออะไร
Retrieval Augmented Generation เสริม LLM Knowledge Base Vector Database Context คำตอบถูกต้อง Up-to-date ลด Hallucination





