RAG Performance Tuning
RAG Retrieval Augmented Generation Performance Tuning Chunking Embedding Vector Search Caching LLM Latency Throughput Production Optimization
| Bottleneck | Latency | Solution | Improvement |
|---|---|---|---|
| Embedding | 50-200ms | Cache, Smaller Model | -80% |
| Vector Search | 10-100ms | Index Tuning, Filter | -50% |
| Context Assembly | 5-20ms | Pre-process, Limit | -60% |
| LLM Generation | 500-5000ms | Smaller Model, Stream | -40% |
| Network | 10-50ms | Co-locate, gRPC | -70% |
Chunking และ Embedding Optimization
# === Chunking & Embedding Optimization ===
# pip install langchain tiktoken sentence-transformers
# from langchain.text_splitter import (
# RecursiveCharacterTextSplitter,
# MarkdownHeaderTextSplitter,
# )
#
# # Recursive Character Splitting (Default)
# splitter = RecursiveCharacterTextSplitter(
# chunk_size=512,
# chunk_overlap=50,
# length_function=len,
# separators=["\n\n", "\n", ". ", " ", ""],
# )
# chunks = splitter.split_text(document)
#
# # Markdown Header Splitting
# headers = [
# ("#", "Header 1"),
# ("##", "Header 2"),
# ("###", "Header 3"),
# ]
# md_splitter = MarkdownHeaderTextSplitter(headers)
# chunks = md_splitter.split_text(markdown_doc)
#
# # Semantic Chunking (Advanced)
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_openai import OpenAIEmbeddings
#
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# semantic_splitter = SemanticChunker(
# embeddings,
# breakpoint_threshold_type="percentile",
# breakpoint_threshold_amount=90,
# )
# chunks = semantic_splitter.split_text(document)
from dataclasses import dataclass
from typing import List
# Describes one document-chunking approach together with its measured
# trade-offs; instances are compared in the report printed further below.
@dataclass
class ChunkingStrategy:
    """Benchmark profile of a single document-chunking strategy for RAG."""

    name: str                  # human-readable strategy label
    chunk_size: int            # target chunk length (presumably tokens — TODO confirm unit)
    overlap: int               # overlap between consecutive chunks (same unit as chunk_size)
    retrieval_accuracy: float  # fraction in [0, 1]; rendered as a percentage when printed
    latency_ms: float          # chunking latency in milliseconds
    use_case: str              # recommended scenario for this strategy
# Benchmark table: each row is (name, chunk_size, overlap, accuracy,
# latency_ms, use_case), expanded into ChunkingStrategy records.
strategies = [
    ChunkingStrategy(*row)
    for row in (
        ("Fixed 256", 256, 50, 0.78, 12, "Short Q&A"),
        ("Fixed 512", 512, 50, 0.85, 15, "General Purpose"),
        ("Fixed 1024", 1024, 100, 0.82, 20, "Long Context"),
        ("Semantic", 400, 0, 0.91, 25, "High Quality"),
        ("Markdown", 500, 0, 0.88, 18, "Documentation"),
        ("Sentence", 300, 0, 0.86, 14, "FAQ, Support"),
    )
]

# Print a three-line summary per strategy.
print("=== Chunking Strategies ===")
for strat in strategies:
    print(f" [{strat.name}] Size: {strat.chunk_size} | Overlap: {strat.overlap}")
    print(f" Accuracy: {strat.retrieval_accuracy:.0%} | Latency: {strat.latency_ms}ms")
    print(f" Use: {strat.use_case}")
Caching และ Query Optimization
# === RAG Caching Strategy ===
# import redis
# import hashlib
# import json
# import numpy as np
#
# class RAGCache:
# def __init__(self, redis_url="redis://localhost:6379"):
# self.redis = redis.from_url(redis_url)
# self.ttl = 3600 # 1 hour
#
# def _hash_query(self, query: str) -> str:
# return hashlib.sha256(query.encode()).hexdigest()
#
# def get_cached_response(self, query: str):
# key = f"rag:response:{self._hash_query(query)}"
# cached = self.redis.get(key)
# return json.loads(cached) if cached else None
#
# def cache_response(self, query: str, response: dict):
# key = f"rag:response:{self._hash_query(query)}"
# self.redis.setex(key, self.ttl, json.dumps(response))
#
# def get_cached_embedding(self, text: str):
# key = f"rag:emb:{self._hash_query(text)}"
# cached = self.redis.get(key)
# return np.frombuffer(cached) if cached else None
#
# def cache_embedding(self, text: str, embedding):
# key = f"rag:emb:{self._hash_query(text)}"
# self.redis.setex(key, self.ttl * 24, embedding.tobytes())
#
# # Semantic Cache (ค้นหา Query ที่คล้ายกัน)
# class SemanticCache:
# def __init__(self, index, threshold=0.95):
# self.index = index # Pinecone/Qdrant index
# self.threshold = threshold
#
# def find_similar(self, query_embedding):
# results = self.index.query(
# vector=query_embedding,
# top_k=1,
# namespace="cache",
# )
# if results.matches and results.matches[0].score > self.threshold:
# return results.matches[0].metadata.get("response")
# return None
# Observed performance numbers for one caching layer; the report loop
# below derives an effective (hit-rate-weighted) latency from these fields.
@dataclass
class CacheMetrics:
    """Measured performance profile of one RAG cache layer."""

    cache_type: str             # cache layer name
    hit_rate: float             # fraction of lookups served from cache, in [0, 1]
    avg_latency_hit_ms: float   # mean latency on a cache hit (ms)
    avg_latency_miss_ms: float  # mean latency on a cache miss (ms)
    memory_mb: float            # approximate memory footprint (MB)
    savings_pct: float          # reported overall savings (percent)
# Measurements per cache layer: (type, hit_rate, hit_ms, miss_ms, memory_mb,
# savings_pct), expanded into CacheMetrics records.
caches = [
    CacheMetrics(*row)
    for row in (
        ("Response Cache", 0.35, 2, 1500, 256, 35),
        ("Embedding Cache", 0.65, 1, 80, 512, 52),
        ("Semantic Cache", 0.45, 5, 1500, 128, 45),
        ("Search Result Cache", 0.50, 3, 50, 128, 25),
    )
]

# Report each cache's latency, weighting hit/miss latencies by hit rate.
print("\n=== Cache Performance ===")
for metric in caches:
    miss_rate = 1 - metric.hit_rate
    effective_latency = (metric.hit_rate * metric.avg_latency_hit_ms
                         + miss_rate * metric.avg_latency_miss_ms)
    print(f" [{metric.cache_type}]")
    print(f" Hit Rate: {metric.hit_rate:.0%} | Hit: {metric.avg_latency_hit_ms}ms | "
          f"Miss: {metric.avg_latency_miss_ms}ms")
    print(f" Effective: {effective_latency:.0f}ms | Savings: {metric.savings_pct}%")
Production Architecture
# === Production RAG Pipeline ===
# Optimized Pipeline
# 1. Query comes in
# 2. Check Semantic Cache -> if hit, return cached response
# 3. Check Embedding Cache -> if hit, use cached embedding
# 4. Generate embedding (if cache miss)
# 5. Parallel: Vector Search + Keyword Search (Hybrid)
# 6. Re-rank results with Cross-encoder
# 7. Assemble context (top 3-5 chunks)
# 8. Stream LLM response to client
# 9. Cache response asynchronously
# FastAPI Production Server
# from fastapi import FastAPI
# from fastapi.responses import StreamingResponse
# import asyncio
#
# app = FastAPI()
#
# @app.post("/api/query")
# async def query(request: QueryRequest):
# # 1. Check cache
# cached = cache.get_cached_response(request.query)
# if cached:
# return cached
#
# # 2. Get embedding (with cache)
# embedding = await get_embedding_cached(request.query)
#
# # 3. Parallel search
# vector_results, keyword_results = await asyncio.gather(
# vector_search(embedding, top_k=10),
# keyword_search(request.query, top_k=10),
# )
#
# # 4. Hybrid merge + Re-rank
# merged = hybrid_merge(vector_results, keyword_results, alpha=0.7)
# reranked = cross_encoder_rerank(request.query, merged, top_k=5)
#
# # 5. Stream LLM response
# context = build_context(reranked)
# return StreamingResponse(
# stream_llm_response(request.query, context),
# media_type="text/event-stream",
# )
# Before/after snapshot of the pipeline's headline numbers (all values are
# pre-formatted display strings, not numeric types).
pipeline_metrics = {
    "Before Optimization": dict(
        p50_latency="2,500ms",
        p99_latency="8,000ms",
        throughput="10 req/s",
        cost_per_1k="$2.50",
        accuracy="82%",
    ),
    "After Optimization": dict(
        p50_latency="800ms",
        p99_latency="2,500ms",
        throughput="50 req/s",
        cost_per_1k="$0.80",
        accuracy="89%",
    ),
}

# Print one section per phase with its key/value pairs.
print("RAG Pipeline Metrics:")
for phase_name, phase_stats in pipeline_metrics.items():
    print(f"\n [{phase_name}]")
    for metric_name, metric_value in phase_stats.items():
        print(f" {metric_name}: {metric_value}")
# Quick-reference checklist summarizing the optimizations covered above
# (entries are user-facing strings and are kept verbatim).
checklist = [
    "Chunking: 512 tokens + 50 overlap ดีสำหรับเริ่มต้น",
    "Embedding: ใช้ text-embedding-3-small ถ้า Quality พอ",
    "Hybrid Search: Vector + Keyword ดีกว่า Vector อย่างเดียว",
    "Re-ranking: Cross-encoder เพิ่ม Accuracy 5-10%",
    "Caching: Semantic + Embedding Cache ลด 50% Latency",
    "Streaming: Stream Response ให้ User เห็น Token แรกเร็ว",
    "Parallel: Search หลาย Source พร้อมกัน asyncio.gather",
    "Context: จำกัด 3-5 Chunks ไม่ส่งทั้งหมดให้ LLM",
]

# Numbered listing, starting at 1.
print("\n\nOptimization Checklist:")
for idx, tip in enumerate(checklist, start=1):
    print(f" {idx}. {tip}")
เคล็ดลับ
- Streaming: Stream Response ให้ User เห็น Token แรกเร็ว ลด Perceived Latency
- Hybrid: ใช้ Hybrid Search (Vector + BM25) ผลลัพธ์ดีกว่า Vector อย่างเดียว
- Re-rank: ใช้ Cross-encoder Re-rank ผลลัพธ์ เพิ่ม Accuracy 5-10%
- Cache: Semantic Cache ลด Latency 50-90% สำหรับ Query ซ้ำ
- Monitor: วัด Latency ทุกขั้น หา Bottleneck แก้ทีละจุด
RAG Architecture คืออะไร
Retrieval Augmented Generation คือการเสริม LLM ด้วย Knowledge Base ภายนอก โดยดึง Context ที่เกี่ยวข้องจาก Vector Database มาประกอบคำตอบ ทำให้คำตอบถูกต้อง Up-to-date และลด Hallucination
ทำไม RAG ถึงช้า
จุดที่ช้าหลักคือ Embedding Generation, Vector Search, Context Assembly, LLM Generation และ Network Latency แก้ได้ด้วย Caching, Parallel Processing, Smaller Models และ Chunking ที่เหมาะสม
Chunking Strategy ที่ดีเป็นอย่างไร
เริ่มที่ 512 tokens พร้อม Overlap 50 ใช้ Semantic Chunking เมื่อต้องการแบ่งตามความหมาย และ Markdown Header Splitting สำหรับ Documentation — Chunk ที่เล็กเกินไปจะขาด Context ส่วนที่ใหญ่เกินไปจะมี Noise
Cache ช่วย RAG เร็วขึ้นอย่างไร
Semantic Cache Query คล้าย Embedding Cache ลด API Result Cache ลด Search ลด Latency 50-90% ลด Cost Redis In-memory
สรุป
RAG Performance Tuning Chunking Embedding Vector Search Hybrid BM25 Re-ranking Cross-encoder Caching Semantic Cache Streaming Parallel Processing Production FastAPI Latency Throughput Cost Optimization
