RAG Performance Tuning
RAG Retrieval Augmented Generation Performance Tuning Chunking Embedding Vector Search Caching LLM Latency Throughput Production Optimization
| Bottleneck | Latency | Solution | Improvement |
|---|---|---|---|
| Embedding | 50-200ms | Cache, Smaller Model | -80% |
| Vector Search | 10-100ms | Index Tuning, Filter | -50% |
| Context Assembly | 5-20ms | Pre-process, Limit | -60% |
| LLM Generation | 500-5000ms | Smaller Model, Stream | -40% |
| Network | 10-50ms | Co-locate, gRPC | -70% |
Chunking และ Embedding Optimization
# === Chunking & Embedding Optimization ===
# pip install langchain tiktoken sentence-transformers
# from langchain.text_splitter import (
# RecursiveCharacterTextSplitter,
# MarkdownHeaderTextSplitter,
# )
#
# # Recursive Character Splitting (Default)
# splitter = RecursiveCharacterTextSplitter(
# chunk_size=512,
# chunk_overlap=50,
# length_function=len,
# separators=["\n\n", "\n", ". ", " ", ""],
# )
# chunks = splitter.split_text(document)
#
# # Markdown Header Splitting
# headers = [
# ("#", "Header 1"),
# ("##", "Header 2"),
# ("###", "Header 3"),
# ]
# md_splitter = MarkdownHeaderTextSplitter(headers)
# chunks = md_splitter.split_text(markdown_doc)
#
# # Semantic Chunking (Advanced)
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_openai import OpenAIEmbeddings
#
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# semantic_splitter = SemanticChunker(
# embeddings,
# breakpoint_threshold_type="percentile",
# breakpoint_threshold_amount=90,
# )
# chunks = semantic_splitter.split_text(document)
from dataclasses import dataclass
from typing import List
# Describes one document-chunking approach together with its measured
# trade-offs; instances are compared in the report printed further below.
@dataclass
class ChunkingStrategy:
    """Benchmark profile of a single document-chunking strategy for RAG."""

    name: str                  # human-readable strategy label
    chunk_size: int            # target chunk length (presumably tokens — TODO confirm unit)
    overlap: int               # overlap between consecutive chunks (same unit as chunk_size)
    retrieval_accuracy: float  # fraction in [0, 1]; rendered as a percentage when printed
    latency_ms: float          # chunking latency in milliseconds
    use_case: str              # recommended scenario for this strategy
# Benchmark table: each row is (name, chunk_size, overlap, accuracy,
# latency_ms, use_case), expanded into ChunkingStrategy records.
strategies = [
    ChunkingStrategy(*row)
    for row in (
        ("Fixed 256", 256, 50, 0.78, 12, "Short Q&A"),
        ("Fixed 512", 512, 50, 0.85, 15, "General Purpose"),
        ("Fixed 1024", 1024, 100, 0.82, 20, "Long Context"),
        ("Semantic", 400, 0, 0.91, 25, "High Quality"),
        ("Markdown", 500, 0, 0.88, 18, "Documentation"),
        ("Sentence", 300, 0, 0.86, 14, "FAQ, Support"),
    )
]

# Print a three-line summary per strategy.
print("=== Chunking Strategies ===")
for strat in strategies:
    print(f" [{strat.name}] Size: {strat.chunk_size} | Overlap: {strat.overlap}")
    print(f" Accuracy: {strat.retrieval_accuracy:.0%} | Latency: {strat.latency_ms}ms")
    print(f" Use: {strat.use_case}")
Caching และ Query Optimization
# === RAG Caching Strategy ===
# import redis
# import hashlib
# import json
# import numpy as np
#
# class RAGCache:
# def __init__(self, redis_url="redis://localhost:6379"):
# self.redis = redis.from_url(redis_url)
# self.ttl = 3600 # 1 hour
#
# def _hash_query(self, query: str) -> str:
# return hashlib.sha256(query.encode()).hexdigest()
#
# def get_cached_response(self, query: str):
# key = f"rag:response:{self._hash_query(query)}"
# cached = self.redis.get(key)
# return json.loads(cached) if cached else None
#
# def cache_response(self, query: str, response: dict):
# key = f"rag:response:{self._hash_query(query)}"
# self.redis.setex(key, self.ttl, json.dumps(response))
#
# def get_cached_embedding(self, text: str):
# key = f"rag:emb:{self._hash_query(text)}"
# cached = self.redis.get(key)
# return np.frombuffer(cached) if cached else None
#
# def cache_embedding(self, text: str, embedding):
# key = f"rag:emb:{self._hash_query(text)}"
# self.redis.setex(key, self.ttl * 24, embedding.tobytes())
#
# # Semantic Cache (ค้นหา Query ที่คล้ายกัน)
# class SemanticCache:
# def __init__(self, index, threshold=0.95):
# self.index = index # Pinecone/Qdrant index
# self.threshold = threshold
#
# def find_similar(self, query_embedding):
# results = self.index.query(
# vector=query_embedding,
# top_k=1,
# namespace="cache",
# )
# if results.matches and results.matches[0].score > self.threshold:
# return results.matches[0].metadata.get("response")
# return None
# Observed performance numbers for one caching layer; the report loop
# below derives an effective (hit-rate-weighted) latency from these fields.
@dataclass
class CacheMetrics:
    """Measured performance profile of one RAG cache layer."""

    cache_type: str             # cache layer name
    hit_rate: float             # fraction of lookups served from cache, in [0, 1]
    avg_latency_hit_ms: float   # mean latency on a cache hit (ms)
    avg_latency_miss_ms: float  # mean latency on a cache miss (ms)
    memory_mb: float            # approximate memory footprint (MB)
    savings_pct: float          # reported overall savings (percent)
# Measurements per cache layer: (type, hit_rate, hit_ms, miss_ms, memory_mb,
# savings_pct), expanded into CacheMetrics records.
caches = [
    CacheMetrics(*row)
    for row in (
        ("Response Cache", 0.35, 2, 1500, 256, 35),
        ("Embedding Cache", 0.65, 1, 80, 512, 52),
        ("Semantic Cache", 0.45, 5, 1500, 128, 45),
        ("Search Result Cache", 0.50, 3, 50, 128, 25),
    )
]

# Report each cache's latency, weighting hit/miss latencies by hit rate.
print("\n=== Cache Performance ===")
for metric in caches:
    miss_rate = 1 - metric.hit_rate
    effective_latency = (metric.hit_rate * metric.avg_latency_hit_ms
                         + miss_rate * metric.avg_latency_miss_ms)
    print(f" [{metric.cache_type}]")
    print(f" Hit Rate: {metric.hit_rate:.0%} | Hit: {metric.avg_latency_hit_ms}ms | "
          f"Miss: {metric.avg_latency_miss_ms}ms")
    print(f" Effective: {effective_latency:.0f}ms | Savings: {metric.savings_pct}%")
Production Architecture
# === Production RAG Pipeline ===
# Optimized Pipeline
# 1. Query comes in
# 2. Check Semantic Cache -> if hit, return cached response
# 3. Check Embedding Cache -> if hit, use cached embedding
# 4. Generate embedding (if cache miss)
# 5. Parallel: Vector Search + Keyword Search (Hybrid)
# 6. Re-rank results with Cross-encoder
# 7. Assemble context (top 3-5 chunks)
# 8. Stream LLM response to client
# 9. Cache response asynchronously
# FastAPI Production Server
# from fastapi import FastAPI
# from fastapi.responses import StreamingResponse
# import asyncio
#
# app = FastAPI()
#
# @app.post("/api/query")
# async def query(request: QueryRequest):
# # 1. Check cache
# cached = cache.get_cached_response(request.query)
# if cached:
# return cached
#
# # 2. Get embedding (with cache)
# embedding = await get_embedding_cached(request.query)
#
# # 3. Parallel search
# vector_results, keyword_results = await asyncio.gather(
# vector_search(embedding, top_k=10),
# keyword_search(request.query, top_k=10),
# )
#
# # 4. Hybrid merge + Re-rank
# merged = hybrid_merge(vector_results, keyword_results, alpha=0.7)
# reranked = cross_encoder_rerank(request.query, merged, top_k=5)
#
# # 5. Stream LLM response
# context = build_context(reranked)
# return StreamingResponse(
# stream_llm_response(request.query, context),
# media_type="text/event-stream",
# )
# Before/after snapshot of the pipeline's headline numbers (all values are
# pre-formatted display strings, not numeric types).
pipeline_metrics = {
    "Before Optimization": dict(
        p50_latency="2,500ms",
        p99_latency="8,000ms",
        throughput="10 req/s",
        cost_per_1k="$2.50",
        accuracy="82%",
    ),
    "After Optimization": dict(
        p50_latency="800ms",
        p99_latency="2,500ms",
        throughput="50 req/s",
        cost_per_1k="$0.80",
        accuracy="89%",
    ),
}

# Print one section per phase with its key/value pairs.
print("RAG Pipeline Metrics:")
for phase_name, phase_stats in pipeline_metrics.items():
    print(f"\n [{phase_name}]")
    for metric_name, metric_value in phase_stats.items():
        print(f" {metric_name}: {metric_value}")
# Quick-reference checklist summarizing the optimizations covered above
# (entries are user-facing strings and are kept verbatim).
checklist = [
    "Chunking: 512 tokens + 50 overlap ดีสำหรับเริ่มต้น",
    "Embedding: ใช้ text-embedding-3-small ถ้า Quality พอ",
    "Hybrid Search: Vector + Keyword ดีกว่า Vector อย่างเดียว",
    "Re-ranking: Cross-encoder เพิ่ม Accuracy 5-10%",
    "Caching: Semantic + Embedding Cache ลด 50% Latency",
    "Streaming: Stream Response ให้ User เห็น Token แรกเร็ว",
    "Parallel: Search หลาย Source พร้อมกัน asyncio.gather",
    "Context: จำกัด 3-5 Chunks ไม่ส่งทั้งหมดให้ LLM",
]

# Numbered listing, starting at 1.
print("\n\nOptimization Checklist:")
for idx, tip in enumerate(checklist, start=1):
    print(f" {idx}. {tip}")
เคล็ดลับ
- Streaming: Stream Response ให้ User เห็น Token แรกเร็ว ลด Perceived Latency
- Hybrid: ใช้ Hybrid Search (Vector + BM25) ผลลัพธ์ดีกว่า Vector อย่างเดียว
- Re-rank: ใช้ Cross-encoder Re-rank ผลลัพธ์ เพิ่ม Accuracy 5-10%
- Cache: Semantic Cache ลด Latency 50-90% สำหรับ Query ซ้ำ
- Monitor: วัด Latency ทุกขั้น หา Bottleneck แก้ทีละจุด
RAG Architecture คืออะไร
Retrieval Augmented Generation คือการเสริม LLM ด้วย Knowledge Base ภายนอก โดยดึง Context ที่เกี่ยวข้องจาก Vector Database มาประกอบคำตอบ ทำให้คำตอบถูกต้อง Up-to-date และลด Hallucination
ทำไม RAG ถึงช้า
จุดที่ช้าหลักคือ Embedding Generation, Vector Search, Context Assembly, LLM Generation และ Network Latency แก้ได้ด้วย Caching, Parallel Processing, Smaller Models และ Chunking ที่เหมาะสม
Chunking Strategy ที่ดีเป็นอย่างไร
เริ่มที่ 512 tokens พร้อม Overlap 50 ใช้ Semantic Chunking เมื่อต้องการแบ่งตามความหมาย และ Markdown Header Splitting สำหรับ Documentation — Chunk ที่เล็กเกินไปจะขาด Context ส่วนที่ใหญ่เกินไปจะมี Noise
Cache ช่วย RAG เร็วขึ้นอย่างไร
Semantic Cache Query คล้าย Embedding Cache ลด API Result Cache ลด Search ลด Latency 50-90% ลด Cost Redis In-memory
สรุป
RAG Performance Tuning Chunking Embedding Vector Search Hybrid BM25 Re-ranking Cross-encoder Caching Semantic Cache Streaming Parallel Processing Production FastAPI Latency Throughput Cost Optimization
