ai

RAG Architecture Performance Tuning

RAG Architecture Performance Tuning

RAG Performance Tuning

RAG Architecture Performance Tuning

RAG Retrieval Augmented Generation Performance Tuning Chunking Embedding Vector Search Caching LLM Latency Throughput Production Optimization

Bottleneck | Latency | Solution | Improvement
Embedding | 50-200ms | Cache, Smaller Model | -80%
Vector Search | 10-100ms | Index Tuning, Filter | -50%
Context Assembly | 5-20ms | Pre-process, Limit | -60%
LLM Generation | 500-5000ms | Smaller Model, Stream | -40%
Network | 10-50ms | Co-locate, gRPC | -70%

Chunking และ Embedding Optimization

=== Chunking & Embedding Optimization ===

pip install langchain tiktoken sentence-transformers

from langchain.text_splitter import (

RecursiveCharacterTextSplitter,

MarkdownHeaderTextSplitter,

)

# Recursive character splitting (the default strategy in this walkthrough).
# Lengths are computed with the built-in len(), as passed via length_function.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_overlap=50,
    chunk_size=512,
)
chunks = splitter.split_text(document)

# Markdown header splitting: each pair is (header marker, label) for the
# first three heading levels.
headers = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
md_splitter = MarkdownHeaderTextSplitter(headers)
chunks = md_splitter.split_text(markdown_doc)

# Semantic Chunking (Advanced)

from langchain_experimental.text_splitter import SemanticChunker

from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the semantic chunker.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Breakpoints are chosen with the "percentile" threshold type at amount 90.
semantic_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_amount=90,
    breakpoint_threshold_type="percentile",
)
chunks = semantic_splitter.split_text(document)

from dataclasses import dataclass

from typing import List

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง LangChain Agent Capacity Planning

@dataclass
class ChunkingStrategy:
    """One row of the chunking-strategy comparison table.

    Holds a strategy's configuration together with the accuracy and latency
    figures quoted for it; instances are only used for reporting.
    """

    name: str  # display label, e.g. "Fixed 512"
    chunk_size: int
    overlap: int
    retrieval_accuracy: float  # fraction in [0, 1]; rendered with the ".0%" format
    latency_ms: float  # milliseconds
    use_case: str  # short description of where the strategy fits

# Comparison table of chunking strategies:
# (name, chunk_size, overlap, retrieval_accuracy, latency_ms, use_case).
strategies = [
    ChunkingStrategy("Fixed 256", 256, 50, 0.78, 12, "Short Q&A"),
    ChunkingStrategy("Fixed 512", 512, 50, 0.85, 15, "General Purpose"),
    ChunkingStrategy("Fixed 1024", 1024, 100, 0.82, 20, "Long Context"),
    ChunkingStrategy("Semantic", 400, 0, 0.91, 25, "High Quality"),
    ChunkingStrategy("Markdown", 500, 0, 0.88, 18, "Documentation"),
    ChunkingStrategy("Sentence", 300, 0, 0.86, 14, "FAQ, Support"),
]

# Print a three-line summary per strategy.
print("=== Chunking Strategies ===")
for s in strategies:
    print(f" [{s.name}] Size: {s.chunk_size} | Overlap: {s.overlap}")
    print(f" Accuracy: {s.retrieval_accuracy:.0%} | Latency: {s.latency_ms}ms")
    print(f" Use: {s.use_case}")

Caching และ Query Optimization

=== RAG Caching Strategy ===

import redis

import hashlib

import json

import numpy as np

class RAGCache:
    """Redis-backed cache for RAG responses and embedding vectors.

    Responses are stored as JSON under ``rag:response:<sha256(query)>`` for
    ``ttl`` seconds. Embeddings are stored as raw array bytes under
    ``rag:emb:<sha256(text)>`` for ``ttl * 24`` seconds.
    """

    def __init__(self, redis_url="redis://localhost:6379",
                 embedding_dtype=np.float64):
        """Connect to Redis and set the cache parameters.

        Args:
            redis_url: URL passed to ``redis.from_url``.
            embedding_dtype: NumPy dtype used both to serialize and to
                deserialize embeddings. The default (float64) matches what
                ``np.frombuffer`` assumes when no dtype is given, so entries
                written as float64 by earlier code stay readable. Pass
                ``np.float32`` to halve the stored size.
        """
        self.redis = redis.from_url(redis_url)
        self.ttl = 3600  # 1 hour
        self.embedding_dtype = np.dtype(embedding_dtype)

    def _hash_query(self, query: str) -> str:
        """Return the hex SHA-256 digest of ``query`` (used as the key suffix)."""
        return hashlib.sha256(query.encode()).hexdigest()

    def get_cached_response(self, query: str):
        """Return the cached response for ``query``, or None on a miss."""
        key = f"rag:response:{self._hash_query(query)}"
        cached = self.redis.get(key)
        return json.loads(cached) if cached else None

    def cache_response(self, query: str, response: dict):
        """Store ``response`` as JSON under the key for ``query`` for ``ttl`` seconds."""
        key = f"rag:response:{self._hash_query(query)}"
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_cached_embedding(self, text: str):
        """Return the cached embedding for ``text`` as a 1-D array, or None on a miss.

        The bytes are decoded with ``embedding_dtype``, the same dtype
        ``cache_embedding`` writes with, so the round trip preserves values.
        The returned array is a read-only view over the cached bytes.
        """
        key = f"rag:emb:{self._hash_query(text)}"
        cached = self.redis.get(key)
        if not cached:
            return None
        return np.frombuffer(cached, dtype=self.embedding_dtype)

    def cache_embedding(self, text: str, embedding):
        """Store ``embedding`` for ``text`` for ``ttl * 24`` seconds.

        ``embedding`` may be a NumPy array or any sequence of numbers; it is
        converted to ``embedding_dtype`` before serialization so that the
        stored bytes always match what ``get_cached_embedding`` decodes.
        """
        key = f"rag:emb:{self._hash_query(text)}"
        payload = np.asarray(embedding, dtype=self.embedding_dtype).tobytes()
        self.redis.setex(key, self.ttl * 24, payload)

# Semantic Cache (ค้นหา Query ที่คล้ายกัน)

class SemanticCache:
    """Semantic cache: reuse the stored answer of a sufficiently similar query."""

    def __init__(self, index, threshold=0.95):
        """Keep the vector index and the minimum similarity score for a hit.

        Args:
            index: Vector index object exposing a Pinecone-style
                ``query(vector=..., top_k=..., namespace=...)`` method.
            threshold: A match counts as a hit only if its score is strictly
                greater than this value.
        """
        self.index = index
        self.threshold = threshold

    def find_similar(self, query_embedding):
        """Return the cached response of the nearest stored query, or None.

        Queries the ``"cache"`` namespace for the single nearest vector and
        returns its ``"response"`` metadata entry when the score exceeds
        ``threshold``. Returns None when there is no match, the score is too
        low, or the match carries no metadata.
        """
        results = self.index.query(
            vector=query_embedding,
            top_k=1,
            namespace="cache",
            # The response is read from the match metadata below, so the
            # query must ask for metadata to be returned with the match.
            include_metadata=True,
        )
        if results.matches and results.matches[0].score > self.threshold:
            # Tolerate a match whose metadata is missing instead of raising.
            metadata = results.matches[0].metadata or {}
            return metadata.get("response")
        return None

@dataclass
class CacheMetrics:
    """One row of the cache-performance comparison table (reporting only)."""

    cache_type: str  # display label, e.g. "Response Cache"
    hit_rate: float  # fraction in [0, 1]; rendered with the ".0%" format
    avg_latency_hit_ms: float  # milliseconds when the cache hits
    avg_latency_miss_ms: float  # milliseconds when the cache misses
    memory_mb: float
    savings_pct: float  # already a percentage; printed followed by "%"

# Comparison table of cache layers:
# (cache_type, hit_rate, avg_latency_hit_ms, avg_latency_miss_ms, memory_mb, savings_pct).
caches = [
    CacheMetrics("Response Cache", 0.35, 2, 1500, 256, 35),
    CacheMetrics("Embedding Cache", 0.65, 1, 80, 512, 52),
    CacheMetrics("Semantic Cache", 0.45, 5, 1500, 128, 45),
    CacheMetrics("Search Result Cache", 0.50, 3, 50, 128, 25),
]

print("\n=== Cache Performance ===")
for c in caches:
    # Expected latency per request: hit and miss latencies weighted by hit rate.
    effective_latency = (c.hit_rate * c.avg_latency_hit_ms +
                         (1 - c.hit_rate) * c.avg_latency_miss_ms)
    print(f" [{c.cache_type}]")
    print(f" Hit Rate: {c.hit_rate:.0%} | Hit: {c.avg_latency_hit_ms}ms | "
          f"Miss: {c.avg_latency_miss_ms}ms")
    print(f" Effective: {effective_latency:.0f}ms | Savings: {c.savings_pct}%")

Production Architecture

=== Production RAG Pipeline ===

Optimized Pipeline

เนื้อหาเกี่ยวข้อง — บทความที่เกี่ยวข้อง: Python Poetry High Availability HA Setup

1. Query comes in

2. Check Semantic Cache -> if hit, return cached response

3. Check Embedding Cache -> if hit, use cached embedding

4. Generate embedding (if cache miss)

5. Parallel: Vector Search + Keyword Search (Hybrid)

6. Re-rank results with Cross-encoder

7. Assemble context (top 3-5 chunks)

8. Stream LLM response to client

9. Cache response asynchronously

FastAPI Production Server

from fastapi import FastAPI

from fastapi.responses import StreamingResponse

import asyncio

app = FastAPI()

@app.post("/api/query")
async def query(request: QueryRequest):
    """Answer a RAG query, serving from cache when possible.

    Flow: response-cache lookup, query embedding, vector and keyword search
    run concurrently, hybrid merge, cross-encoder re-rank to the top 5, then
    the LLM answer is streamed back.

    Returns:
        The cached response when the lookup yields a truthy value; otherwise
        a ``StreamingResponse`` with media type ``text/event-stream``.
    """
    # 1. Check cache. The lookup is a plain synchronous call, so it runs in a
    #    worker thread to keep the event loop free for other requests.
    #    (asyncio.to_thread requires Python 3.9+.)
    cached = await asyncio.to_thread(cache.get_cached_response, request.query)
    if cached:
        return cached

    # 2. Get embedding (with cache)
    embedding = await get_embedding_cached(request.query)

    # 3. Parallel search: both searches are awaited together.
    vector_results, keyword_results = await asyncio.gather(
        vector_search(embedding, top_k=10),
        keyword_search(request.query, top_k=10),
    )

    # 4. Hybrid merge + Re-rank. The re-ranker is also a synchronous call, so
    #    it is off-loaded to a worker thread for the same reason as step 1.
    merged = hybrid_merge(vector_results, keyword_results, alpha=0.7)
    reranked = await asyncio.to_thread(
        cross_encoder_rerank, request.query, merged, top_k=5
    )

    # 5. Stream LLM response
    # NOTE(review): nothing in this handler writes the generated answer back
    # to the cache, so step 1 only hits if some other code path populates it.
    context = build_context(reranked)
    return StreamingResponse(
        stream_llm_response(request.query, context),
        media_type="text/event-stream",
    )

เนื้อหาเกี่ยวข้อง — Dom Html คืออะไร — ข้อมูลครบถ้วน 2026

# Headline pipeline figures quoted for the state before and after tuning.
# Values are pre-formatted display strings, not numbers.
pipeline_metrics = {
    "Before Optimization": {
        "p50_latency": "2,500ms",
        "p99_latency": "8,000ms",
        "throughput": "10 req/s",
        "cost_per_1k": "$2.50",
        "accuracy": "82%",
    },
    "After Optimization": {
        "p50_latency": "800ms",
        "p99_latency": "2,500ms",
        "throughput": "50 req/s",
        "cost_per_1k": "$0.80",
        "accuracy": "89%",
    },
}

# Print each phase followed by its metric name/value pairs.
print("RAG Pipeline Metrics:")
for phase, metrics in pipeline_metrics.items():
    print(f"\n [{phase}]")
    for k, v in metrics.items():
        print(f" {k}: {v}")

Optimization Checklist

# Summary checklist of the optimizations covered above (display strings).
checklist = [
    "Chunking: 512 tokens + 50 overlap ดีสำหรับเริ่มต้น",
    "Embedding: ใช้ text-embedding-3-small ถ้า Quality พอ",
    "Hybrid Search: Vector + Keyword ดีกว่า Vector อย่างเดียว",
    "Re-ranking: Cross-encoder เพิ่ม Accuracy 5-10%",
    "Caching: Semantic + Embedding Cache ลด 50% Latency",
    "Streaming: Stream Response ให้ User เห็น Token แรกเร็ว",
    "Parallel: Search หลาย Source พร้อมกัน asyncio.gather",
    "Context: จำกัด 3-5 Chunks ไม่ส่งทั้งหมดให้ LLM",
]

# Plain string: the heading has no placeholders, so no f-prefix is needed.
print("\n\nOptimization Checklist:")
for i, item in enumerate(checklist, 1):
    print(f" {i}. {item}")

เคล็ดลับ

  • Streaming: Stream Response ให้ User เห็น Token แรกเร็ว ลด Perceived Latency
  • Hybrid: ใช้ Hybrid Search (Vector + BM25) ผลลัพธ์ดีกว่า Vector อย่างเดียว
  • Re-rank: ใช้ Cross-encoder Re-rank ผลลัพธ์ เพิ่ม Accuracy 5-10%
  • Cache: Semantic Cache ลด Latency 50-90% สำหรับ Query ซ้ำ
  • Monitor: วัด Latency ทุกขั้น หา Bottleneck แก้ทีละจุด

RAG Architecture คืออะไร

Retrieval Augmented Generation (RAG) คือสถาปัตยกรรมที่เสริม LLM ด้วย Knowledge Base โดยค้นหา Context ที่เกี่ยวข้องจาก Vector Database แล้วส่งให้ LLM ใช้ประกอบการตอบ ทำให้คำตอบถูกต้อง Up-to-date และลด Hallucination

XM Legend · เทรดเดอร์ & ผู้สอน Forex 13 ปี

ผู้ก่อตั้ง SiamCafe ตั้งแต่ปี 1997 · เทรดเดอร์สาย Forex มากกว่า 13 ปี ได้รับการยกย่องเป็น XM Legend · แบ่งปันความรู้ Forex, ไอที, AI และการเทรด จากประสบการณ์จริงในตลาดจริง