RAG Architecture
| Component | Options | Purpose | Impact |
|---|---|---|---|
| Chunking | Fixed/Semantic/Recursive | Split documents into chunks | Very high |
| Embedding | OpenAI/BGE/E5 | Convert text → vectors | High |
| Vector Store | Pinecone/Weaviate/Chroma | Store and search vectors | High |
| Retrieval | Semantic/Hybrid/Multi-query | Find relevant chunks | Very high |
| Re-ranking | Cohere/Cross-encoder | Re-order retrieved chunks | Medium |
| LLM | GPT-4o/Claude/Llama | Generate the answer | High |
RAG Pipeline
# === RAG Pipeline Implementation ===
# pip install langchain chromadb openai sentence-transformers
# from langchain.document_loaders import PyPDFLoader, TextLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import Chroma
# from langchain.chat_models import ChatOpenAI
# from langchain.chains import RetrievalQA
#
# # Step 1: Load Documents
# loader = PyPDFLoader("knowledge_base.pdf")
# documents = loader.load()
#
# # Step 2: Chunking — Recursive Splitting
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=512,
#     chunk_overlap=50,
#     separators=["\n\n", "\n", ". ", " ", ""],
#     length_function=len,
# )
# chunks = text_splitter.split_documents(documents)
#
# # Step 3: Embedding + Vector Store
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# vectorstore = Chroma.from_documents(
#     documents=chunks,
#     embedding=embeddings,
#     persist_directory="./chroma_db",
#     collection_metadata={"hnsw:space": "cosine"},
# )
#
# # Step 4: Retrieval + Generation
# retriever = vectorstore.as_retriever(
#     search_type="mmr",  # Maximal Marginal Relevance: relevant but diverse chunks
#     search_kwargs={"k": 5, "fetch_k": 20},  # rank 20 candidates, return 5
# )
#
# llm = ChatOpenAI(model="gpt-4o", temperature=0)
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",  # stuff all retrieved chunks into a single prompt
#     retriever=retriever,
#     return_source_documents=True,
# )
#
# result = qa_chain({"query": "Explain the RAG architecture"})
# print(result["result"])
# for doc in result["source_documents"]:
#     print(f"  Source: {doc.metadata['source']} p.{doc.metadata.get('page', 'N/A')}")
from dataclasses import dataclass
@dataclass
class ChunkStrategy:
    strategy: str
    chunk_size: str
    overlap: str
    pros: str
    cons: str
    use_case: str

strategies = [
    ChunkStrategy("Fixed Size", "256-512 tokens", "10-20%", "Simple, fast", "Can cut mid-sentence", "Quick prototype"),
    ChunkStrategy("Sentence Split", "3-5 sentences", "1 sentence", "Never cuts mid-sentence", "Uneven chunk sizes", "Text documents"),
    ChunkStrategy("Recursive", "512 tokens", "50 tokens", "Follows document structure", "Separators need tuning", "Production"),
    ChunkStrategy("Semantic", "Variable", "Adaptive", "Splits on actual meaning", "Slow; needs embeddings", "High quality"),
    ChunkStrategy("Parent-Child", "Parent 2048 / Child 256", "None", "Context + precision", "More complex", "Complex docs"),
]

print("=== Chunking Strategies ===")
for s in strategies:
    print(f"  [{s.strategy}] Size: {s.chunk_size} | Overlap: {s.overlap}")
    print(f"    Pros: {s.pros} | Cons: {s.cons}")
    print(f"    Use Case: {s.use_case}")
Advanced Retrieval
# === Advanced RAG Techniques ===
# Hybrid Search — Semantic + Keyword
# from langchain.retrievers import EnsembleRetriever
# from langchain.retrievers import BM25Retriever
#
# bm25_retriever = BM25Retriever.from_documents(chunks, k=5)
# vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
#
# hybrid_retriever = EnsembleRetriever(
#     retrievers=[bm25_retriever, vector_retriever],
#     weights=[0.3, 0.7],  # 30% keyword, 70% semantic
# )
# Re-ranking with Cross-encoder
# from sentence_transformers import CrossEncoder
#
# reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
#
# def rerank_results(query, documents, top_k=3):
#     # Score each (query, chunk) pair jointly, then keep the top-scoring chunks.
#     pairs = [(query, doc.page_content) for doc in documents]
#     scores = reranker.predict(pairs)
#     ranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
#     return [doc for doc, score in ranked[:top_k]]
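#
# # Hypothetical usage, chaining the two snippets above: fetch candidates with the
# # hybrid retriever, then re-rank them (get_relevant_documents is the classic
# # langchain retriever call).
# candidates = hybrid_retriever.get_relevant_documents("What is RAG?")
# top_docs = rerank_results("What is RAG?", candidates, top_k=3)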
# Multi-Query Retrieval
# from langchain.retrievers.multi_query import MultiQueryRetriever
#
# multi_retriever = MultiQueryRetriever.from_llm(
#     retriever=vectorstore.as_retriever(),
#     llm=ChatOpenAI(temperature=0.3),
# )
# # Generates 3 query variations automatically
# HyDE — Hypothetical Document Embeddings
# from langchain.chains import HypotheticalDocumentEmbedder
#
# hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
#     llm=ChatOpenAI(), base_embeddings=OpenAIEmbeddings(),
#     prompt_key="web_search",
# )
@dataclass
class RetrievalTechnique:
    technique: str
    how_it_works: str
    improvement: str
    complexity: str
    latency_impact: str

techniques = [
    RetrievalTechnique("Hybrid Search", "Semantic + BM25 keyword", "+15-25% relevance", "Low", "+10ms"),
    RetrievalTechnique("Re-ranking", "Cross-encoder scores top results", "+10-20% precision", "Medium", "+50-100ms"),
    RetrievalTechnique("Multi-Query", "Generate query variations", "+10-15% recall", "Medium", "+500ms (LLM call)"),
    RetrievalTechnique("HyDE", "Generate hypothetical doc first", "+5-15% for abstract queries", "High", "+1s (LLM call)"),
    RetrievalTechnique("Parent-Child", "Retrieve child, return parent", "+20% context quality", "Medium", "+20ms"),
    RetrievalTechnique("Metadata Filter", "Pre-filter by category/date", "Greatly reduces noise", "Low", "-10ms (fewer docs)"),
]

print("\n=== Advanced Retrieval Techniques ===")
for t in techniques:
    print(f"  [{t.technique}] Improvement: {t.improvement}")
    print(f"    How: {t.how_it_works}")
    print(f"    Complexity: {t.complexity} | Latency: {t.latency_impact}")
Production RAG
# === Production RAG System ===
# Vector Store Comparison
@dataclass
class VectorDB:
    name: str
    hosting: str
    max_vectors: str
    features: str
    pricing: str
    best_for: str

vector_dbs = [
    VectorDB("Pinecone", "Managed Cloud", "Billions", "Hybrid search, metadata, namespaces", "Pay-per-use", "Production SaaS"),
    VectorDB("Weaviate", "Self-hosted/Cloud", "Billions", "Hybrid, GraphQL, modules", "Open Source / Cloud", "Flexible"),
    VectorDB("Chroma", "Self-hosted", "Millions", "Simple API, embedding functions", "Free Open Source", "Prototyping"),
    VectorDB("Qdrant", "Self-hosted/Cloud", "Billions", "Payload filtering, geo search", "Open Source / Cloud", "Advanced filtering"),
    VectorDB("Milvus", "Self-hosted", "Billions", "GPU index, multi-vector", "Free Open Source", "Large scale"),
    VectorDB("pgvector", "PostgreSQL Extension", "Millions", "SQL integration, HNSW, IVFFlat", "Free", "Existing Postgres"),
]

print("=== Vector Database Comparison ===")
for v in vector_dbs:
    print(f"  [{v.name}] Hosting: {v.hosting}")
    print(f"    Max: {v.max_vectors} | Features: {v.features}")
    print(f"    Pricing: {v.pricing} | Best For: {v.best_for}")
# Evaluation Metrics
eval_metrics = {
    "Answer Relevance": "Does the answer address the question? (LLM-as-judge)",
    "Faithfulness": "Is the answer grounded in the retrieved context? (no hallucination)",
    "Context Relevance": "Are the retrieved chunks relevant to the question?",
    "Context Recall": "Were all needed chunks retrieved, compared against ground truth?",
    "Latency P95": "Response time at the 95th percentile",
    "Cost per Query": "Cost per query (embedding + LLM calls)",
}

print("\n=== RAG Evaluation Metrics ===")
for k, v in eval_metrics.items():
    print(f"  [{k}]: {v}")
Tips
- Chunk Size: start at 512 tokens, then tune per use case
- Hybrid: always use hybrid search; it beats semantic-only retrieval
- Re-rank: adding a re-ranker boosts precision substantially
- Evaluate: measure faithfulness and context relevance on every version
- Metadata: attach metadata to every chunk so results can be filtered, cutting noise
What is RAG?
Retrieval-Augmented Generation pairs an LLM with a knowledge base: it retrieves relevant information before generating the answer. This reduces hallucination, lets knowledge be updated without retraining, and allows answers to cite their sources. Typical uses are chatbots and document Q&A.
How do you choose a chunking strategy?
Options include fixed-size, sentence, semantic, and recursive splitting; a common starting point is 256-512 tokens with 10-20% overlap. Larger chunks carry more context but also more noise, smaller chunks are precise but lose context. Tune per use case.
How do you choose an embedding model?
Candidates include OpenAI text-embedding-3, Cohere, BGE-M3, E5-Mistral, and Nomic. Compare them on the MTEB Leaderboard, weighing language coverage, vector dimension, price, and latency; open-source models also help with privacy.
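As a hedged sketch, an open-source model can be dropped in via sentence-transformers (a small BGE variant is used here purely as an example):
# from sentence_transformers import SentenceTransformer
#
# st_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
# query_vec = st_model.encode(["What is RAG?"], normalize_embeddings=True)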
How do you optimize RAG performance?
Layer the techniques above: hybrid search (semantic + keyword), cross-encoder re-ranking, multi-query retrieval, HyDE, metadata filtering, parent-child chunking, and contextual compression, and evaluate after every change.
Summary
RAG architecture best practices cover chunking, embeddings, the vector store, retrieval (hybrid search, re-ranking, multi-query), the LLM, evaluation (faithfulness), and production optimization.
