LangChain Agent Performance
LangChain Agent Performance Tuning เพิ่มความเร็ว Caching Async Parallel Streaming Model Selection Prompt Optimization Production
| Optimization | Latency Reduction | Cost Reduction | Complexity |
|---|---|---|---|
| LLM Cache | 90%+ (cache hit) | 90%+ (no API call) | ต่ำ (ง่าย) |
| Async/Parallel Tools | 50-70% | ไม่เปลี่ยน | ปานกลาง |
| Streaming | ลด TTFT 80%+ | ไม่เปลี่ยน | ต่ำ |
| Model Router | 30-50% | 40-60% | ปานกลาง |
| Prompt Shortening | 20-40% | 20-40% | ต่ำ |
| Retrieval Optimization | 30-50% | 10-20% | ปานกลาง |
Caching Strategy
# === LangChain Caching ===
# from langchain.cache import InMemoryCache, RedisCache, SQLiteCache
# from langchain_community.cache import RedisSemanticCache
# from langchain.globals import set_llm_cache
# import redis
#
# # Option 1: In-Memory Cache (Dev)
# set_llm_cache(InMemoryCache())
#
# # Option 2: Redis Cache (Production)
# redis_client = redis.Redis(host="redis", port=6379)
# set_llm_cache(RedisCache(redis_client, ttl=3600))
#
# # Option 3: Semantic Cache (Similar queries)
# from langchain_openai import OpenAIEmbeddings
# set_llm_cache(RedisSemanticCache(
# redis_url="redis://redis:6379",
# embedding=OpenAIEmbeddings(),
# score_threshold=0.95, # similarity threshold
# ttl=3600
# ))
#
# # Tool Result Cache
# from functools import lru_cache
# @lru_cache(maxsize=1000)
# def cached_search(query: str) -> str:
# return search_tool.run(query)
from dataclasses import dataclass


@dataclass
class CacheConfig:
    """One caching option for a LangChain agent deployment."""

    cache_type: str  # cache implementation / mechanism name
    backend: str     # where cached entries are stored
    ttl: str         # entry lifetime
    hit_rate: str    # expected hit-rate profile
    use_case: str    # recommended deployment scenario


# Catalogue of cache strategies, roughly ordered from development to production.
caches = [
    CacheConfig(
        cache_type="InMemoryCache",
        backend="Python dict (RAM)",
        ttl="Until restart",
        hit_rate="สูง (Exact match)",
        use_case="Development Single Instance",
    ),
    CacheConfig(
        cache_type="SQLiteCache",
        backend="SQLite File",
        ttl="Persistent",
        hit_rate="สูง (Exact match)",
        use_case="Single Server Persistent",
    ),
    CacheConfig(
        cache_type="RedisCache",
        backend="Redis Server",
        ttl="Configurable (1hr default)",
        hit_rate="สูง (Exact match)",
        use_case="Production Multi-instance Shared",
    ),
    CacheConfig(
        cache_type="RedisSemanticCache",
        backend="Redis + Embeddings",
        ttl="Configurable",
        hit_rate="สูงมาก (Similar queries hit)",
        use_case="Production Natural Language Queries",
    ),
    CacheConfig(
        cache_type="Tool LRU Cache",
        backend="Python lru_cache",
        ttl="Until eviction (maxsize)",
        hit_rate="ปานกลาง-สูง",
        use_case="Search DB API results",
    ),
]

print("=== Caching Strategy ===")
for cfg in caches:
    # Emit the three report lines for this entry in one write.
    print("\n".join((
        f" [{cfg.cache_type}] Backend: {cfg.backend}",
        f" TTL: {cfg.ttl} | Hit Rate: {cfg.hit_rate}",
        f" Use: {cfg.use_case}",
    )))
Async & Streaming
# === Async Parallel Streaming ===
# Async LLM Call
# response = await llm.ainvoke("query")
#
# Parallel Tool Execution
# import asyncio
# async def run_tools_parallel(tools, queries):
# tasks = [tool.ainvoke(q) for tool, q in zip(tools, queries)]
# return await asyncio.gather(*tasks)
#
# # Before: Sequential (3s + 2s + 1s = 6s)
# result1 = search_tool.invoke("query1") # 3s
# result2 = db_tool.invoke("query2") # 2s
# result3 = calc_tool.invoke("query3") # 1s
#
# # After: Parallel (max(3s, 2s, 1s) = 3s)
# results = await asyncio.gather(
# search_tool.ainvoke("query1"),
# db_tool.ainvoke("query2"),
# calc_tool.ainvoke("query3"),
# )
#
# # Streaming
# async for chunk in llm.astream("query"):
# print(chunk.content, end="", flush=True)
#
# # Batch
# responses = await llm.abatch([
# "query1", "query2", "query3"
# ], config={"max_concurrency": 5})
@dataclass
class AsyncPattern:
    """A sync-to-async migration pattern and its measured impact."""

    pattern: str         # name of the transformation
    before_latency: str  # latency profile before the change
    after_latency: str   # latency profile after the change
    improvement: str     # headline gain
    code_change: str     # the code change that achieves it


# Async migration patterns, each with a before/after latency comparison.
patterns = [
    AsyncPattern(
        pattern="Sequential → Parallel Tools",
        before_latency="Tool1(3s) + Tool2(2s) + Tool3(1s) = 6s",
        after_latency="max(3s, 2s, 1s) = 3s",
        improvement="50% faster",
        code_change="asyncio.gather(*tool_tasks)",
    ),
    AsyncPattern(
        pattern="Sync → Async LLM",
        before_latency="Block thread during LLM call (3s)",
        after_latency="Non-blocking await (3s but concurrent)",
        improvement="Throughput 5-10x",
        code_change="await llm.ainvoke() แทน llm.invoke()",
    ),
    AsyncPattern(
        pattern="Full Response → Streaming",
        before_latency="Wait 3s → show all text",
        after_latency="Show first token 200ms → stream rest",
        improvement="TTFT 90% faster",
        code_change="async for chunk in llm.astream()",
    ),
    AsyncPattern(
        pattern="Single → Batch Request",
        before_latency="10 calls × 1s each = 10s",
        after_latency="1 batch call = 2s",
        improvement="80% faster",
        code_change="await llm.abatch(queries, max_concurrency=5)",
    ),
]

print("=== Async Patterns ===")
for pat in patterns:
    # Emit the five report lines for this pattern in one write.
    print("\n".join((
        f" [{pat.pattern}]",
        f" Before: {pat.before_latency}",
        f" After: {pat.after_latency}",
        f" Improve: {pat.improvement}",
        f" Code: {pat.code_change}",
    )))
Production Monitoring
# === Performance Monitoring ===
@dataclass
class PerfMetric:
    """A production performance metric with its target and alert rule."""

    metric: str  # metric name
    target: str  # healthy target value
    tool: str    # where to observe the metric
    alert: str   # alert threshold plus remediation hint


# Metrics to watch in production, each with an alert threshold and remediation.
metrics = [
    PerfMetric(
        metric="LLM Latency P99",
        target="< 5 seconds",
        tool="LangSmith / Custom Prometheus",
        alert="> 10s → Check model Switch to faster",
    ),
    PerfMetric(
        metric="Time to First Token (TTFT)",
        target="< 500ms",
        tool="LangSmith / Client-side Timer",
        alert="> 2s → Enable Streaming Check Network",
    ),
    PerfMetric(
        metric="Cache Hit Rate",
        target="> 30%",
        tool="Redis INFO stats / Custom Counter",
        alert="< 10% → Review Cache Key Strategy",
    ),
    PerfMetric(
        metric="Token Usage per Request",
        target="< 2000 tokens avg",
        tool="LangSmith / OpenAI Usage API",
        alert="> 4000 → Shorten Prompt Trim Context",
    ),
    PerfMetric(
        metric="Tool Execution Time",
        target="< 2 seconds per tool",
        tool="Custom Timer / LangSmith",
        alert="> 5s → Cache Tool Results Parallel Exec",
    ),
    PerfMetric(
        metric="Error Rate",
        target="< 1%",
        tool="LangSmith / Sentry",
        alert="> 5% → Check API Key Rate Limit Model",
    ),
]

print("=== Performance Metrics ===")
for met in metrics:
    # Emit the three report lines for this metric in one write.
    print("\n".join((
        f" [{met.metric}] Target: {met.target}",
        f" Tool: {met.tool}",
        f" Alert: {met.alert}",
    )))
เคล็ดลับ
- Cache: ใช้ RedisSemanticCache ลด LLM Call 30-70%
- Async: ใช้ asyncio.gather รัน Tool Parallel ลด 50%
- Streaming: เปิด Streaming ลด TTFT 90%
- Model Router: ใช้ Model เล็กสำหรับงานง่าย ประหยัด 40-60%
- LangSmith: ใช้ LangSmith ดู Trace หา Bottleneck
LangChain Agent ช้าเพราะอะไร
LLM API Latency Multi-call Chain of Thought Tool Sequential Context ยาว Token มาก Network Cold Start No Cache
Caching ทำอย่างไร
InMemoryCache RedisCache RedisSemanticCache SQLiteCache Tool LRU Cache Embedding Cache TTL Hit Rate 30-70% Reduce LLM Call
Async & Parallel ทำอย่างไร
ainvoke astream abatch asyncio.gather Parallel Tools Streaming TTFT Batch Concurrent Semaphore Rate Limit Non-blocking
Production Optimization มีอะไร
Model Router GPT-3.5 GPT-4 Prompt Short Context Trim Summary Memory Hybrid Search top_k Re-rank Load Balancer Auto-scaling
สรุป
LangChain Agent Performance Tuning Cache Redis Async Parallel Streaming Model Router Prompt Optimization LangSmith Monitoring Production
