SiamCafe.net Blog
Technology

LangChain Agent Performance Tuning เพิ่มความเร็ว

langchain agent performance tuning เพิ่มความเร็ว
LangChain Agent Performance Tuning เพิ่มความเร็ว | SiamCafe Blog
2025-06-11· อ. บอม — SiamCafe.net· 11,838 คำ

LangChain Agent Performance

LangChain Agent Performance Tuning เพิ่มความเร็ว Caching Async Parallel Streaming Model Selection Prompt Optimization Production

Optimization | Latency Reduction | Cost Reduction | Complexity
LLM Cache | 90%+ (cache hit) | 90%+ (no API call) | ต่ำ (ง่าย)
Async/Parallel Tools | 50-70% | ไม่เปลี่ยน | ปานกลาง
Streaming | TTFT 80%+ ลด | ไม่เปลี่ยน | ต่ำ
Model Router | 30-50% | 40-60% | ปานกลาง
Prompt Shortening | 20-40% | 20-40% | ต่ำ
Retrieval Optimization | 30-50% | 10-20% | ปานกลาง

Caching Strategy

# === LangChain Caching ===

# from langchain.cache import InMemoryCache, RedisCache, SQLiteCache
# from langchain_community.cache import RedisSemanticCache
# from langchain.globals import set_llm_cache
# import redis
#
# # Option 1: In-Memory Cache (Dev)
# set_llm_cache(InMemoryCache())
#
# # Option 2: Redis Cache (Production)
# redis_client = redis.Redis(host="redis", port=6379)
# set_llm_cache(RedisCache(redis_client, ttl=3600))
#
# # Option 3: Semantic Cache (Similar queries)
# from langchain_openai import OpenAIEmbeddings
# set_llm_cache(RedisSemanticCache(
#     redis_url="redis://redis:6379",
#     embedding=OpenAIEmbeddings(),
#     score_threshold=0.95,  # similarity threshold
#     ttl=3600
# ))
#
# # Tool Result Cache
# from functools import lru_cache
# @lru_cache(maxsize=1000)
# def cached_search(query: str) -> str:
#     return search_tool.run(query)

from dataclasses import dataclass


@dataclass
class CacheConfig:
    """One caching option for a LangChain agent stack.

    Plain display record: every field is human-readable text used only
    by the comparison printout below.
    """

    cache_type: str
    backend: str
    ttl: str
    hit_rate: str
    use_case: str


# Rows are (cache_type, backend, ttl, hit_rate, use_case) tuples,
# expanded positionally into CacheConfig records.
caches = [
    CacheConfig(*row)
    for row in (
        ("InMemoryCache", "Python dict (RAM)", "Until restart",
         "สูง (Exact match)", "Development Single Instance"),
        ("SQLiteCache", "SQLite File", "Persistent",
         "สูง (Exact match)", "Single Server Persistent"),
        ("RedisCache", "Redis Server", "Configurable (1hr default)",
         "สูง (Exact match)", "Production Multi-instance Shared"),
        ("RedisSemanticCache", "Redis + Embeddings", "Configurable",
         "สูงมาก (Similar queries hit)", "Production Natural Language Queries"),
        ("Tool LRU Cache", "Python lru_cache", "Until eviction (maxsize)",
         "ปานกลาง-สูง", "Search DB API results"),
    )
]

print("=== Caching Strategy ===")
for cfg in caches:
    print(f"  [{cfg.cache_type}] Backend: {cfg.backend}")
    print(f"    TTL: {cfg.ttl} | Hit Rate: {cfg.hit_rate}")
    print(f"    Use: {cfg.use_case}")

Async & Streaming

# === Async Parallel Streaming ===

# Async LLM Call
# response = await llm.ainvoke("query")
#
# Parallel Tool Execution
# import asyncio
# async def run_tools_parallel(tools, queries):
#     tasks = [tool.ainvoke(q) for tool, q in zip(tools, queries)]
#     return await asyncio.gather(*tasks)
#
# # Before: Sequential (3s + 2s + 1s = 6s)
# result1 = search_tool.invoke("query1")  # 3s
# result2 = db_tool.invoke("query2")      # 2s
# result3 = calc_tool.invoke("query3")    # 1s
#
# # After: Parallel (max(3s, 2s, 1s) = 3s)
# results = await asyncio.gather(
#     search_tool.ainvoke("query1"),
#     db_tool.ainvoke("query2"),
#     calc_tool.ainvoke("query3"),
# )
#
# # Streaming
# async for chunk in llm.astream("query"):
#     print(chunk.content, end="", flush=True)
#
# # Batch
# responses = await llm.abatch([
#     "query1", "query2", "query3"
# ], config={"max_concurrency": 5})

@dataclass
class AsyncPattern:
    """Before/after summary for one async optimization pattern.

    All fields are display text consumed by the printout below.
    """

    pattern: str
    before_latency: str
    after_latency: str
    improvement: str
    code_change: str


# Rows are (pattern, before_latency, after_latency, improvement,
# code_change) tuples, expanded positionally into AsyncPattern records.
patterns = [
    AsyncPattern(*row)
    for row in (
        ("Sequential → Parallel Tools",
         "Tool1(3s) + Tool2(2s) + Tool3(1s) = 6s",
         "max(3s, 2s, 1s) = 3s",
         "50% faster",
         "asyncio.gather(*tool_tasks)"),
        ("Sync → Async LLM",
         "Block thread during LLM call (3s)",
         "Non-blocking await (3s but concurrent)",
         "Throughput 5-10x",
         "await llm.ainvoke() แทน llm.invoke()"),
        ("Full Response → Streaming",
         "Wait 3s → show all text",
         "Show first token 200ms → stream rest",
         "TTFT 90% faster",
         "async for chunk in llm.astream()"),
        ("Single → Batch Request",
         "10 calls × 1s each = 10s",
         "1 batch call = 2s",
         "80% faster",
         "await llm.abatch(queries, max_concurrency=5)"),
    )
]

print("=== Async Patterns ===")
for pat in patterns:
    print(f"  [{pat.pattern}]")
    print(f"    Before: {pat.before_latency}")
    print(f"    After: {pat.after_latency}")
    print(f"    Improve: {pat.improvement}")
    print(f"    Code: {pat.code_change}")

Production Monitoring

# === Performance Monitoring ===

@dataclass
class PerfMetric:
    """Production monitoring entry: target, measurement tool, alert rule.

    All fields are display text consumed by the printout below.
    """

    metric: str
    target: str
    tool: str
    alert: str


# Rows are (metric, target, tool, alert) tuples, expanded positionally
# into PerfMetric records.
metrics = [
    PerfMetric(*row)
    for row in (
        ("LLM Latency P99", "< 5 seconds",
         "LangSmith / Custom Prometheus",
         "> 10s → Check model Switch to faster"),
        ("Time to First Token (TTFT)", "< 500ms",
         "LangSmith / Client-side Timer",
         "> 2s → Enable Streaming Check Network"),
        ("Cache Hit Rate", "> 30%",
         "Redis INFO stats / Custom Counter",
         "< 10% → Review Cache Key Strategy"),
        ("Token Usage per Request", "< 2000 tokens avg",
         "LangSmith / OpenAI Usage API",
         "> 4000 → Shorten Prompt Trim Context"),
        ("Tool Execution Time", "< 2 seconds per tool",
         "Custom Timer / LangSmith",
         "> 5s → Cache Tool Results Parallel Exec"),
        ("Error Rate", "< 1%",
         "LangSmith / Sentry",
         "> 5% → Check API Key Rate Limit Model"),
    )
]

print("=== Performance Metrics ===")
for met in metrics:
    print(f"  [{met.metric}] Target: {met.target}")
    print(f"    Tool: {met.tool}")
    print(f"    Alert: {met.alert}")

เคล็ดลับ

LangChain Agent ช้าเพราะอะไร

LLM API Latency Multi-call Chain of Thought Tool Sequential Context ยาว Token มาก Network Cold Start No Cache

Caching ทำอย่างไร

InMemoryCache RedisCache RedisSemanticCache SQLiteCache Tool LRU Cache Embedding Cache TTL Hit Rate 30-70% Reduce LLM Call

Async & Parallel ทำอย่างไร

ainvoke astream abatch asyncio.gather Parallel Tools Streaming TTFT Batch Concurrent Semaphore Rate Limit Non-blocking

Production Optimization มีอะไร

Model Router GPT-3.5 GPT-4 Prompt Short Context Trim Summary Memory Hybrid Search top_k Re-rank Load Balancer Auto-scaling

สรุป

LangChain Agent Performance Tuning Cache Redis Async Parallel Streaming Model Router Prompt Optimization LangSmith Monitoring Production

📖 บทความที่เกี่ยวข้อง

Qwik Resumability Performance Tuning เพิ่มความเร็วอ่านบทความ → Tailwind CSS v4 Performance Tuning เพิ่มความเร็วอ่านบทความ → LlamaIndex RAG Performance Tuning เพิ่มความเร็วอ่านบทความ → CrewAI Multi-Agent Performance Tuning เพิ่มความเร็วอ่านบทความ → LangChain Agent Event Driven Designอ่านบทความ →

📚 ดูบทความทั้งหมด →