LangChain Agent Performance
LangChain Agent Performance Tuning เพิ่มความเร็ว Caching Async Parallel Streaming Model Selection Prompt Optimization Production
| Optimization | Latency Reduction | Cost Reduction | Complexity |
|---|---|---|---|
| LLM Cache | 90%+ (cache hit) | 90%+ (no API call) | ต่ำ (ง่าย) |
| Async/Parallel Tools | 50-70% | ไม่เปลี่ยน | ปานกลาง |
| Streaming | ลด TTFT 80%+ | ไม่เปลี่ยน | ต่ำ |
| Model Router | 30-50% | 40-60% | ปานกลาง |
| Prompt Shortening | 20-40% | 20-40% | ต่ำ |
| Retrieval Optimization | 30-50% | 10-20% | ปานกลาง |
Caching Strategy
# === LangChain Caching ===
# from langchain.cache import InMemoryCache, RedisCache, SQLiteCache
# from langchain_community.cache import RedisSemanticCache
# from langchain.globals import set_llm_cache
# import redis
#
# # Option 1: In-Memory Cache (Dev)
# set_llm_cache(InMemoryCache())
#
# # Option 2: Redis Cache (Production)
# redis_client = redis.Redis(host="redis", port=6379)
# set_llm_cache(RedisCache(redis_client, ttl=3600))
#
# # Option 3: Semantic Cache (Similar queries)
# from langchain_openai import OpenAIEmbeddings
# set_llm_cache(RedisSemanticCache(
# redis_url="redis://redis:6379",
# embedding=OpenAIEmbeddings(),
# score_threshold=0.95, # similarity threshold
# ttl=3600
# ))
#
# # Tool Result Cache
# from functools import lru_cache
# @lru_cache(maxsize=1000)
# def cached_search(query: str) -> str:
# return search_tool.run(query)
from dataclasses import dataclass


@dataclass
class CacheConfig:
    """One caching option for a LangChain agent deployment."""

    cache_type: str  # cache implementation / mechanism name
    backend: str     # where cached entries are stored
    ttl: str         # entry lifetime
    hit_rate: str    # expected hit-rate profile
    use_case: str    # recommended deployment scenario


# Catalogue of cache strategies, roughly ordered from development to production.
caches = [
    CacheConfig(
        cache_type="InMemoryCache",
        backend="Python dict (RAM)",
        ttl="Until restart",
        hit_rate="สูง (Exact match)",
        use_case="Development Single Instance",
    ),
    CacheConfig(
        cache_type="SQLiteCache",
        backend="SQLite File",
        ttl="Persistent",
        hit_rate="สูง (Exact match)",
        use_case="Single Server Persistent",
    ),
    CacheConfig(
        cache_type="RedisCache",
        backend="Redis Server",
        ttl="Configurable (1hr default)",
        hit_rate="สูง (Exact match)",
        use_case="Production Multi-instance Shared",
    ),
    CacheConfig(
        cache_type="RedisSemanticCache",
        backend="Redis + Embeddings",
        ttl="Configurable",
        hit_rate="สูงมาก (Similar queries hit)",
        use_case="Production Natural Language Queries",
    ),
    CacheConfig(
        cache_type="Tool LRU Cache",
        backend="Python lru_cache",
        ttl="Until eviction (maxsize)",
        hit_rate="ปานกลาง-สูง",
        use_case="Search DB API results",
    ),
]

print("=== Caching Strategy ===")
for cfg in caches:
    # Emit the three report lines for this entry in one write.
    print("\n".join((
        f" [{cfg.cache_type}] Backend: {cfg.backend}",
        f" TTL: {cfg.ttl} | Hit Rate: {cfg.hit_rate}",
        f" Use: {cfg.use_case}",
    )))
Async & Streaming
# === Async Parallel Streaming ===
# Async LLM Call
# response = await llm.ainvoke("query")
#
# Parallel Tool Execution
# import asyncio
# async def run_tools_parallel(tools, queries):
# tasks = [tool.ainvoke(q) for tool, q in zip(tools, queries)]
# return await asyncio.gather(*tasks)
#
# # Before: Sequential (3s + 2s + 1s = 6s)
# result1 = search_tool.invoke("query1") # 3s
# result2 = db_tool.invoke("query2") # 2s
# result3 = calc_tool.invoke("query3") # 1s
#
# # After: Parallel (max(3s, 2s, 1s) = 3s)
# results = await asyncio.gather(
# search_tool.ainvoke("query1"),
# db_tool.ainvoke("query2"),
# calc_tool.ainvoke("query3"),
# )
#
# # Streaming
# async for chunk in llm.astream("query"):
# print(chunk.content, end="", flush=True)
#
# # Batch
# responses = await llm.abatch([
# "query1", "query2", "query3"
# ], config={"max_concurrency": 5})
@dataclass
class AsyncPattern:
    """A sync-to-async migration pattern and its measured impact."""

    pattern: str         # name of the transformation
    before_latency: str  # latency profile before the change
    after_latency: str   # latency profile after the change
    improvement: str     # headline gain
    code_change: str     # the code change that achieves it


# Async migration patterns, each with a before/after latency comparison.
patterns = [
    AsyncPattern(
        pattern="Sequential → Parallel Tools",
        before_latency="Tool1(3s) + Tool2(2s) + Tool3(1s) = 6s",
        after_latency="max(3s, 2s, 1s) = 3s",
        improvement="50% faster",
        code_change="asyncio.gather(*tool_tasks)",
    ),
    AsyncPattern(
        pattern="Sync → Async LLM",
        before_latency="Block thread during LLM call (3s)",
        after_latency="Non-blocking await (3s but concurrent)",
        improvement="Throughput 5-10x",
        code_change="await llm.ainvoke() แทน llm.invoke()",
    ),
    AsyncPattern(
        pattern="Full Response → Streaming",
        before_latency="Wait 3s → show all text",
        after_latency="Show first token 200ms → stream rest",
        improvement="TTFT 90% faster",
        code_change="async for chunk in llm.astream()",
    ),
    AsyncPattern(
        pattern="Single → Batch Request",
        before_latency="10 calls × 1s each = 10s",
        after_latency="1 batch call = 2s",
        improvement="80% faster",
        code_change="await llm.abatch(queries, max_concurrency=5)",
    ),
]

print("=== Async Patterns ===")
for pat in patterns:
    # Emit the five report lines for this pattern in one write.
    print("\n".join((
        f" [{pat.pattern}]",
        f" Before: {pat.before_latency}",
        f" After: {pat.after_latency}",
        f" Improve: {pat.improvement}",
        f" Code: {pat.code_change}",
    )))
Production Monitoring
# === Performance Monitoring ===
@dataclass
class PerfMetric:
    """A production performance metric with its target and alert rule."""

    metric: str  # metric name
    target: str  # healthy target value
    tool: str    # where to observe the metric
    alert: str   # alert threshold plus remediation hint


# Metrics to watch in production, each with an alert threshold and remediation.
metrics = [
    PerfMetric(
        metric="LLM Latency P99",
        target="< 5 seconds",
        tool="LangSmith / Custom Prometheus",
        alert="> 10s → Check model Switch to faster",
    ),
    PerfMetric(
        metric="Time to First Token (TTFT)",
        target="< 500ms",
        tool="LangSmith / Client-side Timer",
        alert="> 2s → Enable Streaming Check Network",
    ),
    PerfMetric(
        metric="Cache Hit Rate",
        target="> 30%",
        tool="Redis INFO stats / Custom Counter",
        alert="< 10% → Review Cache Key Strategy",
    ),
    PerfMetric(
        metric="Token Usage per Request",
        target="< 2000 tokens avg",
        tool="LangSmith / OpenAI Usage API",
        alert="> 4000 → Shorten Prompt Trim Context",
    ),
    PerfMetric(
        metric="Tool Execution Time",
        target="< 2 seconds per tool",
        tool="Custom Timer / LangSmith",
        alert="> 5s → Cache Tool Results Parallel Exec",
    ),
    PerfMetric(
        metric="Error Rate",
        target="< 1%",
        tool="LangSmith / Sentry",
        alert="> 5% → Check API Key Rate Limit Model",
    ),
]

print("=== Performance Metrics ===")
for met in metrics:
    # Emit the three report lines for this metric in one write.
    print("\n".join((
        f" [{met.metric}] Target: {met.target}",
        f" Tool: {met.tool}",
        f" Alert: {met.alert}",
    )))
เคล็ดลับ
- Cache: ใช้ RedisSemanticCache ลด LLM Call 30-70%
- Async: ใช้ asyncio.gather รัน Tool Parallel ลด 50%
- Streaming: เปิด Streaming ลด TTFT 90%
- Model Router: ใช้ Model เล็กสำหรับงานง่าย ประหยัด 40-60%
- LangSmith: ใช้ LangSmith ดู Trace หา Bottleneck
LangChain Agent ช้าเพราะอะไร
LLM API Latency Multi-call Chain of Thought Tool Sequential Context ยาว Token มาก Network Cold Start No Cache
Caching ทำอย่างไร
InMemoryCache RedisCache RedisSemanticCache SQLiteCache Tool LRU Cache Embedding Cache TTL Hit Rate 30-70% Reduce LLM Call
Async & Parallel ทำอย่างไร
ainvoke astream abatch asyncio.gather Parallel Tools Streaming TTFT Batch Concurrent Semaphore Rate Limit Non-blocking
Production Optimization มีอะไร
Model Router GPT-3.5 GPT-4 Prompt Short Context Trim Summary Memory Hybrid Search top_k Re-rank Load Balancer Auto-scaling
สรุป
LangChain Agent Performance Tuning Cache Redis Async Parallel Streaming Model Router Prompt Optimization LangSmith Monitoring Production
