Distributed Tracing SaaS
Distributed Tracing OpenTelemetry Jaeger Tempo Microservices Span Trace Context Propagation SaaS Architecture
| Tool | Type | Storage | Cost | Best For |
|---|---|---|---|---|
| Jaeger | Open Source | ES/Cassandra/Kafka | Free (self-host) | Self-hosted Production |
| Grafana Tempo | Open Source | S3/GCS Object Storage | Free (self-host) | Grafana Ecosystem |
| Zipkin | Open Source | ES/Cassandra/MySQL | Free (self-host) | Simple Setup |
| Datadog APM | SaaS | Managed | $$$ | Enterprise Full-stack |
| AWS X-Ray | SaaS | Managed | $$ | AWS Native |
| Honeycomb | SaaS | Managed | $$ | Observability-focused |
OpenTelemetry Setup
# === OpenTelemetry Python Setup ===
# pip install opentelemetry-api opentelemetry-sdk
# pip install opentelemetry-exporter-otlp
# pip install opentelemetry-instrumentation-flask
# pip install opentelemetry-instrumentation-requests
# from opentelemetry import trace
# from opentelemetry.sdk.trace import TracerProvider
# from opentelemetry.sdk.trace.export import BatchSpanProcessor
# from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# from opentelemetry.sdk.resources import Resource
# from opentelemetry.instrumentation.flask import FlaskInstrumentor
# from opentelemetry.instrumentation.requests import RequestsInstrumentor
#
# # Configure resource
# resource = Resource.create({
# "service.name": "order-service",
# "service.version": "1.2.0",
# "deployment.environment": "production",
# })
#
# # Configure tracer
# provider = TracerProvider(resource=resource)
# exporter = OTLPSpanExporter(endpoint="http://otel-collector:4317")
# provider.add_span_processor(BatchSpanProcessor(exporter))
# trace.set_tracer_provider(provider)
#
# # Auto-instrument
# app = Flask(__name__)
# FlaskInstrumentor().instrument_app(app)
# RequestsInstrumentor().instrument()
#
# # Custom span
# tracer = trace.get_tracer("order-service")
# with tracer.start_as_current_span("process_order") as span:
# span.set_attribute("order.id", order_id)
# span.set_attribute("order.total", total)
# result = process(order)
from dataclasses import dataclass


@dataclass
class OTelComponent:
    """One OpenTelemetry building block: its package, role, and key config."""

    component: str  # human-readable component name
    package: str    # pip package (or where it ships from)
    purpose: str    # what the component is responsible for
    config: str     # representative configuration snippet


# Raw rows kept as plain tuples; hydrated into dataclass instances below.
_COMPONENT_ROWS = [
    ("SDK",
     "opentelemetry-sdk",
     "Core SDK สร้าง Tracer Span",
     "TracerProvider + Resource + SpanProcessor"),
    ("OTLP Exporter",
     "opentelemetry-exporter-otlp",
     "ส่ง Trace ไป Collector ผ่าน gRPC/HTTP",
     "endpoint: otel-collector:4317"),
    ("Auto-instrumentation",
     "opentelemetry-instrumentation-*",
     "Auto-create Span สำหรับ Framework",
     "Flask, Django, FastAPI, requests, SQLAlchemy"),
    ("Propagator",
     "opentelemetry-propagator-*",
     "ส่ง Context ข้าม Service ผ่าน Header",
     "W3C TraceContext (default), B3, Jaeger"),
    ("Sampler",
     "Built-in SDK",
     "กำหนด % ที่จะเก็บ Trace",
     "TraceIdRatioBased(0.1) = เก็บ 10%"),
]
components = [OTelComponent(*row) for row in _COMPONENT_ROWS]

print("=== OTel Components ===")
for entry in components:
    print(f" [{entry.component}] {entry.package}")
    print(f" Purpose: {entry.purpose}")
    print(f" Config: {entry.config}")
Collector & Backend
# === OTel Collector Config ===
# otel-collector-config.yaml
# receivers:
# otlp:
# protocols:
# grpc:
# endpoint: 0.0.0.0:4317
# http:
# endpoint: 0.0.0.0:4318
#
# processors:
# batch:
# timeout: 5s
# send_batch_size: 1024
# tail_sampling:
# decision_wait: 10s
# policies:
# - name: error-policy
# type: status_code
# status_code: { status_codes: [ERROR] }
# - name: latency-policy
# type: latency
# latency: { threshold_ms: 1000 }
# - name: probabilistic
# type: probabilistic
# probabilistic: { sampling_percentage: 10 }
#
# exporters:
# otlp/tempo:
# endpoint: tempo:4317
# tls:
# insecure: true
# otlp/jaeger:
# endpoint: jaeger:4317
#
# service:
# pipelines:
# traces:
# receivers: [otlp]
# processors: [tail_sampling, batch]
# exporters: [otlp/tempo]
@dataclass
class CollectorPipeline:
    """One stage of the OTel Collector pipeline with its config and a note."""

    stage: str      # pipeline stage label (Receive / Process / Export / Store)
    component: str  # collector component handling this stage
    config: str     # representative configuration for the stage
    note: str       # operational note on why the stage matters


# Stage table as plain tuples; hydrated into dataclass instances below.
_PIPELINE_ROWS = [
    ("Receive",
     "OTLP Receiver (gRPC + HTTP)",
     "port 4317 (gRPC), 4318 (HTTP)",
     "รับ Trace จากทุก Service"),
    ("Process - Sample",
     "Tail-based Sampling",
     "เก็บ 100% Error, 100% > 1s latency, 10% ที่เหลือ",
     "ลด Storage 80-90% เก็บ Trace ที่สำคัญ"),
    ("Process - Batch",
     "Batch Processor",
     "timeout: 5s, batch_size: 1024",
     "รวม Span ก่อนส่ง ลด Network call"),
    ("Export",
     "OTLP Exporter → Tempo/Jaeger",
     "endpoint: tempo:4317",
     "ส่ง Trace ไป Backend เก็บและค้นหา"),
    ("Store",
     "Tempo (S3) / Jaeger (ES)",
     "S3: ถูก Scale ง่าย / ES: ค้นหาดี",
     "Retention 7-30 วัน ลบอัตโนมัติ"),
]
pipeline = [CollectorPipeline(*row) for row in _PIPELINE_ROWS]

print("=== Collector Pipeline ===")
for stage_info in pipeline:
    print(f" [{stage_info.stage}] {stage_info.component}")
    print(f" Config: {stage_info.config}")
    print(f" Note: {stage_info.note}")
Monitoring & Alerting
# === Trace-based Monitoring ===
@dataclass
class TraceMetric:
    """A trace-derived monitoring metric with its threshold and alert rule."""

    metric: str     # metric name
    source: str     # where the metric is derived from in trace data
    threshold: str  # healthy-range target
    alert: str      # alerting rule (warning / critical conditions)


# Metric catalogue as plain tuples; hydrated into dataclass instances below.
_METRIC_ROWS = [
    ("Request Latency p99",
     "Span duration histogram",
     "< 500ms (API), < 2s (Page Load)",
     "> 1s API → Warning, > 3s → Critical"),
    ("Error Rate",
     "Span status = ERROR / Total spans",
     "< 1%",
     "> 1% → Warning, > 5% → Critical"),
    ("Throughput (RPM)",
     "Root span count per minute",
     "Monitor trend ไม่ให้ drop > 20%",
     "Drop > 30% → Critical (possible outage)"),
    ("Service Dependency",
     "Service Map from trace data",
     "ทุก Service ต้อง Healthy",
     "Dependency down → Critical"),
    ("Trace Completeness",
     "Spans per trace vs expected",
     "ทุก Service ต้องมี Span ใน Trace",
     "Missing span → Warning (instrumentation gap)"),
]
metrics = [TraceMetric(*row) for row in _METRIC_ROWS]

print("=== Trace Metrics ===")
for item in metrics:
    print(f" [{item.metric}] Source: {item.source}")
    print(f" Threshold: {item.threshold}")
    print(f" Alert: {item.alert}")
เคล็ดลับ
- OTel: ใช้ OpenTelemetry เป็น Standard ไม่ Lock-in Vendor
- Sampling: ใช้ Tail-based Sampling เก็บ 100% Error + Slow Trace
- Attributes: เพิ่ม Custom Attribute user_id order_id ค้นหาง่าย
- Collector: ใช้ Collector เป็น Gateway ไม่ส่งตรงจาก Service ไป Backend
- Tempo: ใช้ Grafana Tempo + S3 ถูกที่สุดสำหรับ Trace Storage
Distributed Tracing คืออะไร
ติดตาม Request ข้าม Microservices Span Trace Trace ID Context Propagation Bottleneck OpenTelemetry Jaeger Tempo Datadog APM
OpenTelemetry ตั้งค่าอย่างไร
SDK TracerProvider Resource OTLP Exporter Auto-instrumentation Flask Django Sampler 1-10% Collector Gateway W3C TraceContext Propagation
SaaS Architecture เป็นอย่างไร
Agent Collector Backend Frontend Alerting Storage S3 Kafka Buffer Multi-tenant Jaeger Tempo Elasticsearch Service Map Dependency Graph
Best Practices มีอะไร
OpenTelemetry Standard Tail-based Sampling Custom Attribute Alert Latency p99 Error Rate Service Map Retention 7-30 วัน Trace-based Testing
สรุป
Distributed Tracing OpenTelemetry Jaeger Tempo SaaS Collector Sampling Span Trace Context Propagation Service Map Alert Production
