SigNoz Hexagonal Observability
SigNoz Observability Hexagonal Architecture OpenTelemetry Traces Metrics Logs ClickHouse APM Dashboard Alerts Production
| Hex Layer | Traces | Metrics | Logs |
|---|---|---|---|
| Domain | Domain Service Calls | Business KPI (Orders Revenue) | Business Events |
| Application | Use Case Orchestration | Latency Throughput Error | Use Case Events |
| Infrastructure | DB API Queue Calls | CPU Memory Disk Network | Technical Events |
OpenTelemetry Setup
# === OpenTelemetry + SigNoz Setup ===
# Install SigNoz (Docker Compose)
# git clone https://github.com/SigNoz/signoz.git
# cd signoz/deploy
# docker compose -f docker/clickhouse-setup/docker-compose.yaml up -d
#
# Access Dashboard: http://localhost:3301
#
# Install OpenTelemetry SDK (Python)
# pip install opentelemetry-api opentelemetry-sdk
# pip install opentelemetry-exporter-otlp
# pip install opentelemetry-instrumentation-flask
# pip install opentelemetry-instrumentation-requests
# pip install opentelemetry-instrumentation-sqlalchemy
from dataclasses import dataclass
@dataclass
class OTelComponent:
    """One OpenTelemetry building block and the SigNoz feature that shows it."""

    component: str  # display name of the OpenTelemetry component
    hex_layer: str  # hexagonal layer(s) the component applies to
    what_it_captures: str  # description of the telemetry it collects
    signoz_feature: str  # SigNoz feature where that telemetry surfaces
# Catalogue of OpenTelemetry components, the hexagonal layer each one applies
# to, what it captures, and the SigNoz feature that displays it.
components = [
    OTelComponent(
        "Tracer (Spans)",
        "All Layers",
        "Request Flow ข้าม Service + Duration + Status",
        "Distributed Tracing → Trace Explorer",
    ),
    OTelComponent(
        "Meter (Metrics)",
        "All Layers",
        "Counter Histogram Gauge (Rate Error Duration)",
        "Metrics → Dashboard + Alerts",
    ),
    OTelComponent(
        "Logger (Logs)",
        "All Layers",
        "Structured Log + trace_id Correlation",
        "Logs → Log Explorer (Correlated with Traces)",
    ),
    OTelComponent(
        "Propagator (Context)",
        "Infrastructure (HTTP/gRPC)",
        "Trace Context ข้าม Service (W3C TraceContext)",
        "Service Map → Dependency Graph",
    ),
    OTelComponent(
        "Auto-instrumentation",
        "Infrastructure",
        "Flask Express Spring DB HTTP Client อัตโนมัติ",
        "Traces + Metrics โดยไม่ต้องเขียน Code",
    ),
]

# Print a three-line summary for every component in the catalogue.
print("=== OpenTelemetry Components ===")
for c in components:
    print(f" [{c.component}] Layer: {c.hex_layer}")
    print(f" Captures: {c.what_it_captures}")
    print(f" SigNoz: {c.signoz_feature}")
Hexagonal Instrumentation
# === Instrumentation per Hexagonal Layer ===
# Domain Layer (Business Logic)
# @tracer.start_as_current_span("domain.process_order")
# def process_order(order: Order) -> OrderResult:
#     span = trace.get_current_span()
#     span.set_attribute("order.id", order.id)
#     span.set_attribute("order.amount", order.total)
#     orders_counter.add(1, {"status": "processing"})
#     # Business logic...
#     return result
#
# Application Layer (Use Cases)
# @tracer.start_as_current_span("usecase.create_order")
# async def create_order(request: CreateOrderRequest) -> CreateOrderResponse:
#     span = trace.get_current_span()
#     span.set_attribute("user.id", request.user_id)
#     started = time.perf_counter()
#     try:
#         order = Order.create(request)
#         result = process_order(order)
#         await notify_customer(result)
#         return response
#     finally:
#         # Histogram.record() takes a measured value; it is not a context manager.
#         usecase_duration.record(
#             time.perf_counter() - started, {"usecase": "create_order"}
#         )
#
# Infrastructure Layer (Adapters)
# Auto-instrumented by OpenTelemetry:
# - HTTP (Flask/FastAPI): Request Duration Status Code
# - Database (SQLAlchemy): Query Duration Statement
# - HTTP Client (requests): External Call Duration
@dataclass
class InstrumentationGuide:
    """Naming guidance for spans, attributes, metrics and logs in one layer."""

    layer: str  # hexagonal layer (or infrastructure adapter) described
    span_name: str  # span naming pattern used in that layer
    attributes: str  # space-separated span attribute names
    metrics: str  # space-separated metric names
    logs: str  # space-separated log event names
# Per-layer instrumentation conventions: span naming pattern plus the
# attributes, metrics and log events suggested for that layer.
guides = [
    InstrumentationGuide(
        "Domain",
        "domain.{service}.{method}",
        "order.id order.amount user.id product.id",
        "orders_total revenue_total conversion_rate",
        "OrderCreated PaymentProcessed InventoryUpdated",
    ),
    InstrumentationGuide(
        "Application",
        "usecase.{name}",
        "user.id request.type session.id",
        "usecase_duration usecase_errors auth_attempts",
        "UseCaseStarted UseCaseCompleted AuthFailed",
    ),
    InstrumentationGuide(
        "Infrastructure (DB)",
        "db.{operation} (Auto-instrumented)",
        "db.system db.statement db.operation",
        "db_query_duration db_connections_active",
        "SlowQuery ConnectionPoolExhausted",
    ),
    InstrumentationGuide(
        "Infrastructure (HTTP)",
        "http.{method} (Auto-instrumented)",
        "http.method http.url http.status_code",
        "http_request_duration http_requests_total",
        "HTTPError TimeoutError RateLimited",
    ),
    InstrumentationGuide(
        "Infrastructure (Queue)",
        "queue.{operation}",
        "messaging.system messaging.destination",
        "queue_messages_produced queue_lag",
        "MessagePublished MessageConsumed ConsumerError",
    ),
]

# Print a four-line summary per layer; the first line starts with a blank
# line so the entries are visually separated.
print("=== Instrumentation Guide ===")
for g in guides:
    print(f"\n [{g.layer}] Span: {g.span_name}")
    print(f" Attributes: {g.attributes}")
    print(f" Metrics: {g.metrics}")
    print(f" Logs: {g.logs}")
Production Dashboard
# === SigNoz Production Setup ===
@dataclass
class DashboardPanel:
    """One dashboard panel: its query, its target and its alert rule."""

    panel: str  # panel title
    query: str  # query expression backing the panel
    target: str  # healthy target / expected range for the value
    alert: str  # alert thresholds and the priority they map to
# Production dashboard definition: each panel pairs a query with its target
# value and the alert thresholds attached to it.
panels = [
    DashboardPanel(
        "Request Rate (RPM)",
        "rate(http_requests_total[5m]) * 60",
        "Baseline ±20%",
        "Drop > 50% → P1 Service Down",
    ),
    # NOTE(review): this query yields a 0-1 ratio while the title and target
    # are expressed in percent — confirm whether a `* 100` is intended.
    DashboardPanel(
        "Error Rate (%)",
        "rate(http_requests_total{status=~'5..'}[5m]) / rate(http_requests_total[5m])",
        "< 1%",
        "> 1% → P2 | > 5% → P1",
    ),
    DashboardPanel(
        "P99 Latency (ms)",
        "histogram_quantile(0.99, rate(http_request_duration_bucket[5m]))",
        "< 500ms",
        "> 1000ms → P2 | > 3000ms → P1",
    ),
    DashboardPanel(
        "Business: Orders/min",
        "rate(orders_total[5m]) * 60",
        "Baseline ±20%",
        "Drop > 30% → P2 Check Conversion",
    ),
    DashboardPanel(
        "DB Query Duration P95",
        "histogram_quantile(0.95, rate(db_query_duration_bucket[5m]))",
        "< 100ms",
        "> 500ms → P2 Slow Query",
    ),
    # NOTE(review): the alert text says "Increase Retention" for high disk
    # usage; reducing retention looks like the intended action — confirm.
    DashboardPanel(
        "ClickHouse Storage",
        "signoz_storage_bytes",
        "< 80% Disk",
        "> 90% → P2 Increase Retention/Sampling",
    ),
]

# Print a four-line summary for every panel.
print("=== Dashboard Panels ===")
for p in panels:
    print(f" [{p.panel}]")
    print(f" Query: {p.query}")
    print(f" Target: {p.target}")
    print(f" Alert: {p.alert}")
เคล็ดลับ
- OpenTelemetry: ใช้ OpenTelemetry Standard ไม่ Lock-in Vendor
- Sampling: ใช้ Tail-based Sampling เก็บเฉพาะ Error/Slow Traces
- Correlation: ใส่ trace_id ใน Log ทุกบรรทัด Correlate Trace+Log
- Business Metrics: สร้าง Business KPI Dashboard (Orders Revenue)
- ClickHouse: ClickHouse เร็วมาก Query Trace/Metric มหาศาลได้
SigNoz คืออะไร
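SigNoz เป็นแพลตฟอร์ม APM/Observability แบบ Open Source ที่รวม Traces, Metrics และ Logs ไว้ในที่เดียว รองรับ OpenTelemetry โดยตรง ใช้ ClickHouse เป็นฐานข้อมูล มี Dashboard และ Alerts ในตัว ใช้ทดแทน Datadog หรือ New Relic ได้ในต้นทุนที่ถูกกว่า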
Hexagonal Observability คืออะไร
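Hexagonal Observability คือการแยก Trace, Metric และ Log ตาม Layer ของ Hexagonal Architecture (Port/Adapter): Domain วัด Business KPI, Application วัด Performance ของ Use Case และ Infrastructure วัดข้อมูลเชิงเทคนิคของ DB, HTTP และ Queue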
Traces Metrics Logs ตั้งค่าอย่างไร
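ติดตั้ง OpenTelemetry SDK แล้วใช้ Tracer สร้าง Span, ใช้ Meter สร้าง Counter และ Histogram, ใช้ Logger ส่ง Log จากนั้นส่งข้อมูลผ่าน OTLP Exporter ไปยัง SigNoz และเปิด Auto-instrumentation สำหรับ Flask และ SQLAlchemy โดยส่งต่อ Trace Context ตามมาตรฐาน W3C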
Production Best Practices มีอะไร
ตั้ง Sampling 10-20% แบบ Tail-based, กำหนด Retention 7-90 วัน, ตั้ง Alerts ตาม RED Metrics, สร้าง Dashboard สำหรับ Business KPI, ดูแล Security ด้วยการ Scrub ข้อมูล PII และ Scale ด้วย ClickHouse Cluster
สรุป
SigNoz ช่วยทำ Observability ให้ระบบแบบ Hexagonal Architecture โดยใช้ OpenTelemetry เก็บ Traces, Metrics และ Logs ลง ClickHouse แยกตาม Layer Domain, Application และ Infrastructure พร้อม Dashboard สำหรับ Production
