SRE 话题文档:分布式追踪系统
本文档面向生产环境,涵盖分布式追踪系统架构、Jaeger/Zipkin 部署、OpenTelemetry 集成、性能分析等核心运维场景。
1. 生产环境部署架构
1.1 架构图(ASCII)
┌─────────────────────────────────────────────────────────────────────────────┐
│ 分布式追踪系统架构 │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ 应用层 │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │ Service │ │ Service │ │ Service │ │ Service │ │ Service │ │
│ │ A │──▶│ B │──▶│ C │ │ D │──▶│ E │ │
│ │(SDK) │ │(SDK) │ │(SDK) │ │(SDK) │ │(SDK) │ │
│ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │
└───────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────┘
│ │ │ │ │
└─────────────┴─────────────┴──────┬──────┴─────────────┘
│
┌──────▼──────┐
│ OpenTelemetry │
│ Collector │
│ (采集层) │
└──────┬──────┘
│
┌──────────────────────────┼──────────────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Jaeger │ │ Zipkin │ │ Tempo │
│ (追踪后端) │ │ (追踪后端) │ │ (追踪后端) │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
└──────────────────────────┼──────────────────────────┘
│
┌───────▼───────┐
│ Storage │
│ (ES/Cassandra)│
│ /S3/TempoDB │
└───────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│ 可视化 & 告警 │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Jaeger │ │ Zipkin │ │ Grafana │ │ Prometheus │ │
│ │ UI │ │ UI │ │ Tempo UI │ │ (指标) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
1.2 Kubernetes 部署配置
# jaeger-deployment.yaml
# Jaeger all-in-one (Elasticsearch-backed) plus an OpenTelemetry Collector
# deployment that tail-samples traces before forwarding them to Jaeger.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger
  namespace: observability
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jaeger
  template:
    metadata:
      labels:
        app: jaeger
    spec:
      containers:
        - name: jaeger
          image: jaegertracing/all-in-one:1.52
          ports:
            - containerPort: 16686
              name: ui
            - containerPort: 14268
              name: collector-http
            - containerPort: 14250
              name: collector-grpc
            # Native OTLP ingest (enabled by default since Jaeger 1.43);
            # used by the collector's otlp/jaeger exporter below.
            - containerPort: 4317
              name: otlp-grpc
            - containerPort: 6831
              name: agent-udp
              protocol: UDP
          env:
            - name: COLLECTOR_ZIPKIN_HOST_PORT
              value: ":9411"
            - name: SPAN_STORAGE_TYPE
              value: elasticsearch
            - name: ES_SERVER_URLS
              value: "http://elasticsearch:9200"
            - name: ES_INDEX_PREFIX
              value: "jaeger"
          resources:
            requests:
              cpu: 200m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
---
apiVersion: v1
kind: Service
metadata:
  name: jaeger
  namespace: observability
spec:
  ports:
    - port: 16686
      targetPort: 16686
      name: ui
    - port: 14268
      targetPort: 14268
      name: collector-http
    - port: 14250
      targetPort: 14250
      name: collector-grpc
    - port: 4317
      targetPort: 4317
      name: otlp-grpc
    - port: 6831
      targetPort: 6831
      protocol: UDP
      name: agent-udp
    - port: 9411
      targetPort: 9411
      name: zipkin
  selector:
    app: jaeger
---
# OpenTelemetry Collector
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
  namespace: observability
spec:
  replicas: 2
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.89.0
          args:
            - "--config=/etc/otelcol/config.yaml"
          ports:
            - containerPort: 4317
              name: otlp-grpc
            - containerPort: 4318
              name: otlp-http
            - containerPort: 9411
              name: zipkin
          volumeMounts:
            - name: config
              mountPath: /etc/otelcol
          resources:
            requests:
              cpu: 200m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: config
          configMap:
            name: otel-collector-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: observability
data:
  config.yaml: |
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      zipkin:
        endpoint: 0.0.0.0:9411
      jaeger:
        protocols:
          grpc:
            endpoint: 0.0.0.0:14250
          thrift_http:
            endpoint: 0.0.0.0:14268
    processors:
      batch:
        timeout: 1s
        send_batch_size: 1024
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
      # NOTE(review): with replicas: 2, each collector only sees part of a
      # trace; tail sampling needs a trace-ID-aware load balancer in front
      # (e.g. the loadbalancing exporter) to be fully correct.
      tail_sampling:
        decision_wait: 10s
        policies:
          - name: latency-policy
            type: latency
            latency:
              threshold_ms: 500
          - name: error-policy
            type: status_code
            status_code:
              status_codes:
                - ERROR
          - name: probabilistic-policy
            type: probabilistic
            probabilistic:
              sampling_percentage: 10
    exporters:
      # The dedicated `jaeger` exporter was removed from the collector before
      # v0.89; export OTLP directly to Jaeger's native OTLP port instead.
      otlp/jaeger:
        endpoint: jaeger:4317
        tls:
          insecure: true
      prometheus:
        endpoint: 0.0.0.0:8889
      logging:
        loglevel: info
    service:
      pipelines:
        traces:
          receivers: [otlp, zipkin, jaeger]
          # memory_limiter first; sample whole traces before batching.
          processors: [memory_limiter, tail_sampling, batch]
          exporters: [otlp/jaeger]
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [prometheus]
1.3 Docker Compose 部署
# docker-compose.yml - local development stack for distributed tracing
version: '3.8'

services:
  # Jaeger all-in-one (UI + collector + query, Elasticsearch-backed)
  jaeger:
    image: jaegertracing/all-in-one:1.52
    container_name: jaeger
    environment:
      - COLLECTOR_ZIPKIN_HOST_PORT=:9411
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
      - ES_INDEX_PREFIX=jaeger
    ports:
      - "16686:16686"    # UI
      - "14268:14268"    # collector HTTP
      - "14250:14250"    # collector gRPC
      - "6831:6831/udp"  # agent UDP
      - "9411:9411"      # Zipkin-compatible endpoint
    networks:
      - tracing-net
    depends_on:
      - elasticsearch

  # Zipkin (mapped to host port 9412 to avoid clashing with Jaeger's 9411)
  zipkin:
    image: openzipkin/zipkin:latest
    container_name: zipkin
    environment:
      - STORAGE_TYPE=elasticsearch
      - ES_HOSTS=elasticsearch:9200
    ports:
      - "9412:9411"  # UI & API
    networks:
      - tracing-net
    depends_on:
      - elasticsearch

  # Grafana Tempo (lightweight tracing backend)
  tempo:
    image: grafana/tempo:latest
    container_name: tempo
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml
      - tempo-data:/var/tempo
    ports:
      - "3200:3200"  # Tempo HTTP
      - "9095:9095"  # Tempo gRPC
    networks:
      - tracing-net

  # OpenTelemetry Collector (single ingest point for all SDKs)
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.89.0
    container_name: otel-collector
    command: ["--config=/etc/otelcol/config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otelcol/config.yaml
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      - "8889:8889"  # Prometheus metrics
    networks:
      - tracing-net
    depends_on:
      - jaeger
      - tempo

  # Elasticsearch (storage backend for Jaeger/Zipkin)
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    container_name: elasticsearch
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    volumes:
      - es-data:/usr/share/elasticsearch/data
    ports:
      - "9200:9200"
    networks:
      - tracing-net

  # Grafana (visualization)
  grafana:
    image: grafana/grafana:10.2.0
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - tracing-net
    depends_on:
      - tempo
      - elasticsearch

volumes:
  es-data:
  grafana-data:
  tempo-data:

networks:
  tracing-net:
    driver: bridge
2. OpenTelemetry 集成
2.1 OpenTelemetry SDK 配置
Python 应用示例
# tracing.py - OpenTelemetry Python SDK configuration
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.mysql import MySQLInstrumentor
import os


def init_tracer(service_name: str, service_version: str = "1.0.0"):
    """Initialise OpenTelemetry tracing for this process.

    Builds a service-identifying Resource, wires an OTLP/gRPC exporter
    behind a batching span processor, installs the provider globally, and
    auto-instruments common client libraries (requests, redis, mysql).

    Args:
        service_name: Logical service name reported with every span.
        service_version: Version string attached to the service resource.

    Returns:
        A ``Tracer`` scoped to ``service_name``/``service_version``.
    """
    # Resource attributes identify this process in the tracing backend.
    resource = Resource.create({
        SERVICE_NAME: service_name,
        SERVICE_VERSION: service_version,
        "deployment.environment": os.getenv("ENV", "production"),
        "service.instance.id": os.getenv("HOSTNAME", "unknown"),
    })

    provider = TracerProvider(resource=resource)

    # OTLP exporter; endpoint and credentials come from the environment.
    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector:4317"),
        headers={"api-key": os.getenv("OTEL_API_KEY", "")},
    )

    # Batch spans to amortise per-span export overhead.
    processor = BatchSpanProcessor(
        otlp_exporter,
        max_queue_size=2048,
        schedule_delay_millis=5000,
        export_timeout_millis=30000,
        max_export_batch_size=512,
    )
    provider.add_span_processor(processor)

    # Install as the process-wide tracer provider.
    trace.set_tracer_provider(provider)

    # Auto-instrument common client libraries.
    RequestsInstrumentor().instrument()
    RedisInstrumentor().instrument()
    MySQLInstrumentor().instrument()

    return trace.get_tracer(service_name, service_version)
# Flask application example
from flask import Flask, request
import requests

app = Flask(__name__)

# Initialise tracing and auto-instrument the Flask app.
tracer = init_tracer("order-service", "1.0.0")
FlaskInstrumentor().instrument_app(app)


@app.route("/orders", methods=["POST"])
def create_order():
    """Create an order, tracing the downstream inventory call and DB write."""
    with tracer.start_as_current_span("create_order") as span:
        # Annotate the root span with business attributes.
        span.set_attribute("order.user_id", request.json.get("user_id"))
        span.set_attribute("order.items_count", len(request.json.get("items", [])))

        # Downstream HTTP call to the inventory service (child span).
        with tracer.start_as_current_span("call_inventory") as child_span:
            response = requests.post(
                "http://inventory-service/check",
                json={"items": request.json.get("items")}
            )
            child_span.set_attribute("inventory.status", response.status_code)

        # Database write (simulated here).
        with tracer.start_as_current_span("db_insert_order") as db_span:
            db_span.set_attribute("db.system", "mysql")
            db_span.set_attribute("db.statement", "INSERT INTO orders...")

        span.set_attribute("order.id", "order-12345")
        return {"order_id": "order-12345", "status": "created"}


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
Java 应用示例
// TracingConfig.java - OpenTelemetry Java SDK configuration
package com.example.tracing;

import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.resources.Resource;
import io.opentelemetry.sdk.trace.SdkTracerProvider;
import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;
import io.opentelemetry.semconv.resource.attributes.ResourceAttributes;

public class TracingConfig {

    private static final String SERVICE_NAME = "payment-service";
    private static final String SERVICE_VERSION = "1.0.0";

    /**
     * Builds the OpenTelemetry SDK and registers it as the global instance.
     *
     * <p>Configures a service-identifying {@link Resource}, an OTLP/gRPC span
     * exporter (endpoint overridable via {@code OTEL_EXPORTER_OTLP_ENDPOINT})
     * and a batching span processor.
     *
     * @return the globally registered {@link OpenTelemetry} instance
     */
    public static OpenTelemetry initOpenTelemetry() {
        // Resource attributes identifying this service instance.
        Resource resource = Resource.builder()
                .put(ResourceAttributes.SERVICE_NAME, SERVICE_NAME)
                .put(ResourceAttributes.SERVICE_VERSION, SERVICE_VERSION)
                .put("deployment.environment", System.getenv().getOrDefault("ENV", "production"))
                .build();

        // OTLP gRPC exporter; endpoint comes from the environment.
        String otlpEndpoint = System.getenv().getOrDefault(
                "OTEL_EXPORTER_OTLP_ENDPOINT",
                "http://otel-collector:4317"
        );
        OtlpGrpcSpanExporter spanExporter = OtlpGrpcSpanExporter.builder()
                .setEndpoint(otlpEndpoint)
                .build();

        // Tracer provider with a batching span processor.
        SdkTracerProvider tracerProvider = SdkTracerProvider.builder()
                .addSpanProcessor(BatchSpanProcessor.builder(spanExporter)
                        .setMaxQueueSize(2048)
                        .setScheduleDelay(java.time.Duration.ofMillis(5000))
                        .setExporterTimeout(java.time.Duration.ofMillis(30000))
                        .setMaxExportBatchSize(512)
                        .build())
                .setResource(resource)
                .build();

        // Build the SDK and register it as the global OpenTelemetry instance.
        return OpenTelemetrySdk.builder()
                .setTracerProvider(tracerProvider)
                .buildAndRegisterGlobal();
    }

    /** Returns a tracer scoped to this service's name and version. */
    public static Tracer getTracer() {
        return io.opentelemetry.api.GlobalOpenTelemetry.getTracer(SERVICE_NAME, SERVICE_VERSION);
    }
}
// PaymentService.java - usage example
package com.example.service;

import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Scope;
import com.example.tracing.TracingConfig;

public class PaymentService {

    private final Tracer tracer = TracingConfig.getTracer();

    /**
     * Processes a payment inside a {@code process_payment} span, with the
     * gateway call traced as a child span. Exceptions are recorded on the
     * span and re-thrown; both spans are always ended.
     */
    public PaymentResult processPayment(PaymentRequest request) {
        Span span = tracer.spanBuilder("process_payment")
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            // Business attributes on the root span.
            span.setAttribute("payment.user_id", request.getUserId());
            span.setAttribute("payment.amount", request.getAmount());
            span.setAttribute("payment.currency", request.getCurrency());

            // Gateway call traced as a child span. The original code only
            // ended this span on the success path, leaking it when
            // callPaymentGateway threw — end it in a finally block instead.
            Span gatewaySpan = tracer.spanBuilder("call_payment_gateway")
                    .startSpan();
            try (Scope gatewayScope = gatewaySpan.makeCurrent()) {
                PaymentResult result = callPaymentGateway(request);
                gatewaySpan.setAttribute("gateway.response_code", result.getCode());
                return result;
            } finally {
                gatewaySpan.end();
            }
        } catch (Exception e) {
            // Record the failure on the root span before propagating.
            span.recordException(e);
            span.setStatus(StatusCode.ERROR, e.getMessage());
            throw e;
        } finally {
            span.end();
        }
    }
}
Go 应用示例
// tracing.go - OpenTelemetry Go SDK 配置
package tracing
import (
"context"
"os"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
"go.opentelemetry.io/otel/trace"
)
var tracer trace.Tracer
func InitTracer(serviceName, serviceVersion string) (func(context.Context) error, error) {
ctx := context.Background()
// 创建 OTLP Exporter
exporter, err := otlptracegrpc.New(ctx,
otlptracegrpc.WithEndpoint(os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")),
otlptracegrpc.WithInsecure(),
)
if err != nil {
return nil, err
}
// 创建资源
res, err := resource.Merge(
resource.Default(),
resource.NewWithAttributes(
semconv.SchemaURL,
semconv.ServiceName(serviceName),
semconv.ServiceVersion(serviceVersion),
attribute.String("deployment.environment", os.Getenv("ENV")),
),
)
if err != nil {
return nil, err
}
// 创建 TracerProvider
tp := sdktrace.NewTracerProvider(
sdktrace.WithBatcher(exporter,
sdktrace.WithMaxQueueSize(2048),
sdktrace.WithBatchTimeout(5000),
sdktrace.WithExportTimeout(30000),
sdktrace.WithMaxExportBatchSize(512),
),
sdktrace.WithResource(res),
sdktrace.WithSampler(sdktrace.ParentBased(
sdktrace.TraceIDRatioBased(0.1), // 10% 采样
)),
)
// 设置全局 TracerProvider
otel.SetTracerProvider(tp)
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
propagation.TraceContext{},
propagation.Baggage{},
))
// 创建 Tracer
tracer = tp.Tracer(serviceName, trace.WithInstrumentationVersion(serviceVersion))
return tp.Shutdown, nil
}
// 使用示例
func ProcessOrder(ctx context.Context, orderID string) error {
ctx, span := tracer.Start(ctx, "process_order",
trace.WithAttributes(
attribute.String("order.id", orderID),
),
)
defer span.End()
// 调用库存服务
ctx, inventorySpan := tracer.Start(ctx, "check_inventory")
err := checkInventory(ctx, orderID)
if err != nil {
inventorySpan.RecordError(err)
inventorySpan.SetStatus(codes.Error, err.Error())
}
inventorySpan.End()
return err
}
2.2 OpenTelemetry Collector 配置
# otel-collector-config.yaml
receivers:
  # OTLP receiver (preferred ingest path)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
        max_recv_msg_size_mib: 16
      http:
        endpoint: 0.0.0.0:4318
        cors:
          allowed_origins:
            - "*"
  # Jaeger receiver (compatibility with legacy SDKs)
  jaeger:
    protocols:
      grpc:
        endpoint: 0.0.0.0:14250
      thrift_http:
        endpoint: 0.0.0.0:14268
      thrift_compact:
        endpoint: 0.0.0.0:6831
  # Zipkin receiver (compatibility with legacy SDKs)
  zipkin:
    endpoint: 0.0.0.0:9411
  # Prometheus receiver (collector self-monitoring)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          static_configs:
            - targets: ['localhost:8888']

processors:
  # Memory guard — should be the first processor in every pipeline
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128
  # Batching — keep last so per-trace sampling decisions happen first
  batch:
    timeout: 1s
    send_batch_size: 1024
    send_batch_max_size: 2048
  # Tail-based sampling. NOTE(review): all spans of a trace must reach the
  # same collector instance; scale out behind a trace-ID-aware load balancer.
  tail_sampling:
    decision_wait: 10s
    num_traces: 100000
    expected_new_traces_per_sec: 1000
    policies:
      # Error traces — keep 100%
      - name: error-policy
        type: status_code
        status_code:
          status_codes:
            - ERROR
      # Slow traces — keep anything over 500 ms
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 500
      # Random 10% of everything else
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10
      # Business-critical services — keep 100%
      - name: service-policy
        type: string_attribute
        string_attribute:
          key: service.name
          values:
            - payment-service
            - order-service
            - auth-service
  # Attribute manipulation
  attributes:
    actions:
      # Derive http.host from the URL
      - key: http.url
        action: extract
        pattern: ^https?://(?P<http.host>[^/]+)
      # Strip sensitive request headers
      - key: http.header.authorization
        action: delete
      - key: http.header.cookie
        action: delete
  # Resource attributes stamped onto every span
  resource:
    attributes:
      - key: deployment.environment
        value: production
        action: insert
      - key: telemetry.sdk.name
        value: opentelemetry
        action: insert
  # Drop noise such as health-check spans
  filter:
    spans:
      exclude:
        match_type: strict
        span_names:
          - /health
          - /metrics
          - /ready

exporters:
  # The dedicated `jaeger` exporter was removed from the collector before
  # v0.89; export OTLP directly to Jaeger (native OTLP ingest since 1.43).
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true
    timeout: 30s
  # Grafana Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  # Optional: raw span export to Elasticsearch (not wired into a pipeline)
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    index: "traces-%{service.name}"
    flush_interval: 10s
  # Pipeline metrics for Prometheus
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otelcol
  # Console output for debugging. NOTE(review): `logging` (and its
  # `loglevel` option) is deprecated in newer releases in favour of `debug`.
  logging:
    loglevel: info
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  pprof:
    endpoint: 0.0.0.0:1777
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions:
    - health_check
    - pprof
    - zpages
  pipelines:
    traces:
      receivers:
        - otlp
        - jaeger
        - zipkin
      processors:
        - memory_limiter
        - filter
        - attributes
        - resource
        - tail_sampling
        - batch
      exporters:
        - otlp/jaeger
        - otlp/tempo
    metrics:
      receivers:
        - otlp
        - prometheus
      processors:
        - memory_limiter
        - batch
      exporters:
        - prometheus
    logs:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - batch
      exporters:
        - logging
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888
3. 追踪数据分析
3.1 追踪数据模型
┌─────────────────────────────────────────────────────────────────────────────┐
│ 追踪数据模型 │
└─────────────────────────────────────────────────────────────────────────────┘
Trace(追踪)
├── Trace ID: 唯一标识一次完整请求
├── Root Span: 入口 Span
└── Spans: 所有相关 Span 集合
Span(跨度)
├── Trace ID: 所属 Trace
├── Span ID: Span 唯一标识
├── Parent Span ID: 父 Span ID(根 Span 为空)
├── Operation Name: 操作名称
├── Start Time: 开始时间
├── Duration: 持续时间
├── Tags/Attributes: 属性键值对
├── Logs/Events: 事件日志
├── Status: 状态(OK/ERROR)
└── Links: 关联其他 Span
Context(上下文)
├── Trace Context: Trace ID + Span ID + Trace Flags
├── Baggage: 跨服务传递的用户数据
└── Propagation: 上下文传播机制
3.2 关键性能指标
| 指标 | 说明 | 告警阈值 |
|---|---|---|
| P99 延迟 | 99 分位响应时间 | > 1s |
| 错误率 | 失败请求占比 | > 1% |
| 吞吐量 | 每秒请求数 | 突降 50% |
| 服务依赖 | 下游调用次数 | 异常增长 |
| Span 数量 | 单 Trace Span 数 | > 100 |
3.3 性能分析命令
# ==============================================================================
# Jaeger query commands
# ==============================================================================

# List recent traces for a service via the Jaeger HTTP API
curl "http://jaeger:16686/api/traces?service=order-service&limit=20"

# Fetch a specific trace by ID
curl "http://jaeger:16686/api/traces/{trace-id}"

# Find error traces (tags parameter is URL-encoded JSON: {"error":true})
curl "http://jaeger:16686/api/traces?service=payment-service&tags=%7B%22error%22:true%7D"

# Find slow traces (duration > 1s)
curl "http://jaeger:16686/api/traces?service=api-gateway&minDuration=1s"

# Find traces for a specific operation
curl "http://jaeger:16686/api/traces?service=user-service&operation=POST%20/users"

# List known services
curl "http://jaeger:16686/api/services"

# List operations for a service
curl "http://jaeger:16686/api/services/{service}/operations"

# ==============================================================================
# Elasticsearch queries against Jaeger span indices
# ==============================================================================

# Most recent spans for a service
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "query": {
    "bool": {
      "must": [
        {"term": {"process.serviceName": "order-service"}}
      ]
    }
  },
  "sort": [{"startTimeMillis": "desc"}],
  "size": 10
}'

# Spans flagged with error=true
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "query": {
    "term": {"tags.error": true}
  },
  "size": 100
}'

# Aggregation: P99 span duration per service
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "size": 0,
  "aggs": {
    "services": {
      "terms": {"field": "process.serviceName"},
      "aggs": {
        "p99_duration": {
          "percentiles": {
            "field": "duration",
            "percents": [99]
          }
        }
      }
    }
  }
}'

# Error-rate inputs: total span count vs. error span count
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "size": 0,
  "aggs": {
    "total": {"value_count": {"field": "traceID"}},
    "errors": {
      "filter": {"term": {"tags.error": true}}
    }
  }
}'
4. 监控与告警
4.1 Prometheus 指标配置
# prometheus-tracing.yml
# Scrape targets for the tracing pipeline's own metrics.
scrape_configs:
  - job_name: 'otel-collector'
    static_configs:
      - targets: ['otel-collector:8889']
        labels:
          component: 'otel-collector'
  - job_name: 'jaeger'
    static_configs:
      - targets: ['jaeger:14269']
        labels:
          component: 'jaeger'
  - job_name: 'tempo'
    static_configs:
      - targets: ['tempo:3200']
        labels:
          component: 'tempo'
4.2 告警规则
# tracing-alerts.yml
groups:
  - name: tracing-alerts
    rules:
      # Alert when a service's P99 request latency exceeds 1s for 5 minutes.
      - alert: HighP99Latency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_server_requests_duration_seconds_bucket[5m])) by (le, service)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "服务 P99 延迟过高"
          description: "服务 {{ $labels.service }} P99 延迟 {{ $value | printf \"%.2f\" }}s"