SRE Topic Document: Distributed Tracing Systems

This document targets production environments and covers the core operational scenarios for distributed tracing: system architecture, Jaeger/Zipkin deployment, OpenTelemetry integration, and performance analysis.


1. Production Deployment Architecture

1.1 Architecture Diagram (ASCII)

┌─────────────────────────────────────────────────────────────────────────────┐
│                    Distributed Tracing System Architecture                   │
└─────────────────────────────────────────────────────────────────────────────┘

  ┌─────────────────────────────────────────────────────────────────────────┐
  │                            Application Layer                            │
  │  ┌─────────┐   ┌─────────┐   ┌─────────┐   ┌─────────┐   ┌─────────┐   │
  │  │ Service │   │ Service │   │ Service │   │ Service │   │ Service │   │
  │  │    A    │──▶│    B    │──▶│    C    │   │    D    │──▶│    E    │   │
  │  │(SDK)    │   │(SDK)    │   │(SDK)    │   │(SDK)    │   │(SDK)    │   │
  │  └────┬────┘   └────┬────┘   └────┬────┘   └────┬────┘   └────┬────┘   │
  └───────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────┘
          │             │             │             │             │
          └─────────────┴─────────────┴──────┬──────┴─────────────┘
                                             │
                                  ┌──────────▼──────────┐
                                  │   OpenTelemetry     │
                                  │   Collector         │
                                  │   (Collection Tier) │
                                  └──────────┬──────────┘
                                             │
                    ┌────────────────────────┼────────────────────────┐
                    │                        │                        │
                    ▼                        ▼                        ▼
           ┌────────────────┐       ┌────────────────┐       ┌────────────────┐
           │     Jaeger     │       │     Zipkin     │       │     Tempo      │
           │ (Trace Backend)│       │ (Trace Backend)│       │ (Trace Backend)│
           └────────┬───────┘       └────────┬───────┘       └────────┬───────┘
                    │                        │                        │
                    └────────────────────────┼────────────────────────┘
                                             │
                                     ┌───────▼───────┐
                                     │    Storage    │
                                     │ (ES/Cassandra │
                                     │  /S3/TempoDB) │
                                     └───────────────┘

┌─────────────────────────────────────────────────────────────────────────────┐
│  Visualization & Alerting                                                   │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐         │
│  │   Jaeger    │  │   Zipkin    │  │   Grafana   │  │ Prometheus  │         │
│  │     UI      │  │     UI      │  │  Tempo UI   │  │  (Metrics)  │         │
│  └─────────────┘  └─────────────┘  └─────────────┘  └─────────────┘         │
└─────────────────────────────────────────────────────────────────────────────┘

1.2 Kubernetes Deployment Configuration

# jaeger-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger
  namespace: observability
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jaeger
  template:
    metadata:
      labels:
        app: jaeger
    spec:
      containers:
      - name: jaeger
        image: jaegertracing/all-in-one:1.52
        ports:
        - containerPort: 16686
          name: ui
        - containerPort: 14268
          name: collector-http
        - containerPort: 14250
          name: collector-grpc
        - containerPort: 4317
          name: otlp-grpc
        - containerPort: 6831
          name: agent-udp
          protocol: UDP
        env:
        - name: COLLECTOR_ZIPKIN_HOST_PORT
          value: ":9411"
        - name: SPAN_STORAGE_TYPE
          value: elasticsearch
        - name: ES_SERVER_URLS
          value: "http://elasticsearch:9200"
        - name: ES_INDEX_PREFIX
          value: "jaeger"
        resources:
          requests:
            cpu: 200m
            memory: 256Mi
          limits:
            cpu: 500m
            memory: 512Mi
---
apiVersion: v1
kind: Service
metadata:
  name: jaeger
  namespace: observability
spec:
  ports:
  - port: 16686
    targetPort: 16686
    name: ui
  - port: 14268
    targetPort: 14268
    name: collector-http
  - port: 14250
    targetPort: 14250
    name: collector-grpc
  - port: 4317
    targetPort: 4317
    name: otlp-grpc
  - port: 6831
    targetPort: 6831
    protocol: UDP
    name: agent-udp
  - port: 9411
    targetPort: 9411
    name: zipkin
  selector:
    app: jaeger
---
# OpenTelemetry Collector
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
  namespace: observability
spec:
  replicas: 2
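  # Note: tail_sampling requires every span of a trace to land on the same
  # replica; with replicas > 1, route by trace ID (e.g. via a dedicated
  # loadbalancing-exporter tier) or sampling decisions will be incorrect.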
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
      - name: otel-collector
        image: otel/opentelemetry-collector-contrib:0.89.0
        args:
          - "--config=/etc/otelcol/config.yaml"
        ports:
        - containerPort: 4317
          name: otlp-grpc
        - containerPort: 4318
          name: otlp-http
        - containerPort: 9411
          name: zipkin
        volumeMounts:
        - name: config
          mountPath: /etc/otelcol
        resources:
          requests:
            cpu: 200m
            memory: 256Mi
          limits:
            cpu: 500m
            memory: 512Mi
      volumes:
      - name: config
        configMap:
          name: otel-collector-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: observability
data:
  config.yaml: |
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      zipkin:
        endpoint: 0.0.0.0:9411
      jaeger:
        protocols:
          grpc:
            endpoint: 0.0.0.0:14250
          thrift_http:
            endpoint: 0.0.0.0:14268

    processors:
      batch:
        timeout: 1s
        send_batch_size: 1024
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
      tail_sampling:
        decision_wait: 10s
        policies:
          - name: latency-policy
            type: latency
            latency:
              threshold_ms: 500
          - name: error-policy
            type: status_code
            status_code:
              status_codes:
                - ERROR
          - name: probabilistic-policy
            type: probabilistic
            probabilistic:
              sampling_percentage: 10

    exporters:
      # Jaeger v1.35+ ingests OTLP natively and the dedicated jaeger
      # exporter was removed from collector-contrib, so export via OTLP.
      otlp/jaeger:
        endpoint: jaeger:4317
        tls:
          insecure: true
      prometheus:
        endpoint: 0.0.0.0:8889
      logging:
        loglevel: info

    service:
      pipelines:
        traces:
          receivers: [otlp, zipkin, jaeger]
          processors: [memory_limiter, tail_sampling, batch]
          exporters: [otlp/jaeger]
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [prometheus]

1.3 Docker Compose Deployment

# docker-compose.yml - distributed tracing development environment
version: '3.8'

services:
  # Jaeger all-in-one
  jaeger:
    image: jaegertracing/all-in-one:1.52
    container_name: jaeger
    environment:
      - COLLECTOR_ZIPKIN_HOST_PORT=:9411
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
      - ES_INDEX_PREFIX=jaeger
    ports:
      - "16686:16686"   # UI
      - "14268:14268"   # Collector HTTP
      - "14250:14250"   # Collector gRPC
      - "6831:6831/udp" # Agent UDP
      - "9411:9411"     # Zipkin compatible
    networks:
      - tracing-net
    depends_on:
      - elasticsearch

  # Zipkin
  zipkin:
    image: openzipkin/zipkin:latest
    container_name: zipkin
    environment:
      - STORAGE_TYPE=elasticsearch
      - ES_HOSTS=elasticsearch:9200
    ports:
      - "9412:9411"     # UI & API
    networks:
      - tracing-net
    depends_on:
      - elasticsearch

  # Grafana Tempo (lightweight tracing backend)
  tempo:
    image: grafana/tempo:latest
    container_name: tempo
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml
      - tempo-data:/var/tempo
    ports:
      - "3200:3200"     # Tempo HTTP
      - "9095:9095"     # Tempo gRPC
    networks:
      - tracing-net

  # OpenTelemetry Collector
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.89.0
    container_name: otel-collector
    command: ["--config=/etc/otelcol/config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otelcol/config.yaml
    ports:
      - "4317:4317"     # OTLP gRPC
      - "4318:4318"     # OTLP HTTP
      - "8889:8889"     # Prometheus metrics
    networks:
      - tracing-net
    depends_on:
      - jaeger
      - tempo

  # Elasticsearch (storage backend)
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    container_name: elasticsearch
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    volumes:
      - es-data:/usr/share/elasticsearch/data
    ports:
      - "9200:9200"
    networks:
      - tracing-net

  # Grafana (visualization)
  grafana:
    image: grafana/grafana:10.2.0
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - tracing-net
    depends_on:
      - tempo
      - elasticsearch

volumes:
  es-data:
  grafana-data:
  tempo-data:

networks:
  tracing-net:
    driver: bridge

2. OpenTelemetry Integration

2.1 OpenTelemetry SDK Configuration

Python application example

# tracing.py - OpenTelemetry Python SDK setup
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.mysql import MySQLInstrumentor
import os

def init_tracer(service_name: str, service_version: str = "1.0.0"):
    """初始化 OpenTelemetry 追踪器"""

    # Create the resource (service identity)
    resource = Resource.create({
        SERVICE_NAME: service_name,
        SERVICE_VERSION: service_version,
        "deployment.environment": os.getenv("ENV", "production"),
        "service.instance.id": os.getenv("HOSTNAME", "unknown"),
    })

    # Create the TracerProvider
    provider = TracerProvider(resource=resource)

    # Configure the OTLP exporter
    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector:4317"),
        headers={"api-key": os.getenv("OTEL_API_KEY", "")},
    )

    # Attach a batching span processor
    processor = BatchSpanProcessor(
        otlp_exporter,
        max_queue_size=2048,
        schedule_delay_millis=5000,
        export_timeout_millis=30000,
        max_export_batch_size=512,
    )
    provider.add_span_processor(processor)

    # Set the global TracerProvider
    trace.set_tracer_provider(provider)

    # Auto-instrument common libraries
    RequestsInstrumentor().instrument()
    RedisInstrumentor().instrument()
    MySQLInstrumentor().instrument()

    return trace.get_tracer(service_name, service_version)


# Flask application example
from flask import Flask, request
import requests

app = Flask(__name__)

# Initialize tracing
tracer = init_tracer("order-service", "1.0.0")
FlaskInstrumentor().instrument_app(app)

@app.route("/orders", methods=["POST"])
def create_order():
    with tracer.start_as_current_span("create_order") as span:
        # Add span attributes
        span.set_attribute("order.user_id", request.json.get("user_id"))
        span.set_attribute("order.items_count", len(request.json.get("items", [])))

        # Call a downstream service
        with tracer.start_as_current_span("call_inventory") as child_span:
            response = requests.post(
                "http://inventory-service/check",
                json={"items": request.json.get("items")}
            )
            child_span.set_attribute("inventory.status", response.status_code)

        # Database call
        with tracer.start_as_current_span("db_insert_order") as db_span:
            # Simulated database operation
            db_span.set_attribute("db.system", "mysql")
            db_span.set_attribute("db.statement", "INSERT INTO orders...")

        span.set_attribute("order.id", "order-12345")
        return {"order_id": "order-12345", "status": "created"}

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)

Java application example

// TracingConfig.java - OpenTelemetry Java SDK setup
package com.example.tracing;

import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.resources.Resource;
import io.opentelemetry.sdk.trace.SdkTracerProvider;
import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;
import io.opentelemetry.semconv.resource.attributes.ResourceAttributes;

public class TracingConfig {

    private static final String SERVICE_NAME = "payment-service";
    private static final String SERVICE_VERSION = "1.0.0";

    public static OpenTelemetry initOpenTelemetry() {
        // Create the resource
        Resource resource = Resource.builder()
            .put(ResourceAttributes.SERVICE_NAME, SERVICE_NAME)
            .put(ResourceAttributes.SERVICE_VERSION, SERVICE_VERSION)
            .put("deployment.environment", System.getenv().getOrDefault("ENV", "production"))
            .build();

        // Create the OTLP exporter
        String otlpEndpoint = System.getenv().getOrDefault(
            "OTEL_EXPORTER_OTLP_ENDPOINT", 
            "http://otel-collector:4317"
        );

        OtlpGrpcSpanExporter spanExporter = OtlpGrpcSpanExporter.builder()
            .setEndpoint(otlpEndpoint)
            .build();

        // Create the TracerProvider
        SdkTracerProvider tracerProvider = SdkTracerProvider.builder()
            .addSpanProcessor(BatchSpanProcessor.builder(spanExporter)
                .setMaxQueueSize(2048)
                .setScheduleDelay(java.time.Duration.ofMillis(5000))
                .setExporterTimeout(java.time.Duration.ofMillis(30000))
                .setMaxExportBatchSize(512)
                .build())
            .setResource(resource)
            .build();

        // Build the OpenTelemetry instance
        return OpenTelemetrySdk.builder()
            .setTracerProvider(tracerProvider)
            .buildAndRegisterGlobal();
    }

    public static Tracer getTracer() {
        return io.opentelemetry.api.GlobalOpenTelemetry.getTracer(SERVICE_NAME, SERVICE_VERSION);
    }
}

// PaymentService.java - usage example
package com.example.service;

import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Scope;
import com.example.tracing.TracingConfig;

public class PaymentService {

    private final Tracer tracer = TracingConfig.getTracer();

    public PaymentResult processPayment(PaymentRequest request) {
        // Create a span
        Span span = tracer.spanBuilder("process_payment")
            .startSpan();

        try (Scope scope = span.makeCurrent()) {
            // Add attributes
            span.setAttribute("payment.user_id", request.getUserId());
            span.setAttribute("payment.amount", request.getAmount());
            span.setAttribute("payment.currency", request.getCurrency());

            // Call the payment gateway in a child span
            Span gatewaySpan = tracer.spanBuilder("call_payment_gateway")
                .startSpan();
            try (Scope gatewayScope = gatewaySpan.makeCurrent()) {
                PaymentResult result = callPaymentGateway(request);
                gatewaySpan.setAttribute("gateway.response_code", result.getCode());
                return result;
            } finally {
                gatewaySpan.end(); // end on both success and failure paths
            }

        } catch (Exception e) {
            span.recordException(e);
            span.setStatus(StatusCode.ERROR, e.getMessage());
            throw e;
        } finally {
            span.end();
        }
    }
}

Go application example

// tracing.go - OpenTelemetry Go SDK setup
package tracing

import (
    "context"
    "os"
    "time"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/propagation"
    "go.opentelemetry.io/otel/sdk/resource"
    sdktrace "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
    "go.opentelemetry.io/otel/trace"
)

var tracer trace.Tracer

func InitTracer(serviceName, serviceVersion string) (func(context.Context) error, error) {
    ctx := context.Background()

    // Create the OTLP exporter
    exporter, err := otlptracegrpc.New(ctx,
        otlptracegrpc.WithEndpoint(os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")),
        otlptracegrpc.WithInsecure(),
    )
    if err != nil {
        return nil, err
    }

    // Create the resource
    res, err := resource.Merge(
        resource.Default(),
        resource.NewWithAttributes(
            semconv.SchemaURL,
            semconv.ServiceName(serviceName),
            semconv.ServiceVersion(serviceVersion),
            attribute.String("deployment.environment", os.Getenv("ENV")),
        ),
    )
    if err != nil {
        return nil, err
    }

    // Create the TracerProvider (batcher options take time.Duration values)
    tp := sdktrace.NewTracerProvider(
        sdktrace.WithBatcher(exporter,
            sdktrace.WithMaxQueueSize(2048),
            sdktrace.WithBatchTimeout(5*time.Second),
            sdktrace.WithExportTimeout(30*time.Second),
            sdktrace.WithMaxExportBatchSize(512),
        ),
        sdktrace.WithResource(res),
        sdktrace.WithSampler(sdktrace.ParentBased(
            sdktrace.TraceIDRatioBased(0.1), // 10% sampling
        )),
    )

    // Set the global TracerProvider and propagators
    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
        propagation.TraceContext{},
        propagation.Baggage{},
    ))

    // Create the tracer
    tracer = tp.Tracer(serviceName, trace.WithInstrumentationVersion(serviceVersion))

    return tp.Shutdown, nil
}

// Usage example
func ProcessOrder(ctx context.Context, orderID string) error {
    ctx, span := tracer.Start(ctx, "process_order",
        trace.WithAttributes(
            attribute.String("order.id", orderID),
        ),
    )
    defer span.End()

    // Call the inventory service
    ctx, inventorySpan := tracer.Start(ctx, "check_inventory")
    err := checkInventory(ctx, orderID)
    if err != nil {
        inventorySpan.RecordError(err)
        inventorySpan.SetStatus(codes.Error, err.Error())
    }
    inventorySpan.End()

    return err
}

2.2 OpenTelemetry Collector Configuration

# otel-collector-config.yaml
receivers:
  # OTLP receiver (recommended)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
        max_recv_msg_size_mib: 16
      http:
        endpoint: 0.0.0.0:4318
        cors:
          allowed_origins:
            - "*"

  # Jaeger receiver (legacy compatibility)
  jaeger:
    protocols:
      grpc:
        endpoint: 0.0.0.0:14250
      thrift_http:
        endpoint: 0.0.0.0:14268
      thrift_compact:
        endpoint: 0.0.0.0:6831

  # Zipkin receiver (legacy compatibility)
  zipkin:
    endpoint: 0.0.0.0:9411

  # Prometheus receiver (self-monitoring)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          static_configs:
            - targets: ['localhost:8888']

processors:
  # Memory limiter
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Batching
  batch:
    timeout: 1s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Tail-based sampling policies
  tail_sampling:
    decision_wait: 10s
    num_traces: 100000
    expected_new_traces_per_sec: 1000
    policies:
      # Errors - keep 100%
      - name: error-policy
        type: status_code
        status_code:
          status_codes:
            - ERROR

      # Latency - keep traces slower than 500ms
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 500

      # Probabilistic - keep a random 10%
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

      # Critical services - keep 100%
      - name: service-policy
        type: string_attribute
        string_attribute:
          key: service.name
          values:
            - payment-service
            - order-service
            - auth-service

  # Attribute processing
  attributes:
    actions:
      # Extract the host into a new attribute (Go regexp group
      # names cannot contain dots, hence the underscore)
      - key: http.url
        action: extract
        pattern: ^https?://(?P<http_host>[^/]+)
      # Drop sensitive headers
      - key: http.header.authorization
        action: delete
      - key: http.header.cookie
        action: delete

  # Resource processing
  resource:
    attributes:
      - key: deployment.environment
        value: production
        action: insert
      - key: telemetry.sdk.name
        value: opentelemetry
        action: insert

  # Filtering rules
  filter:
    spans:
      # Drop health-check spans
      exclude:
        match_type: strict
        span_names:
          - /health
          - /metrics
          - /ready

exporters:
  # Jaeger exporter: Jaeger v1.35+ ingests OTLP natively and the dedicated
  # jaeger exporter was removed from collector-contrib, so use OTLP
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true
    timeout: 30s

  # Tempo Exporter
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Elasticsearch Exporter
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    index: "traces-%{service.name}"
    flush_interval: 10s

  # Prometheus exporter (metrics)
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otelcol

  # Logging output (debugging)
  logging:
    loglevel: info
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  pprof:
    endpoint: 0.0.0.0:1777
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions:
    - health_check
    - pprof
    - zpages

  pipelines:
    traces:
      receivers:
        - otlp
        - jaeger
        - zipkin
      processors:
        - memory_limiter
        - filter
        - attributes
        - resource
        - tail_sampling
        - batch
      exporters:
        - otlp/jaeger
        - otlp/tempo

    metrics:
      receivers:
        - otlp
        - prometheus
      processors:
        - memory_limiter
        - batch
      exporters:
        - prometheus

    logs:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - batch
      exporters:
        - logging

  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888
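
After the collector is configured, it is worth pushing a single test span through the pipeline to confirm spans actually reach the backend. Below is a minimal sketch, assuming the opentelemetry-sdk and opentelemetry-exporter-otlp Python packages and the otel-collector:4317 endpoint configured above; the span is marked as an error so the tail-sampling error policy keeps it.

# smoke_test.py - minimal sketch: send one test span through the collector
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource, SERVICE_NAME
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.trace import Status, StatusCode

provider = TracerProvider(resource=Resource.create({SERVICE_NAME: "smoke-test"}))
provider.add_span_processor(BatchSpanProcessor(
    OTLPSpanExporter(endpoint="http://otel-collector:4317", insecure=True)
))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("smoke-test")
with tracer.start_as_current_span("pipeline-smoke-test") as span:
    span.set_attribute("test", True)
    # Mark the span as an error so the error policy retains it
    # regardless of the probabilistic sampling percentage.
    span.set_status(Status(StatusCode.ERROR, "smoke test"))

provider.shutdown()  # flushes the batch; the trace should appear in Jaeger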

3. Trace Data Analysis

3.1 Trace Data Model

┌─────────────────────────────────────────────────────────────────────────────┐
│                              Trace Data Model                               │
└─────────────────────────────────────────────────────────────────────────────┘

Trace
├── Trace ID: uniquely identifies one end-to-end request
├── Root Span: the entry-point span
└── Spans: the set of all spans belonging to the trace

Span
├── Trace ID: the trace this span belongs to
├── Span ID: unique identifier of the span
├── Parent Span ID: ID of the parent span (empty for the root span)
├── Operation Name: name of the operation
├── Start Time: start timestamp
├── Duration: elapsed time
├── Tags/Attributes: key-value attributes
├── Logs/Events: timestamped events
├── Status: span status (OK/ERROR)
└── Links: references to related spans

Context
├── Trace Context: Trace ID + Span ID + Trace Flags
├── Baggage: user-defined data propagated across services
└── Propagation: the mechanism that carries context between services
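
The Trace Context above is what actually crosses process boundaries, encoded in the W3C traceparent header. Below is a minimal sketch of inject/extract using the Python SDK's default propagators; both "sides" run in one process purely for illustration.

# propagation_demo.py - minimal sketch of W3C Trace Context propagation
from opentelemetry import trace, propagate
from opentelemetry.sdk.trace import TracerProvider

trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer("propagation-demo")

# Client side: inject the current context into outgoing HTTP headers
with tracer.start_as_current_span("client_request"):
    headers = {}
    propagate.inject(headers)  # adds "traceparent": version-trace_id-span_id-flags
    print(headers["traceparent"])  # e.g. 00-<32 hex>-<16 hex>-01

# Server side: extract the context so the handler joins the same trace
ctx = propagate.extract(headers)
with tracer.start_as_current_span("server_handler", context=ctx) as span:
    # Same trace ID as the client span; the client span becomes the parent
    print(format(span.get_span_context().trace_id, "032x"))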

3.2 Key Performance Indicators

Metric                Description                        Alert threshold
P99 latency           99th-percentile response time      > 1s
Error rate            share of failed requests           > 1%
Throughput            requests per second                sudden 50% drop
Service dependencies  downstream call volume             abnormal growth
Span count            spans per single trace             > 100
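
As a concrete reading of the table, the sketch below computes P99 latency and error rate from a batch of finished spans. The record shape is hypothetical and stands in for whatever your tracing backend returns.

# kpi_sketch.py - compute P99 latency and error rate from span records
import math

def p99(durations_ms):
    """Nearest-rank 99th percentile."""
    ordered = sorted(durations_ms)
    rank = math.ceil(0.99 * len(ordered)) - 1
    return ordered[rank]

def error_rate(spans):
    return sum(1 for s in spans if s["error"]) / len(spans)

spans = [
    {"duration_ms": 120.0, "error": False},
    {"duration_ms": 980.0, "error": True},
    {"duration_ms": 45.0, "error": False},
]

print(f"P99: {p99([s['duration_ms'] for s in spans]):.1f} ms")  # alert if > 1000 ms
print(f"Error rate: {error_rate(spans):.1%}")                   # alert if > 1%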

3.3 Performance Analysis Commands

# ==============================================================================
# Jaeger query commands
# ==============================================================================

# Query traces via the API
curl "http://jaeger:16686/api/traces?service=order-service&limit=20"

# Fetch a specific trace
curl "http://jaeger:16686/api/traces/{trace-id}"

# Query traces tagged with errors
curl "http://jaeger:16686/api/traces?service=payment-service&tags=%7B%22error%22:true%7D"

# Query slow traces (> 1s)
curl "http://jaeger:16686/api/traces?service=api-gateway&minDuration=1s"

# Query a specific operation
curl "http://jaeger:16686/api/traces?service=user-service&operation=POST%20/users"

# List services
curl "http://jaeger:16686/api/services"

# List operations for a service
curl "http://jaeger:16686/api/services/{service}/operations"

# ==============================================================================
# Elasticsearch queries
# ==============================================================================

# Query recent spans for a service
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "query": {
    "bool": {
      "must": [
        {"term": {"process.serviceName": "order-service"}}
      ]
    }
  },
  "sort": [{"startTimeMillis": "desc"}],
  "size": 10
}'

# Query error spans
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "query": {
    "term": {"tags.error": true}
  },
  "size": 100
}'

# Aggregation - P99 latency per service
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "size": 0,
  "aggs": {
    "services": {
      "terms": {"field": "process.serviceName"},
      "aggs": {
        "p99_duration": {
          "percentiles": {
            "field": "duration",
            "percents": [99]
          }
        }
      }
    }
  }
}'

# Error-rate analysis
curl -X POST "http://elasticsearch:9200/jaeger-span-*/_search" -H 'Content-Type: application/json' -d'
{
  "size": 0,
  "aggs": {
    "total": {"value_count": {"field": "traceID"}},
    "errors": {
      "filter": {"term": {"tags.error": true}}
    }
  }
}'
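
The last query returns a total span count and an error bucket, but not the ratio itself. A minimal Python sketch, assuming the requests package and the jaeger-span-* indices used above, derives the error rate:

# error_rate_from_es.py - run the error-rate aggregation and derive the ratio
import requests

query = {
    "size": 0,
    "aggs": {
        "total": {"value_count": {"field": "traceID"}},
        "errors": {"filter": {"term": {"tags.error": True}}},
    },
}

resp = requests.post(
    "http://elasticsearch:9200/jaeger-span-*/_search",
    json=query,
    timeout=10,
).json()

total = resp["aggregations"]["total"]["value"]
errors = resp["aggregations"]["errors"]["doc_count"]
print(f"error rate: {errors / max(total, 1):.2%}")  # compare against the 1% threshold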

4. Monitoring & Alerting

4.1 Prometheus Metrics Configuration

# prometheus-tracing.yml
scrape_configs:
  - job_name: 'otel-collector'
    static_configs:
      - targets: ['otel-collector:8889']
        labels:
          component: 'otel-collector'

  - job_name: 'jaeger'
    static_configs:
      - targets: ['jaeger:14269']
        labels:
          component: 'jaeger'

  - job_name: 'tempo'
    static_configs:
      - targets: ['tempo:3200']
        labels:
          component: 'tempo'

4.2 Alert Rules

# tracing-alerts.yml
groups:
  - name: tracing-alerts
    rules:
      # P99 latency alert
      - alert: HighP99Latency
        expr: |
          histogram_quantile(0.99, 
            sum(rate(http_server_requests_duration_seconds_bucket[5m])) by (le, service)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Service P99 latency is too high"
          description: "Service {{ $labels.service }} P99 latency is {{ $value | printf \"%.2f\" }}s"
