SRE 话题文档:Elasticsearch 集群运维
本文档面向生产环境,涵盖 Elasticsearch 集群架构部署、索引管理、性能调优、监控告警等核心运维场景。
1. 生产环境部署架构
1.1 架构图(ASCII)
┌─────────────────────────────────────────────────────────────────────────────┐
│ Elasticsearch 生产集群架构 │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────┐
│ Kibana UI │
│ (可视化界面) │
└────────┬────────┘
│
┌──────────────────┼──────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Node-01 │ │ Node-02 │ │ Node-03 │
│ (Master+Data) │ │ (Master+Data) │ │ (Master+Data) │
│ │ │ │ │ │
│ ┌───────────┐ │ │ ┌───────────┐ │ │ ┌───────────┐ │
│ │ Index 1 │ │ │ │ Index 1 │ │ │ │ Index 2 │ │
│ │ (Shard 0) │ │ │ │ (Shard 1) │ │ │ │ (Shard 0) │ │
│ ├───────────┤ │ │ ├───────────┤ │ │ ├───────────┤ │
│ │ Index 2 │ │ │ │ Index 3 │ │ │ │ Index 3 │ │
│ │ (Shard 1) │ │ │ │ (Shard 0) │ │ │ │ (Shard 1) │ │
│ └───────────┘ │ │ └───────────┘ │ │ └───────────┘ │
│ │ │ │ │ │
│ Port: 9200 │ │ Port: 9200 │ │ Port: 9200 │
│ Port: 9300 │ │ Port: 9300 │ │ Port: 9300 │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
└──────────────────┼──────────────────┘
│
┌───────┴───────┐
│ Load Balancer │
│ (Nginx/HAProxy)│
└───────┬───────┘
│
┌───────┴───────┐
│ Logstash │
│ (数据摄入) │
└───────┬───────┘
│
┌───────┴───────┐
│ Beats │
│ (Filebeat/ │
│ Metricbeat) │
└───────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│ 监控组件 │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │ │ Grafana │ │ Cerebro │ │ Curator │ │
│ │ (监控) │ │ (可视化) │ │ (集群管理) │ │ (索引清理) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
1.2 Kubernetes 部署配置
# elasticsearch-cluster.yaml - deployed via the ECK Operator
apiVersion: elasticsearch.k8s.elastic.co/v1
kind: Elasticsearch
metadata:
  name: elasticsearch
  namespace: elastic
spec:
  version: 8.11.0
  auth:
    # File-realm users loaded from a pre-created Secret
    fileRealm:
      - secretName: elastic-users
  http:
    tls:
      # Disable the operator's self-signed HTTP TLS certificate
      selfSignedCertificate:
        disabled: true
  nodeSets:
    # Combined master + data + ingest nodes
    - name: master-data
      count: 3
      config:
        node.roles: ["master", "data", "ingest"]
        node.store.allow_mmap: false
        cluster.routing.allocation.awareness.attributes: zone
      podTemplate:
        spec:
          initContainers:
            # Elasticsearch requires vm.max_map_count >= 262144 on the host
            - name: sysctl
              image: busybox
              command: ['sysctl', '-w', 'vm.max_map_count=262144']
              securityContext:
                privileged: true
          containers:
            - name: elasticsearch
              resources:
                requests:
                  memory: 8Gi
                  cpu: 2
                limits:
                  memory: 8Gi
                  cpu: 4
              env:
                # Heap = 50% of the container memory limit
                - name: ES_JAVA_OPTS
                  value: "-Xms4g -Xmx4g"
      # ECK provisions one PVC per pod from this template and mounts it at
      # /usr/share/elasticsearch/data automatically; the claim MUST be named
      # "elasticsearch-data". Do NOT also declare a static `volumes:` entry
      # with a fixed claimName — the original mixed both (volumeMount "data"
      # vs claim template "elasticsearch-data"), and one ReadWriteOnce claim
      # cannot back three pods.
      volumeClaimTemplates:
        - metadata:
            name: elasticsearch-data
          spec:
            accessModes:
              - ReadWriteOnce
            resources:
              requests:
                storage: 500Gi
            storageClassName: fast-ssd
---
# Kibana — single instance, connected to the ECK-managed cluster above
apiVersion: kibana.k8s.elastic.co/v1
kind: Kibana
metadata:
  name: kibana
  namespace: elastic
spec:
  version: 8.11.0
  count: 1
  # Reference the Elasticsearch resource by name (same namespace)
  elasticsearchRef:
    name: elasticsearch
  http:
    tls:
      # Plain HTTP; TLS handled outside the cluster if required
      selfSignedCertificate:
        disabled: true
  podTemplate:
    spec:
      containers:
        - name: kibana
          resources:
            requests:
              memory: 1Gi
              cpu: 0.5
            limits:
              memory: 2Gi
              cpu: 1
---
# Logstash — two replicas ingesting from Beats into Elasticsearch
apiVersion: logstash.k8s.elastic.co/v1alpha1
kind: Logstash
metadata:
  name: logstash
  namespace: elastic
spec:
  version: 8.11.0
  count: 2
  elasticsearchRefs:
    - name: elasticsearch
      clusterName: production
  pipelines:
    - pipeline.id: main
      config.string: |
        input {
          beats {
            port => 5044
          }
        }
        output {
          elasticsearch {
            # ECK injects connection details for each elasticsearchRef as
            # <CLUSTERNAME>_ES_* environment variables; with
            # clusterName "production" the variables are PRODUCTION_ES_*.
            # The original ${ES_HOSTS}/${ES_USER}/${ES_PASSWORD} are never set.
            hosts => [ "${PRODUCTION_ES_HOSTS}" ]
            user => "${PRODUCTION_ES_USERNAME}"
            password => "${PRODUCTION_ES_PASSWORD}"
            # Cluster HTTP TLS is disabled (selfSignedCertificate.disabled)
            ssl => false
            index => "logs-%{+YYYY.MM.dd}"
          }
        }
  podTemplate:
    spec:
      containers:
        - name: logstash
          resources:
            requests:
              memory: 2Gi
              cpu: 1
            limits:
              memory: 4Gi
              cpu: 2
1.3 Docker Compose 部署
# docker-compose.yml - Elasticsearch development environment
version: '3.8'

services:
  # Elasticsearch node 1 (the only node exposed on the host)
  es01:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    container_name: es01
    environment:
      - node.name=es01
      - cluster.name=es-docker-cluster
      - discovery.seed_hosts=es02,es03
      - cluster.initial_master_nodes=es01,es02,es03
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
      - xpack.security.enabled=false
      - xpack.security.enrollment.enabled=false
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - es01-data:/usr/share/elasticsearch/data
    ports:
      - "9200:9200"
      - "9300:9300"
    networks:
      - elastic-net
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5

  # Elasticsearch node 2
  es02:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    container_name: es02
    environment:
      - node.name=es02
      - cluster.name=es-docker-cluster
      - discovery.seed_hosts=es01,es03
      - cluster.initial_master_nodes=es01,es02,es03
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
      - xpack.security.enabled=false
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - es02-data:/usr/share/elasticsearch/data
    networks:
      - elastic-net

  # Elasticsearch node 3
  es03:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    container_name: es03
    environment:
      - node.name=es03
      - cluster.name=es-docker-cluster
      - discovery.seed_hosts=es01,es02
      - cluster.initial_master_nodes=es01,es02,es03
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
      - xpack.security.enabled=false
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - es03-data:/usr/share/elasticsearch/data
    networks:
      - elastic-net

  # Kibana — waits for es01 to report healthy before starting
  kibana:
    image: docker.elastic.co/kibana/kibana:8.11.0
    container_name: kibana
    environment:
      - ELASTICSEARCH_HOSTS=http://es01:9200
      - SERVER_NAME=kibana
      - SERVER_HOST=0.0.0.0
    ports:
      - "5601:5601"
    networks:
      - elastic-net
    depends_on:
      es01:
        condition: service_healthy

  # Logstash — pipeline definitions mounted from the host
  logstash:
    image: docker.elastic.co/logstash/logstash:8.11.0
    container_name: logstash
    volumes:
      - ./logstash/pipeline:/usr/share/logstash/pipeline
    ports:
      - "5044:5044"
    networks:
      - elastic-net
    depends_on:
      - es01

  # Filebeat — runs as root to read host and container logs
  filebeat:
    image: docker.elastic.co/beats/filebeat:8.11.0
    container_name: filebeat
    user: root
    volumes:
      - ./filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - elastic-net
    depends_on:
      - logstash

  # Cerebro (cluster administration UI)
  cerebro:
    image: lmenezes/cerebro:latest
    container_name: cerebro
    ports:
      - "9000:9000"
    networks:
      - elastic-net

volumes:
  es01-data:
  es02-data:
  es03-data:

networks:
  elastic-net:
    driver: bridge
2. 核心配置与优化
2.1 elasticsearch.yml 配置
# ==============================================================================
# Elasticsearch node configuration
# Path: /etc/elasticsearch/elasticsearch.yml
# ==============================================================================

# ------------------------------------------------------------------------------
# Cluster
# ------------------------------------------------------------------------------
# Cluster name (must be identical on every node of the cluster)
cluster.name: production-cluster
# Node name (unique within the cluster)
node.name: node-01
# Node roles:
#   master - eligible for master election
#   data   - stores data
#   ingest - runs ingest pipelines
#   ml     - machine learning
node.roles: [master, data, ingest]
# Custom node attributes (used by shard allocation awareness below)
node.attr.zone: zone-a
node.attr.rack: rack-1

# ------------------------------------------------------------------------------
# Network
# ------------------------------------------------------------------------------
# Bind address
network.host: 0.0.0.0
# HTTP port (REST API)
http.port: 9200
# Transport port (node-to-node communication)
transport.port: 9300
# Address other nodes use to reach this node
network.publish_host: 192.168.1.101

# ------------------------------------------------------------------------------
# Discovery
# ------------------------------------------------------------------------------
# Seed list of master-eligible nodes
discovery.seed_hosts:
  - 192.168.1.101:9300
  - 192.168.1.102:9300
  - 192.168.1.103:9300
# Initial master nodes (first cluster bootstrap only; remove afterwards)
cluster.initial_master_nodes:
  - node-01
  - node-02
  - node-03
# Warn if the cluster has not formed within this window
discovery.cluster_formation_warning_timeout: 30s

# ------------------------------------------------------------------------------
# Memory
# ------------------------------------------------------------------------------
# Lock the heap in RAM to prevent swapping (requires memlock ulimit)
bootstrap.memory_lock: true

# ------------------------------------------------------------------------------
# Index
# ------------------------------------------------------------------------------
# Allow automatic index creation
action.auto_create_index: true

# ------------------------------------------------------------------------------
# Security
# ------------------------------------------------------------------------------
xpack.security.enabled: true
xpack.security.transport.ssl.enabled: true
xpack.security.http.ssl.enabled: true
# SSL keystores
xpack.security.transport.ssl.keystore.path: elastic-certificates.p12
xpack.security.transport.ssl.truststore.path: elastic-certificates.p12
xpack.security.http.ssl.keystore.path: http-certificates.p12

# ------------------------------------------------------------------------------
# Gateway (cluster state recovery after a full restart)
# ------------------------------------------------------------------------------
# NOTE: gateway.expected_master_nodes was deprecated in 7.x and REMOVED in
# Elasticsearch 8.0 — an 8.x node will not start with it present. Only the
# data-node based settings remain.
gateway.expected_data_nodes: 3
gateway.recover_after_data_nodes: 2
gateway.recover_after_time: 5m

# ------------------------------------------------------------------------------
# Shard allocation
# ------------------------------------------------------------------------------
# Allocation awareness across zones
cluster.routing.allocation.awareness.attributes: zone
# Forced awareness values
cluster.routing.allocation.awareness.force.zone.values: zone-a,zone-b
# Allocation filtering (exclude specific nodes)
# cluster.routing.allocation.exclude._name: node-old-*
# Maximum shards per node
cluster.max_shards_per_node: 1000

# ------------------------------------------------------------------------------
# Slow logs
# ------------------------------------------------------------------------------
# NOTE: index.* settings may NOT be placed in elasticsearch.yml — the node
# refuses to start with "node settings must not contain any index level
# settings". Configure slow logs per index (or in an index template) via the
# settings API instead, e.g.:
#   PUT my-index/_settings
#   {
#     "index.search.slowlog.threshold.query.warn":   "10s",
#     "index.search.slowlog.threshold.query.info":   "5s",
#     "index.search.slowlog.threshold.query.debug":  "2s",
#     "index.indexing.slowlog.threshold.index.warn": "10s",
#     "index.indexing.slowlog.threshold.index.info": "5s"
#   }
2.2 JVM 配置
# ==============================================================================
# JVM options
# Path: /etc/elasticsearch/jvm.options
# ==============================================================================
# Heap size (at most 50% of physical RAM, never above 31GB; Xms must equal Xmx)
-Xms8g
-Xmx8g
# Garbage collector: G1 with a 200ms pause target
-XX:+UseG1GC
-XX:G1HeapRegionSize=16m
-XX:InitiatingHeapOccupancyPercent=30
-XX:MaxGCPauseMillis=200
# On OutOfMemoryError, dump the heap for post-mortem analysis
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/var/log/elasticsearch/heap_dump.hprof
# GC logging (rotated: 32 files x 64MB)
-Xlog:gc*,gc+age=trace,gc+heap=debug:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m
# Temp directory (ES_TMPDIR is set by the Elasticsearch startup scripts)
-Djava.io.tmpdir=${ES_TMPDIR}
# Open java.base/java.lang for reflective access on the module path
# (NOTE(review): the original comment said "disable JVM option validation",
# which is not what --add-opens does)
--add-opens=java.base/java.lang=ALL-UNNAMED
# ==============================================================================
# Tuning guidelines
# ==============================================================================
# 1. Heap = 50% of physical RAM, never above 31GB
# 2. Leave the remaining RAM for the filesystem cache
# 3. Use G1GC (the default in ES 8.x)
# 4. Set InitiatingHeapOccupancyPercent to 25-35
# 5. Always set Xms equal to Xmx
2.3 系统参数配置
# ==============================================================================
# OS-level settings
# ==============================================================================
# vm.max_map_count (required by Elasticsearch; takes effect immediately)
sysctl -w vm.max_map_count=262144
# Persistent setting - /etc/sysctl.conf
vm.max_map_count=262144
# File descriptors (current shell only)
ulimit -n 65536
# Persistent limits - /etc/security/limits.conf
elasticsearch soft nofile 65536
elasticsearch hard nofile 65536
elasticsearch soft memlock unlimited
elasticsearch hard memlock unlimited
# Disable swap (pairs with bootstrap.memory_lock in elasticsearch.yml)
swapoff -a
# ...or comment out the swap line in /etc/fstab for persistence
# Max user processes/threads (current shell only)
ulimit -u 4096
# Persistent limits - /etc/security/limits.conf
elasticsearch soft nproc 4096
elasticsearch hard nproc 4096
3. 索引管理
3.1 索引模板
// Index template - log indices (applies to new indices matching logs-*)
// index.lifecycle.rollover_alias names the write alias used by ILM rollover
PUT _index_template/logs-template
{
"index_patterns": ["logs-*"],
"priority": 100,
"template": {
"settings": {
"number_of_shards": 3,
"number_of_replicas": 1,
"refresh_interval": "30s",
"index.lifecycle.name": "logs-policy",
"index.lifecycle.rollover_alias": "logs"
},
"mappings": {
"properties": {
"@timestamp": { "type": "date" },
"message": {
"type": "text",
"fields": {
"keyword": { "type": "keyword", "ignore_above": 256 }
}
},
"level": { "type": "keyword" },
"logger": { "type": "keyword" },
"service": { "type": "keyword" },
"host": { "type": "keyword" },
"trace_id": { "type": "keyword" },
"span_id": { "type": "keyword" },
"duration_ms": { "type": "long" },
"request": {
"properties": {
"method": { "type": "keyword" },
"path": { "type": "keyword" },
"status_code": { "type": "integer" }
}
}
}
},
"aliases": {
"logs-search": {}
}
}
}
// Index template - time-series metric data (matches metrics-*);
// "tags" has enabled:false, so tag contents are stored but not indexed
PUT _index_template/metrics-template
{
"index_patterns": ["metrics-*"],
"priority": 100,
"template": {
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1,
"refresh_interval": "60s"
},
"mappings": {
"properties": {
"@timestamp": { "type": "date" },
"metric_name": { "type": "keyword" },
"value": { "type": "double" },
"tags": { "type": "object", "enabled": false }
}
}
}
}
3.2 ILM 生命周期管理
// ILM policy - log data
// Tiering: hot -> warm (7d) -> cold (30d) -> delete (90d)
PUT _ilm/policy/logs-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_primary_shard_size": "50gb",
            "max_age": "1d",
            "max_docs": 100000000
          },
          "set_priority": {
            "priority": 100
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "shrink": {
            "number_of_shards": 1
          },
          "forcemerge": {
            "max_num_segments": 1
          },
          "allocate": {
            "require": {
              "data": "warm"
            }
          },
          "set_priority": {
            "priority": 50
          }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "allocate": {
            "require": {
              "data": "cold"
            }
          },
          "set_priority": {
            "priority": 0
          }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
// NOTE: the original cold phase contained "freeze": {} — the ILM freeze
// action is deprecated and has no effect on Elasticsearch 8.x (the freeze
// index API was removed in 8.0), so it is omitted here.
// ILM policy - metric data
// Tiering: hot (rollover at 30gb/7d) -> warm forcemerge (14d) -> delete (30d)
PUT _ilm/policy/metrics-policy
{
"policy": {
"phases": {
"hot": {
"actions": {
"rollover": {
"max_primary_shard_size": "30gb",
"max_age": "7d"
}
}
},
"warm": {
"min_age": "14d",
"actions": {
"forcemerge": {
"max_num_segments": 1
}
}
},
"delete": {
"min_age": "30d",
"actions": {
"delete": {}
}
}
}
}
}
3.3 索引操作命令
# ==============================================================================
# Index management commands
# ==============================================================================
# Create an index with explicit settings and mappings
curl -X PUT "localhost:9200/my-index?pretty" -H 'Content-Type: application/json' -d'
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"title": { "type": "text" },
"date": { "type": "date" }
}
}
}'
# Show index settings
curl -X GET "localhost:9200/my-index/_settings?pretty"
# Show index mappings
curl -X GET "localhost:9200/my-index/_mapping?pretty"
# Change the replica count
curl -X PUT "localhost:9200/my-index/_settings" -H 'Content-Type: application/json' -d'
{
"number_of_replicas": 2
}'
# Close an index
curl -X POST "localhost:9200/my-index/_close?pretty"
# Open an index
curl -X POST "localhost:9200/my-index/_open?pretty"
# Delete an index
curl -X DELETE "localhost:9200/my-index?pretty"
# Force-merge segments down to one
curl -X POST "localhost:9200/my-index/_forcemerge?max_num_segments=1"
# Refresh an index
curl -X POST "localhost:9200/my-index/_refresh"
# Clear index caches
curl -X POST "localhost:9200/my-index/_cache/clear"
# ==============================================================================
# Bulk operations
# ==============================================================================
# Bulk-index documents (NDJSON: action line, then optional source line)
curl -X POST "localhost:9200/_bulk?pretty" -H 'Content-Type: application/json' -d'
{"index": {"_index": "my-index", "_id": "1"}}
{"title": "Document 1", "date": "2024-01-01"}
{"index": {"_index": "my-index", "_id": "2"}}
{"title": "Document 2", "date": "2024-01-02"}
{"update": {"_index": "my-index", "_id": "1"}}
{"doc": {"title": "Updated Document 1"}}
{"delete": {"_index": "my-index", "_id": "2"}}
'
# Reindex one index into another
curl -X POST "localhost:9200/_reindex?pretty" -H 'Content-Type: application/json' -d'
{
"source": {
"index": "source-index"
},
"dest": {
"index": "dest-index"
}
}'
4. 集群管理
4.1 集群健康检查
# ==============================================================================
# Cluster status checks
# ==============================================================================
# Cluster health (green / yellow / red)
curl -X GET "localhost:9200/_cluster/health?pretty"
# Full cluster state
curl -X GET "localhost:9200/_cluster/state?pretty"
# Per-node statistics
curl -X GET "localhost:9200/_nodes/stats?pretty"
# Node information
curl -X GET "localhost:9200/_nodes?pretty"
# Index statistics
curl -X GET "localhost:9200/_stats?pretty"
# Shard listing
curl -X GET "localhost:9200/_cat/shards?v"
# Index listing, filtered to yellow health
curl -X GET "localhost:9200/_cat/indices?v&health=yellow"
# Node resource usage (heap, RAM, CPU, load)
curl -X GET "localhost:9200/_cat/nodes?v&h=name,heap.percent,ram.percent,cpu,load_1m"
# Explain shard allocation decisions
curl -X GET "localhost:9200/_cluster/allocation/explain?pretty"
# Pending cluster-level tasks
curl -X GET "localhost:9200/_cluster/pending_tasks?pretty"
4.2 分片管理
# ==============================================================================
# Shard management
# ==============================================================================
# List shards, sorted by state
curl -X GET "localhost:9200/_cat/shards?v&s=state"
# Move a shard to another node
curl -X POST "localhost:9200/_cluster/reroute?pretty" -H 'Content-Type: application/json' -d'
{
  "commands": [
    {
      "move": {
        "index": "my-index",
        "shard": 0,
        "from_node": "node-01",
        "to_node": "node-02"
      }
    }
  ]
}'
# Allocate a replica shard to a node
curl -X POST "localhost:9200/_cluster/reroute?pretty" -H 'Content-Type: application/json' -d'
{
  "commands": [
    {
      "allocate_replica": {
        "index": "my-index",
        "shard": 0,
        "node": "node-03"
      }
    }
  ]
}'
# Cancel an in-progress shard allocation
curl -X POST "localhost:9200/_cluster/reroute?pretty" -H 'Content-Type: application/json' -d'
{
  "commands": [
    {
      "cancel": {
        "index": "my-index",
        "shard": 0,
        "node": "node-01"
      }
    }
  ]
}'
# Update the shard allocation setting.
# FIX: transient cluster settings are deprecated since 7.16 (they are lost
# on full-cluster restart) — use "persistent" instead.
curl -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d'
{
  "persistent": {
    "cluster.routing.allocation.enable": "all"
  }
}'
# cluster.routing.allocation.enable options:
#   all           - allow all shard allocation
#   primaries     - allow allocation of primary shards only
#   new_primaries - allow allocation of primaries for new indices only
#   none          - disallow all shard allocation
4.3 快照与恢复
# ==============================================================================
# Snapshot repository setup
# ==============================================================================
# Register a shared-filesystem repository
curl -X PUT "localhost:9200/_snapshot/my_backup" -H 'Content-Type: application/json' -d'
{
"type": "fs",
"settings": {
"location": "/mount/backups/my_backup",
"compress": true
}
}'
# Register an S3 repository
curl -X PUT "localhost:9200/_snapshot/s3_backup" -H 'Content-Type: application/json' -d'
{
"type": "s3",
"settings": {
"bucket": "my-elasticsearch-backups",
"region": "us-east-1",
"base_path": "snapshots",
"compress": true
}
}'
# List repositories
curl -X GET "localhost:9200/_snapshot?pretty"
# Verify repository access from all nodes
curl -X POST "localhost:9200/_snapshot/my_backup/_verify?pretty"
# ==============================================================================
# Create snapshots
# ==============================================================================
# Snapshot all indices (blocks until the snapshot completes)
curl -X PUT "localhost:9200/_snapshot/my_backup/snapshot_1?wait_for_completion=true"
# Snapshot specific indices only
curl -X PUT "localhost:9200/_snapshot/my_backup/snapshot_2" -H 'Content-Type: application/json' -d'
{
"indices": "index1,index2",
"ignore_unavailable": true,
"include_global_state": false
}'
# Show a snapshot
curl -X GET "localhost:9200/_snapshot/my_backup/snapshot_1?pretty"
# Show snapshot progress
curl -X GET "localhost:9200/_snapshot/my_backup/_status?pretty"
# ==============================================================================
# Restore snapshots
# ==============================================================================
# Restore selected indices, renaming each to restored_<name>
curl -X POST "localhost:9200/_snapshot/my_backup/snapshot_1/_restore" -H 'Content-Type: application/json' -d'
{
"indices": "index1,index2",
"ignore_unavailable": true,
"include_global_state": false,
"rename_pattern": "(.+)",
"rename_replacement": "restored_$1"
}'
# Show recovery status
curl -X GET "localhost:9200/_recovery?pretty"
# ==============================================================================
# Delete snapshots
# ==============================================================================
# Delete a snapshot
curl -X DELETE "localhost:9200/_snapshot/my_backup/snapshot_1"
# Delete a repository (does not delete the snapshot data itself)
curl -X DELETE "localhost:9200/_snapshot/my_backup"
5. 监控与告警
5.1 Prometheus Exporter
# elasticsearch-exporter.yaml — Prometheus exporter for cluster metrics
apiVersion: apps/v1
kind: Deployment
metadata:
  name: elasticsearch-exporter
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: elasticsearch-exporter
  template:
    metadata:
      labels:
        app: elasticsearch-exporter
    spec:
      containers:
        - name: exporter
          # FIX: pin the image tag — ":latest" is not reproducible and may
          # pull an incompatible version when the pod is rescheduled.
          image: quay.io/prometheuscommunity/elasticsearch-exporter:v1.7.0
          args:
            - "--es.uri=http://elasticsearch:9200"
            - "--es.all"
            - "--es.indices"
            - "--es.indices_settings"
            - "--es.shards"
          ports:
            - containerPort: 9114
              name: metrics
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 200m
              memory: 256Mi
---
# Service exposing the exporter's /metrics endpoint for scraping
apiVersion: v1
kind: Service
metadata:
  name: elasticsearch-exporter
  namespace: monitoring
spec:
  ports:
    - port: 9114
      targetPort: 9114
      name: metrics
  selector:
    app: elasticsearch-exporter
5.2 告警规则
# elasticsearch-alerts.yml — Prometheus alerting rules for the exporter metrics
groups:
  - name: elasticsearch-alerts
    rules:
      # Cluster red: one or more primary shards unassigned
      - alert: ElasticsearchClusterRed
        expr: elasticsearch_cluster_health_status{color="red"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Elasticsearch 集群状态红色"
          description: "集群 {{ $labels.cluster }} 状态为红色,部分主分片不可用"
      # Cluster yellow: one or more replica shards unassigned
      - alert: ElasticsearchClusterYellow
        expr: elasticsearch_cluster_health_status{color="yellow"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 集群状态黄色"
          description: "集群 {{ $labels.cluster }} 状态为黄色,部分副本分片不可用"
      # Fewer nodes than the expected 3
      - alert: ElasticsearchNodeCount
        expr: elasticsearch_cluster_health_number_of_nodes < 3
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Elasticsearch 节点数量不足"
          description: "集群节点数量 {{ $value }},少于预期的 3 个"
      # JVM heap usage above 85%.
      # FIX: $value is a 0-1 ratio, so the original printf "%.1f" rendered
      # e.g. "0.9" as the usage rate; humanizePercentage formats it properly.
      - alert: ElasticsearchHeapHigh
        expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 堆内存使用过高"
          description: "节点 {{ $labels.instance }} 堆内存使用率 {{ $value | humanizePercentage }}"
      # Free disk below 15%
      - alert: ElasticsearchDiskSpaceLow
        expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 磁盘空间不足"
          description: "节点 {{ $labels.instance }} 磁盘使用率超过 85%"
      # More than 0.5s of GC per second of wall time
      - alert: ElasticsearchGCLong
        expr: rate(elasticsearch_jvm_gc_collection_seconds_sum[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch GC 时间过长"
          description: "节点 {{ $labels.instance }} GC 时间 {{ $value | printf \"%.2f\" }}s/s"
      # Indexing throughput drop (absolute threshold — tune per workload)
      - alert: ElasticsearchIndexingDrop
        expr: rate(elasticsearch_indices_indexing_index_total[5m]) < 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 索引写入速率下降"
          description: "索引写入速率低于预期"
      # P95 search latency above 1s
      - alert: ElasticsearchSearchLatencyHigh
        expr: histogram_quantile(0.95, rate(elasticsearch_indices_search_query_time_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 搜索延迟过高"
          description: "P95 搜索延迟 {{ $value | printf \"%.2f\" }}s"
      # More than 100 pending cluster tasks
      - alert: ElasticsearchPendingTasks
        expr: elasticsearch_cluster_health_number_of_pending_tasks > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Elasticsearch 待处理任务过多"
          description: "待处理任务数 {{ $value }}"
6. 故障排查
6.1 常见问题排查
# ==============================================================================
# Cluster status YELLOW
# ==============================================================================
# List unassigned shards with the reason
curl -X GET "localhost:9200/_cat/shards?v&h=index,shard,prirep,state,unassigned.reason" | grep UNASSIGNED
# Explain why a specific shard is unallocated
curl -X GET "localhost:9200/_cluster/allocation/explain?pretty" -H 'Content-Type: application/json' -d'
{
  "index": "my-index",
  "shard": 0,
  "primary": false
}'
# Common causes:
#   1. Not enough nodes: add nodes or reduce the replica count
#   2. Low disk space: delete indices or add capacity
#   3. Shard allocation limits: adjust cluster.max_shards_per_node
# ==============================================================================
# Cluster status RED
# ==============================================================================
# List unassigned PRIMARY shards.
# FIX: the original `... | grep "UNASSIGNED" | grep "p"` matched any line
# containing the letter "p" (almost every line); filter the prirep column
# ("p" = primary, "r" = replica) explicitly instead.
curl -s -X GET "localhost:9200/_cat/shards?h=index,shard,prirep,state" | awk '$4 == "UNASSIGNED" && $3 == "p"'
# Force-allocate a stale primary (RISK: may lose data)
curl -X POST "localhost:9200/_cluster/reroute?pretty" -H 'Content-Type: application/json' -d'
{
  "commands": [
    {
      "allocate_stale_primary": {
        "index": "my-index",
        "shard": 0,
        "node": "node-01",
        "accept_data_loss": true
      }
    }
  ]
}'
# ==============================================================================
# Performance troubleshooting
# ==============================================================================
# Hot threads per node
curl -X GET "localhost:9200/_nodes/hot_threads?threads=10"
# Running tasks
curl -X GET "localhost:9200/_tasks?detailed=true&pretty"
# Cancel a long-running task
curl -X POST "localhost:9200/_tasks/<task_id>/_cancel"
# Index statistics
curl -X GET "localhost:9200/my-index/_stats?pretty"
# Segment information
curl -X GET "localhost:9200/my-index/_segments?pretty"
# Fielddata cache usage
curl -X GET "localhost:9200/_nodes/stats/indices/fielddata?pretty"
7. 最佳实践
7.1 索引设计原则
| 原则 | 说明 |
|---|---|
| 分片数 | 单分片 10-50GB,总分片数不超过节点数 * 20 |
| 副本数 | 生产环境至少 1 个副本 |
| 映射 | 避免动态映射,使用显式映射 |
| 别名 | 使用别名管理索引切换 |
| ILM | 配置生命周期管理策略 |
7.2 资源配置建议
| 资源 | 建议 |
|---|---|
| 堆内存 | 物理内存 50%,最大 31GB |
| CPU | 每节点 8-16 核 |
| 磁盘 | SSD,IOPS > 1000 |
| 网络 | 万兆网卡 |
8. 参考资料
- Elasticsearch 官方文档: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
- ECK Operator: https://www.elastic.co/guide/en/cloud-on-k8s/current/index.html
- Elasticsearch 调优: https://www.elastic.co/guide/en/elasticsearch/reference/current/important-settings.html
文档版本: 1.0 更新日期: 2024-01-15 适用环境: Elasticsearch 8.x