# 170 lines, 9.7 KiB, YAML
# PrometheusRule holding the alerting rules of the ElasticsearchExporter group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: elasticsearch
  # Quoted so that a namespace name made only of digits is still rendered as a
  # YAML string instead of being re-typed as an integer.
  namespace: {{ .Release.Namespace | quote }}
  labels:
    app.kubernetes.io/name: elasticsearch
    # Extra labels come from the "custom.labels" named template, which is
    # defined outside this file; nindent 4 aligns them with the key above.
    {{- include "custom.labels" . | nindent 4 }}
spec:
  groups:
    - name: ElasticsearchExporter
      rules:
- alert: ElasticsearchHeapUsageTooHigh
|
|
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch Heap Usage Too High (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The heap usage is over 90%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchHeapUsageWarning
|
|
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch Heap Usage warning (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The heap usage is over 80%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchDiskOutOfSpace
|
|
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0
|
|
for: 0m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch disk out of space (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The disk usage is over 90%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchDiskSpaceLow
|
|
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch disk space low (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The disk usage is over 80%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchClusterRed
|
|
expr: elasticsearch_cluster_health_status{color="red"} == 1
|
|
for: 0m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch Cluster Red (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elastic Cluster Red status\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchClusterYellow
|
|
expr: elasticsearch_cluster_health_status{color="yellow"} == 1
|
|
for: 0m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch Cluster Yellow (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elastic Cluster Yellow status\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
# 1m delay allows a restart without triggering an alert.
|
|
- alert: ElasticsearchHealthyNodes
|
|
expr: elasticsearch_cluster_health_number_of_nodes < 3
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch Healthy Nodes (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Missing node in Elasticsearch cluster\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
# 1m delay allows a restart without triggering an alert.
|
|
- alert: ElasticsearchHealthyDataNodes
|
|
expr: elasticsearch_cluster_health_number_of_data_nodes < 3
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch Healthy Data Nodes (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Missing data node in Elasticsearch cluster\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchRelocatingShards
|
|
expr: elasticsearch_cluster_health_relocating_shards > 0
|
|
for: 0m
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: Elasticsearch relocating shards (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch is relocating shards\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchRelocatingShardsTooLong
|
|
expr: elasticsearch_cluster_health_relocating_shards > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch relocating shards too long (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch has been relocating shards for 15min\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchInitializingShards
|
|
expr: elasticsearch_cluster_health_initializing_shards > 0
|
|
for: 0m
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: Elasticsearch initializing shards (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch is initializing shards\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchInitializingShardsTooLong
|
|
expr: elasticsearch_cluster_health_initializing_shards > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch initializing shards too long (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch has been initializing shards for 15 min\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchUnassignedShards
|
|
expr: elasticsearch_cluster_health_unassigned_shards > 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: Elasticsearch unassigned shards (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch has unassigned shards\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchPendingTasks
|
|
expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch pending tasks (instance {{ `{{ $labels.instance }}` }})
|
|
description: "Elasticsearch has pending tasks. Cluster works slowly.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchNoNewDocuments
|
|
expr: increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1
|
|
for: 0m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch no new documents (instance {{ `{{ $labels.instance }}` }})
|
|
description: "No new documents for 10 min!\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
# Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance.
|
|
- alert: ElasticsearchHighIndexingLatency
|
|
expr: rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch High Indexing Latency (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ `{{ $value }}` }}s).\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
# Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload.
|
|
- alert: ElasticsearchHighIndexingRate
|
|
expr: sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch High Indexing Rate (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
# Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume.
|
|
- alert: ElasticsearchHighQueryRate
|
|
expr: sum(rate(elasticsearch_indices_search_query_total[1m])) > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch High Query Rate (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|
|
- alert: ElasticsearchHighQueryLatency
|
|
expr: rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: Elasticsearch High Query Latency (instance {{ `{{ $labels.instance }}` }})
|
|
description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ `{{ $value }}` }}s).\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"
|