Files
helm-charts/charts/postgres-cluster/prometheus_rules/cluster-ha-critical.yaml
Alex Lebens 1ca985edc7
All checks were successful
release-charts-postgres-cluster / release (push) Successful in 18s
rebase this chart on cnpg provided chart
2025-05-13 00:14:16 -05:00

27 lines
1.3 KiB
YAML

{{- $alert := "CNPGClusterHACritical" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has no standby replicas!
description: |-
CloudNativePG Cluster "{{ .labels.job }}" has no ready standby replicas. Your cluster at a severe
risk of data loss and downtime if the primary instance fails.
The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint
will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main.
This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less
instances. The replaced instance may need some time to catch-up with the cluster primary instance.
This alarm will be always trigger if your cluster is configured to run with only 1 instance. In this
case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 1
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}