rebase this chart on cnpg provided chart

2025-05-13 00:14:16 -05:00
parent 47d7604aac
commit 1ca985edc7
31 changed files with 1329 additions and 346 deletions
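Each rule file below starts with a guard of the form {{- if not (has $alert .excludeRules) -}}, so individual alerts can be disabled by listing their names in the excludeRules value handed to these templates. A minimal sketch of what that could look like in values.yaml, assuming the rebased chart keeps a values path similar to the upstream CloudNativePG cluster chart (the exact key path is an assumption):

# values.yaml (sketch -- key path assumed, check the chart's values schema)
cluster:
  monitoring:
    prometheusRule:
      enabled: true
      excludeRules:
        - CNPGClusterHACritical        # e.g. for single-instance clusters
        - CNPGClusterZoneSpreadWarning # e.g. for single-zone environments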

@@ -0,0 +1,16 @@
{{- $alert := "CNPGClusterBackendsWaitingWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has a backend waiting for longer than 5 minutes.
description: |-
Pod {{`{{`}} $labels.pod {{`}}`}}
has been waiting for longer than 5 minutes
expr: |
cnpg_backends_waiting_total > 300
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
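The {{`{{`}} ... {{`}}`}} escapes make Helm emit literal braces, so Prometheus's own $labels placeholders survive templating. After rendering, the rule above would come out roughly like this (namespace "postgres" and cluster "postgres-cluster" are made-up example values):

# Rendered rule (sketch)
alert: CNPGClusterBackendsWaitingWarning
annotations:
  summary: CNPG Cluster has a backend waiting for longer than 5 minutes.
  description: |-
    Pod {{ $labels.pod }}
    has been waiting for longer than 5 minutes
expr: |
  cnpg_backends_waiting_total > 300
for: 1m
labels:
  severity: warning
  namespace: postgres
  cnpg_cluster: postgres-cluster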

@@ -0,0 +1,16 @@
{{- $alert := "CNPGClusterDatabaseDeadlockConflictsWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has over 10 deadlock conflicts.
description: |-
There are over 10 deadlock conflicts in
{{`{{`}} $labels.pod {{`}}`}}
expr: |
cnpg_pg_stat_database_deadlocks > 10
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,26 @@
{{- $alert := "CNPGClusterHACritical" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has no standby replicas!
description: |-
CloudNativePG Cluster "{{ .labels.job }}" has no ready standby replicas. Your cluster is at severe
risk of data loss and downtime if the primary instance fails.
The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint
will fail. The `-r` endpoint is operating at reduced capacity and all traffic is being served by the primary.
This can happen during a normal failover or automated minor version upgrades in a cluster with 2 or fewer
instances. The replaced instance may need some time to catch up with the cluster primary instance.
This alarm will always trigger if your cluster is configured to run with only 1 instance. In this
case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 1
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,24 @@
{{- $alert := "CNPGClusterHAWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has fewer than 2 standby replicas.
description: |-
CloudNativePG Cluster "{{ .labels.job }}" has only {{ .value }} standby replicas, putting
your cluster at risk if another instance fails. The cluster is still able to operate normally, although
the `-ro` and `-r` endpoints operate at reduced capacity.
This can happen during a normal failover or automated minor version upgrades. The replaced instance may
need some time to catch up with the cluster primary instance.
This alarm will be constantly triggered if your cluster is configured to run with fewer than 3 instances.
In this case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 2
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
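The two HA alerts above share one expression and differ only in threshold (< 1 critical, < 2 warning). On the primary, cnpg_pg_replication_streaming_replicas typically reports the number of attached streaming standbys while cnpg_pg_replication_is_wal_receiver_up is 0, so the max by (job) effectively picks up the primary's healthy-standby count. A rough worked example for a 3-instance cluster (values are illustrative):

# primary healthy, 2 standbys streaming:  2 - 0 = 2  -> neither alert fires
# one standby lost:                       1 - 0 = 1  -> < 2, CNPGClusterHAWarning fires
# all standbys lost:                      0 - 0 = 0  -> < 1, CNPGClusterHACritical fires
max by (job) (cnpg_pg_replication_streaming_replicas - cnpg_pg_replication_is_wal_receiver_up)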

@@ -0,0 +1,17 @@
{{- $alert := "CNPGClusterHighConnectionsCritical" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Instance is critically close to the maximum number of connections!
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,17 @@
{{- $alert := "CNPGClusterHighConnectionsWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Instance is approaching the maximum number of connections.
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
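Both connection alerts compute the same ratio: backends per pod divided by that instance's max_connections, expressed as a percentage, with thresholds of 95% (critical) and 80% (warning). A quick sanity check with made-up numbers:

# max_connections = 100 (hypothetical):
#   85 backends -> 85%  -> warning fires (> 80), critical does not
#   97 backends -> 97%  -> both warning and critical fire (> 95)
sum by (pod) (cnpg_backends_total) / max by (pod) (cnpg_pg_settings_setting{name="max_connections"}) * 100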

@@ -0,0 +1,19 @@
{{- $alert := "CNPGClusterHighReplicationLag" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster is experiencing high replication lag.
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
{{ .value }}ms.
High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
expr: |
max(cnpg_pg_replication_lag{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,19 @@
{{- $alert := "CNPGClusterInstancesOnSameNode" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster instances are located on the same node.
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
instances on the same node {{ .labels.node }}.
A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
expr: |
count by (node) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,15 @@
{{- $alert := "CNPGClusterLastFailedArchiveTimeWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster's most recent WAL archiving attempt failed.
description: |-
Archiving failed for {{`{{`}} $labels.pod {{`}}`}}
expr: |
(cnpg_pg_stat_archiver_last_failed_time - cnpg_pg_stat_archiver_last_archived_time) > 1
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,16 @@
{{- $alert := "CNPGClusterLongRunningTransactionWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster transaction is taking longer than 5 minutes.
description: |-
CloudNativePG Cluster Pod {{`{{`}} $labels.pod {{`}}`}}
has a transaction that has been running for more than 5 minutes (300 seconds).
expr: |-
cnpg_backends_max_tx_duration_seconds > 300
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,24 @@
{{- $alert := "CNPGClusterLowDiskSpaceCritical" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Instance is running out of disk space!
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.9 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"}
) > 0.9
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,24 @@
{{- $alert := "CNPGClusterLowDiskSpaceWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Instance is running out of disk space.
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.7 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"}
) > 0.7
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
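The two disk-space rules above use the same expression with 0.9 and 0.7 thresholds. It checks three PVC families joined with OR: the instance data volume (claim named after the pod), the WAL volume (-wal suffix), and any tablespace volumes (-tbs.* suffix), the last joined back to pods via kube_pod_spec_volumes_persistentvolumeclaims_info. To eyeball data-volume usage for one cluster by hand, a query along these lines works (namespace and claim pattern are placeholders):

# Hypothetical namespace "postgres", instances postgres-cluster-1..N
1 - kubelet_volume_stats_available_bytes{namespace="postgres", persistentvolumeclaim=~"postgres-cluster-[0-9]+"}
  / kubelet_volume_stats_capacity_bytes{namespace="postgres", persistentvolumeclaim=~"postgres-cluster-[0-9]+"}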

@@ -0,0 +1,19 @@
{{- $alert := "CNPGClusterOffline" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has no running instances!
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has no ready instances.
Having an offline cluster means your applications will not be able to access the database, leading to
potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
expr: |
(count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
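The OR on() vector(0) fallback is what lets this rule fire at all: count() over a metric with no series returns nothing rather than 0, so without the fallback the == 0 comparison would return no data when every instance pod is gone. The same pattern works for any "is anything up at all" alert; a sketch with a made-up metric name:

# Falls back to 0 when no series exist, so the comparison can still match
(count(some_up_metric{job="example"}) OR on() vector(0)) == 0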

@@ -0,0 +1,16 @@
{{- $alert := "CNPGClusterPGDatabaseXidAgeWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has a high number of transactions from the frozen XID to the current one.
description: |-
Over 300,000,000 transactions from the frozen XID
on pod {{`{{`}} $labels.pod {{`}}`}}
expr: |
cnpg_pg_database_xid_age > 300000000
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,15 @@
{{- $alert := "CNPGClusterPGReplicationWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster standby is lagging behind the primary.
description: |-
Standby is lagging behind by over 300 seconds (5 minutes)
expr: |
cnpg_pg_replication_lag > 300
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,16 @@
{{- $alert := "CNPGClusterReplicaFailingReplicationWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has a replica that is failing to replicate.
description: |-
Replica {{`{{`}} $labels.pod {{`}}`}}
is failing to replicate
expr: |
cnpg_pg_replication_in_recovery > cnpg_pg_replication_is_wal_receiver_up
for: 1m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}

@@ -0,0 +1,18 @@
{{- $alert := "CNPGClusterZoneSpreadWarning" -}}
{{- if not (has $alert .excludeRules) -}}
alert: {{ $alert }}
annotations:
summary: CNPG Cluster has instances in the same availability zone.
description: |-
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.
A disaster in one availability zone will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
expr: |
{{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
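Each file in this commit is a partial: it expects .namespace, .cluster, .podSelector and .excludeRules (plus .labels, .value and .Values for some rules) to be supplied by a parent PrometheusRule template that stitches the snippets together. A minimal sketch of such a parent, in the spirit of the upstream chart; the values paths, helper names and the prometheus_rules/ glob are assumptions here:

{{- /* Sketch only: values paths, helper names and the file glob are assumptions. */}}
{{- if .Values.cluster.monitoring.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "cluster.fullname" . }}-alert-rules
spec:
  groups:
    - name: cloudnative-pg/{{ include "cluster.fullname" . }}
      rules:
        {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules
              "namespace" .Release.Namespace
              "cluster" (include "cluster.fullname" .)
              "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .))
              "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}")
              "value" "{{ $value }}"
              "Values" .Values }}
        {{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }}
        - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim }}
        {{- end }}
{{- end }}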