diff --git a/charts/postgres-cluster/Chart.yaml b/charts/postgres-cluster/Chart.yaml
index b5739ff..90c9919 100644
--- a/charts/postgres-cluster/Chart.yaml
+++ b/charts/postgres-cluster/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: postgres-cluster
-version: 4.0.3
+version: 4.1.0
 description: Chart for cloudnative-pg cluster
 keywords:
   - database
diff --git a/charts/postgres-cluster/README.md b/charts/postgres-cluster/README.md
index 1339ea6..79f1518 100644
--- a/charts/postgres-cluster/README.md
+++ b/charts/postgres-cluster/README.md
@@ -1,6 +1,6 @@
 # postgres-cluster
 
-![Version: 4.0.3](https://img.shields.io/badge/Version-4.0.3-informational?style=flat-square) ![AppVersion: v1.25.0](https://img.shields.io/badge/AppVersion-v1.25.0-informational?style=flat-square)
+![Version: 4.1.0](https://img.shields.io/badge/Version-4.1.0-informational?style=flat-square) ![AppVersion: v1.25.0](https://img.shields.io/badge/AppVersion-v1.25.0-informational?style=flat-square)
 
 Chart for cloudnative-pg cluster
 
@@ -44,7 +44,7 @@ Chart for cloudnative-pg cluster
 | cluster.image | object | `{"pullPolicy":"IfNotPresent","repository":"ghcr.io/cloudnative-pg/postgresql","tag":"17.2-22"}` | Default image |
 | cluster.instances | int | `3` |  |
 | cluster.logLevel | string | `"info"` |  |
-| cluster.monitoring | object | `{"enabled":false,"podMonitor":{"enabled":true},"prometheusRule":{"enabled":false,"excludeRules":[]}}` | Enable default monitoring and alert rules |
+| cluster.monitoring | object | `{"enabled":false,"podMonitor":{"enabled":true},"prometheusRule":{"enableDefaultRules":true,"enabled":false,"excludeRules":[]}}` | Enable default monitoring and alert rules |
 | cluster.postgresGID | int | `26` |  |
 | cluster.postgresUID | int | `26` | The UID and GID of the postgres user inside the image |
 | cluster.postgresql | object | `{"parameters":{"hot_standby_feedback":"on","max_slot_wal_keep_size":"2000MB","shared_buffers":"128MB"},"shared_preload_libraries":[]}` | Parameters to be set for the database itself See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
diff --git a/charts/postgres-cluster/templates/prometheus-rule.yaml b/charts/postgres-cluster/templates/prometheus-rule.yaml
index 363d308..3c1f834 100644
--- a/charts/postgres-cluster/templates/prometheus-rule.yaml
+++ b/charts/postgres-cluster/templates/prometheus-rule.yaml
@@ -27,4 +27,83 @@ spec:
         - {{ $tpl }}
   {{- end -}}
 {{- end -}}
+{{- /*
+Built-in alert group, rendered only when
+cluster.monitoring.prometheusRule.enableDefaultRules is true.
+Prometheus placeholders such as $labels.pod are emitted through string-literal
+actions so that Helm passes them through verbatim instead of trying to resolve
+them as Helm template variables when the chart is parsed.
+*/ -}}
+  {{- if .Values.cluster.monitoring.prometheusRule.enableDefaultRules }}
+    - name: cloudnative-pg/default-rules
+      rules:
+        - alert: LongRunningTransaction
+          annotations:
+            description: Pod {{ "{{" }} $labels.pod {{ "}}" }} is taking more than 5 minutes (300 seconds) for a query.
+            summary: A query is taking longer than 5 minutes.
+          expr: |-
+            cnpg_backends_max_tx_duration_seconds > 300
+          for: 1m
+          labels:
+            severity: warning
+        {{- /*
+        NOTE(review): cnpg_backends_waiting_total presumably counts waiting
+        backends rather than measuring a duration, so the "5 minutes" wording
+        may not match the expression; confirm against the metric definition.
+        */}}
+        - alert: BackendsWaiting
+          annotations:
+            description: Pod {{ "{{" }} $labels.pod {{ "}}" }} has been waiting for longer than 5 minutes
+            summary: If a backend is waiting for longer than 5 minutes
+          expr: |-
+            cnpg_backends_waiting_total > 300
+          for: 1m
+          labels:
+            severity: warning
+        - alert: PGDatabaseXidAge
+          annotations:
+            description: Over 300,000,000 transactions from frozen xid on pod {{ "{{" }} $labels.pod {{ "}}" }}
+            summary: Number of transactions from the frozen XID to the current one
+          expr: |-
+            cnpg_pg_database_xid_age > 300000000
+          for: 1m
+          labels:
+            severity: warning
+        - alert: PGReplication
+          annotations:
+            description: Standby is lagging behind by over 300 seconds (5 minutes)
+            summary: The standby is lagging behind the primary
+          expr: |-
+            cnpg_pg_replication_lag > 300
+          for: 1m
+          labels:
+            severity: warning
+        - alert: LastFailedArchiveTime
+          annotations:
+            description: Archiving failed for {{ "{{" }} $labels.pod {{ "}}" }}
+            summary: Checks the last time archiving failed. Will be < 0 when it has not failed.
+          expr: |-
+            (cnpg_pg_stat_archiver_last_failed_time - cnpg_pg_stat_archiver_last_archived_time) > 1
+          for: 1m
+          labels:
+            severity: warning
+        - alert: DatabaseDeadlockConflicts
+          annotations:
+            description: There are over 10 deadlock conflicts in {{ "{{" }} $labels.pod {{ "}}" }}
+            summary: Checks the number of database conflicts
+          expr: |-
+            cnpg_pg_stat_database_deadlocks > 10
+          for: 1m
+          labels:
+            severity: warning
+        - alert: ReplicaFailingReplication
+          annotations:
+            description: Replica {{ "{{" }} $labels.pod {{ "}}" }} is failing to replicate
+            summary: Checks if the replica is failing to replicate
+          expr: |-
+            cnpg_pg_replication_in_recovery > cnpg_pg_replication_is_wal_receiver_up
+          for: 1m
+          labels:
+            severity: warning
+  {{- end }}
 {{ end }}
diff --git a/charts/postgres-cluster/values.yaml b/charts/postgres-cluster/values.yaml
index 9c98511..607acde 100644
--- a/charts/postgres-cluster/values.yaml
+++ b/charts/postgres-cluster/values.yaml
@@ -74,6 +74,8 @@ cluster:
       enabled: true
     prometheusRule:
       enabled: false
+      # When true, the built-in cloudnative-pg/default-rules alert group is added to the PrometheusRule.
+      enableDefaultRules: true
       excludeRules: []
 
   # -- Parameters to be set for the database itself