change resource

fix helm/prom bracket interaction
add default rules
2025-01-08 15:33:59 -06:00 · 2025-01-08 15:20:28 -06:00 · 2025-01-07 14:22:25 -06:00 · 2025-01-07 13:45:34 -06:00
4 changed files with 77 additions and 9 deletions
--- a/charts/postgres-cluster/Chart.yaml
+++ b/charts/postgres-cluster/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: postgres-cluster
-version: 4.0.2
+version: 4.1.2
 description: Chart for cloudnative-pg cluster
 keywords:
  - database
--- a/charts/postgres-cluster/README.md
+++ b/charts/postgres-cluster/README.md
@@ -1,6 +1,6 @@
 # postgres-cluster

-![Version: 4.0.2](https://img.shields.io/badge/Version-4.0.2-informational?style=flat-square) ![AppVersion: v1.25.0](https://img.shields.io/badge/AppVersion-v1.25.0-informational?style=flat-square)
+![Version: 4.1.2](https://img.shields.io/badge/Version-4.1.2-informational?style=flat-square) ![AppVersion: v1.25.0](https://img.shields.io/badge/AppVersion-v1.25.0-informational?style=flat-square)

 Chart for cloudnative-pg cluster

@@ -44,14 +44,14 @@ Chart for cloudnative-pg cluster
 | cluster.image | object | `{"pullPolicy":"IfNotPresent","repository":"ghcr.io/cloudnative-pg/postgresql","tag":"17.2-22"}` | Default image |
 | cluster.instances | int | `3` |  |
 | cluster.logLevel | string | `"info"` |  |
-| cluster.monitoring | object | `{"enabled":false,"podMonitor":{"enabled":true},"prometheusRule":{"enabled":false,"excludeRules":[]}}` | Enable default monitoring and alert rules |
+| cluster.monitoring | object | `{"enabled":false,"podMonitor":{"enabled":true},"prometheusRule":{"enableDefaultRules":true,"enabled":false,"excludeRules":[]}}` | Enable default monitoring and alert rules |
 | cluster.postgresGID | int | `26` |  |
 | cluster.postgresUID | int | `26` | The UID and GID of the postgres user inside the image |
 | cluster.postgresql | object | `{"parameters":{"hot_standby_feedback":"on","max_slot_wal_keep_size":"2000MB","shared_buffers":"128MB"},"shared_preload_libraries":[]}` | Parameters to be set for the database itself See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
 | cluster.primaryUpdateMethod | string | `"switchover"` | Method to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated. It can be switchover (default) or in-place (restart). |
 | cluster.primaryUpdateStrategy | string | `"unsupervised"` | Strategy to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated: it can be automated (unsupervised - default) or manual (supervised) |
 | cluster.priorityClassName | string | `""` |  |
-| cluster.resources | object | `{"limits":{"cpu":"800m","hugepages-2Mi":"256Mi","memory":"1Gi"},"requests":{"cpu":"10m","memory":"256Mi"}}` | Default resources |
+| cluster.resources | object | `{"limits":{"cpu":1,"hugepages-2Mi":"256Mi","memory":"2Gi"},"requests":{"cpu":"100m","memory":"256Mi"}}` | Default resources |
 | cluster.storage.size | string | `"10Gi"` |  |
 | cluster.storage.storageClass | string | `""` |  |
 | cluster.walStorage | object | `{"size":"2Gi","storageClass":""}` | Default storage size |
--- a/charts/postgres-cluster/templates/prometheus-rule.yaml
+++ b/charts/postgres-cluster/templates/prometheus-rule.yaml
@@ -14,10 +14,10 @@ spec:
    - name: cloudnative-pg/{{ include "cluster.name" . }}
      rules:
        {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}}
-        {{- $_ := set $dict "value"       "{{ $value }}" -}}
+        {{- $_ := set $dict "value"       "{{`{{`}} $value {{`}}`}}" -}}
        {{- $_ := set $dict "namespace"   .Release.Namespace -}}
        {{- $_ := set $dict "cluster"     (printf "%s-cluster" (include "cluster.name" .) ) -}}
-        {{- $_ := set $dict "labels"      (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
+        {{- $_ := set $dict "labels"      (dict "job" "{{`{{`}} $labels.job {{`}}`}}" "node" "{{`{{`}} $labels.node {{`}}`}}" "pod" "{{`{{`}} $labels.pod {{`}}`}}") -}}
        {{- $_ := set $dict "podSelector" (printf "%s-cluster-([1-9][0-9]*)$" (include "cluster.name" .) ) -}}
        {{- $_ := set $dict "Values"      .Values -}}
        {{- $_ := set $dict "Template"    .Template -}}
@@ -27,4 +27,71 @@ spec:
        - {{ $tpl }}
        {{- end -}}
        {{- end -}}
+    {{- if .Values.cluster.monitoring.prometheusRule.enableDefaultRules }}
+    - name: cloudnative-pg/default-rules  # static alert group, rendered only when prometheusRule.enableDefaultRules is true
+      rules:
+        - alert: LongRunningTransaction  # max transaction duration metric above 300 seconds
+          annotations:
+            description: Pod {{`{{`}} $labels.pod {{`}}`}} is taking more than 5 minutes (300 seconds) for a query.
+            summary: A query is taking longer than 5 minutes.
+          expr: |-
+            cnpg_backends_max_tx_duration_seconds > 300
+          for: 1m
+          labels:
+            severity: warning
+        - alert: BackendsWaiting  # threshold is a number of waiting backends (a total), not a duration
+          annotations:
+            description: Pod {{`{{`}} $labels.pod {{`}}`}} has more than 300 backends waiting on other backends
+            summary: More than 300 backends are waiting on other backends
+          expr: |-
+            cnpg_backends_waiting_total > 300
+          for: 1m
+          labels:
+            severity: warning
+        - alert: PGDatabaseXidAge  # transaction ID age above 300,000,000
+          annotations:
+            description: Over 300,000,000 transactions from frozen xid on pod {{`{{`}} $labels.pod {{`}}`}}
+            summary: Number of transactions from the frozen XID to the current one
+          expr: |-
+            cnpg_pg_database_xid_age > 300000000
+          for: 1m
+          labels:
+            severity: warning
+        - alert: PGReplication  # replication lag metric above 300
+          annotations:
+            description: Standby is lagging behind by over 300 seconds (5 minutes)
+            summary: The standby is lagging behind the primary
+          expr: |-
+            cnpg_pg_replication_lag > 300
+          for: 1m
+          labels:
+            severity: warning
+        - alert: LastFailedArchiveTime  # last failed archive is more recent than last successful archive
+          annotations:
+            description: Archiving failed for {{`{{`}} $labels.pod {{`}}`}}
+            summary: Checks the last time archiving failed. Will be < 0 when it has not failed.
+          expr: |-
+            (cnpg_pg_stat_archiver_last_failed_time - cnpg_pg_stat_archiver_last_archived_time) > 1
+          for: 1m
+          labels:
+            severity: warning
+        - alert: DatabaseDeadlockConflicts  # NOTE(review): deadlocks looks like a cumulative counter; confirm a fixed > 10 threshold is intended
+          annotations:
+            description: There are over 10 deadlock conflicts in {{`{{`}} $labels.pod {{`}}`}}
+            summary: Checks the number of database conflicts
+          expr: |-
+            cnpg_pg_stat_database_deadlocks > 10
+          for: 1m
+          labels:
+            severity: warning
+        - alert: ReplicaFailingReplication  # instance reports in_recovery greater than is_wal_receiver_up
+          annotations:
+            description: Replica {{`{{`}} $labels.pod {{`}}`}} is failing to replicate
+            summary: Checks if the replica is failing to replicate
+          expr: |-
+            cnpg_pg_replication_in_recovery > cnpg_pg_replication_is_wal_receiver_up
+          for: 1m
+          labels:
+            severity: warning
+    {{- end }}
 {{ end }}
--- a/charts/postgres-cluster/values.yaml
+++ b/charts/postgres-cluster/values.yaml
@@ -42,10 +42,10 @@ cluster:
  resources:
    requests:
      memory: 256Mi
-      cpu: 10m
+      cpu: 100m
    limits:
-      memory: 1Gi
-      cpu: 800m
+      memory: 2Gi
+      cpu: 1
      hugepages-2Mi: 256Mi

  # -- See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-AffinityConfiguration
@@ -74,6 +74,7 @@ cluster:
      enabled: true
    prometheusRule:
      enabled: false
+      enableDefaultRules: true
      excludeRules: []

  # -- Parameters to be set for the database itself
Author	SHA1	Message	Date
Alex Lebens	ec6f44c6bc	change resource	2025-01-08 15:33:59 -06:00
Alex Lebens	35f331e29a	fix helm/prom bracket interaction	2025-01-08 15:20:28 -06:00
Alex Lebens	3b0481fcb1	add default rules	2025-01-07 14:22:25 -06:00
Alex Lebens	e2dfd70dc4	change default resources	2025-01-07 13:45:34 -06:00