diff --git a/clusters/cl01tl/manifests/ntfy/Cluster-ntfy-postgresql-18-cluster.yaml b/clusters/cl01tl/manifests/ntfy/Cluster-ntfy-postgresql-18-cluster.yaml new file mode 100644 index 000000000..77108d5a9 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/Cluster-ntfy-postgresql-18-cluster.yaml @@ -0,0 +1,57 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: ntfy-postgresql-18-cluster + namespace: ntfy + labels: + app.kubernetes.io/name: ntfy-postgresql-18-cluster + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm +spec: + instances: 3 + imageName: "ghcr.io/cloudnative-pg/postgresql:18.3-standard-trixie" + imagePullPolicy: IfNotPresent + postgresUID: 26 + postgresGID: 26 + storage: + size: 10Gi + storageClass: local-path + walStorage: + size: 2Gi + storageClass: local-path + resources: + limits: + hugepages-2Mi: 256Mi + requests: + cpu: 20m + memory: 80Mi + affinity: + enablePodAntiAffinity: true + topologyKey: kubernetes.io/hostname + primaryUpdateMethod: switchover + primaryUpdateStrategy: unsupervised + logLevel: info + enableSuperuserAccess: false + enablePDB: true + postgresql: + parameters: + hot_standby_feedback: "on" + max_slot_wal_keep_size: 2000MB + shared_buffers: 128MB + monitoring: + enablePodMonitor: true + disableDefaultQueries: false + plugins: + - name: barman-cloud.cloudnative-pg.io + enabled: true + isWALArchiver: true + parameters: + barmanObjectName: "ntfy-postgresql-18-backup-garage-local" + serverName: "ntfy-postgresql-18-backup-1" + bootstrap: + initdb: + database: app + owner: app diff --git a/clusters/cl01tl/manifests/ntfy/Deployment-ntfy.yaml b/clusters/cl01tl/manifests/ntfy/Deployment-ntfy.yaml index 5df065182..be1ccc338 100644 --- a/clusters/cl01tl/manifests/ntfy/Deployment-ntfy.yaml +++ b/clusters/cl01tl/manifests/ntfy/Deployment-ntfy.yaml @@ -38,7 +38,7 @@ spec: - serve env: - 
name: TZ - value: US/Central + value: America/Chicago - name: NTFY_BASE_URL value: https://ntfy.alexlebens.net - name: NTFY_LISTEN_HTTP @@ -57,13 +57,16 @@ spec: - name: NTFY_BEHIND_PROXY value: "true" - name: NTFY_ATTACHMENT_CACHE_DIR - value: /var/cache/ntfy/attachments + valueFrom: + secretKeyRef: + key: attachment-cache-dir + name: ntfy-config-secret - name: NTFY_ATTACHMENT_TOTAL_SIZE_LIMIT - value: 4G + value: 10G - name: NTFY_ATTACHMENT_FILE_SIZE_LIMIT - value: 15M + value: 150M - name: NTFY_ATTACHMENT_EXPIRY_DURATION - value: 36h + value: 72h - name: NTFY_ENABLE_SIGNUP value: "false" - name: NTFY_ENABLE_LOGIN @@ -78,13 +81,12 @@ spec: value: :9090 - name: NTFY_LOG_LEVEL value: info - image: binwiederhier/ntfy:v2.21.0 - imagePullPolicy: IfNotPresent + image: binwiederhier/ntfy:v2.21.0@sha256:2b9e12d56a538f4402da51328eeca02696c4b207ab7fbe031c27e51a22ca9b86 name: main resources: requests: cpu: 10m - memory: 128Mi + memory: 40Mi volumeMounts: - mountPath: /var/cache/ntfy name: cache diff --git a/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-config-secret.yaml b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-config-secret.yaml new file mode 100644 index 000000000..712a3c54f --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-config-secret.yaml @@ -0,0 +1,18 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: ntfy-config-secret + namespace: ntfy + labels: + app.kubernetes.io/name: ntfy-config-secret + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy +spec: + secretStoreRef: + kind: ClusterSecretStore + name: vault + data: + - secretKey: attachment-cache-dir + remoteRef: + key: /garage/home-infra/ntfy-attachments + property: attachment-cache-dir diff --git a/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-backup-garage-local-secret.yaml b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-backup-garage-local-secret.yaml new file mode 100644 index 
000000000..50bbc1f71 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-backup-garage-local-secret.yaml @@ -0,0 +1,38 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: ntfy-postgresql-18-backup-garage-local-secret + namespace: ntfy + labels: + app.kubernetes.io/name: ntfy-postgresql-18-backup-garage-local-secret + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm +spec: + secretStoreRef: + kind: ClusterSecretStore + name: vault + data: + - secretKey: ACCESS_REGION + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_REGION + - secretKey: ACCESS_KEY_ID + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_KEY_ID + - secretKey: ACCESS_SECRET_KEY + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_SECRET_KEY diff --git a/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-recovery-secret.yaml b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-recovery-secret.yaml new file mode 100644 index 000000000..70e839292 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ExternalSecret-ntfy-postgresql-18-recovery-secret.yaml @@ -0,0 +1,38 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: ntfy-postgresql-18-recovery-secret + namespace: ntfy + labels: + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: ntfy-postgresql-18-recovery-secret +spec: + secretStoreRef: + kind: 
ClusterSecretStore + name: vault + data: + - secretKey: ACCESS_REGION + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_REGION + - secretKey: ACCESS_KEY_ID + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_KEY_ID + - secretKey: ACCESS_SECRET_KEY + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_SECRET_KEY diff --git a/clusters/cl01tl/manifests/ntfy/HTTPRoute-ntfy.yaml b/clusters/cl01tl/manifests/ntfy/HTTPRoute-ntfy.yaml index 7e00d5159..568a1094c 100644 --- a/clusters/cl01tl/manifests/ntfy/HTTPRoute-ntfy.yaml +++ b/clusters/cl01tl/manifests/ntfy/HTTPRoute-ntfy.yaml @@ -23,7 +23,7 @@ spec: name: ntfy namespace: ntfy port: 80 - weight: 100 + weight: 1 matches: - path: type: PathPrefix diff --git a/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-backup-garage-local.yaml b/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-backup-garage-local.yaml new file mode 100644 index 000000000..72a58b291 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-backup-garage-local.yaml @@ -0,0 +1,33 @@ +apiVersion: barmancloud.cnpg.io/v1 +kind: ObjectStore +metadata: + name: ntfy-postgresql-18-backup-garage-local + namespace: ntfy + labels: + app.kubernetes.io/name: ntfy-postgresql-18-backup-garage-local + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm +spec: + retentionPolicy: 7d + instanceSidecarConfiguration: + env: + - name: AWS_REQUEST_CHECKSUM_CALCULATION + value: when_required + - name: AWS_RESPONSE_CHECKSUM_VALIDATION + value: when_required + configuration: + destinationPath: 
s3://postgres-backups/cl01tl/ntfy/ntfy-postgresql-18-cluster + endpointURL: http://garage-main.garage:3900 + s3Credentials: + accessKeyId: + name: ntfy-postgresql-18-backup-garage-local-secret + key: ACCESS_KEY_ID + secretAccessKey: + name: ntfy-postgresql-18-backup-garage-local-secret + key: ACCESS_SECRET_KEY + region: + name: ntfy-postgresql-18-backup-garage-local-secret + key: ACCESS_REGION diff --git a/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-recovery.yaml b/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-recovery.yaml new file mode 100644 index 000000000..34a77ee92 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ObjectStore-ntfy-postgresql-18-recovery.yaml @@ -0,0 +1,32 @@ +apiVersion: barmancloud.cnpg.io/v1 +kind: ObjectStore +metadata: + name: "ntfy-postgresql-18-recovery" + namespace: ntfy + labels: + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: "ntfy-postgresql-18-recovery" +spec: + configuration: + destinationPath: s3://postgres-backups/cl01tl/ntfy/ntfy-postgresql-18-cluster + endpointURL: http://garage-main.garage:3900 + wal: + compression: snappy + maxParallel: 1 + data: + compression: snappy + jobs: 1 + s3Credentials: + accessKeyId: + name: ntfy-postgresql-18-recovery-secret + key: ACCESS_KEY_ID + secretAccessKey: + name: ntfy-postgresql-18-recovery-secret + key: ACCESS_SECRET_KEY + region: + name: ntfy-postgresql-18-recovery-secret + key: ACCESS_REGION diff --git a/clusters/cl01tl/manifests/ntfy/PrometheusRule-ntfy-postgresql-18-alert-rules.yaml b/clusters/cl01tl/manifests/ntfy/PrometheusRule-ntfy-postgresql-18-alert-rules.yaml new file mode 100644 index 000000000..a58fdebe9 --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/PrometheusRule-ntfy-postgresql-18-alert-rules.yaml @@ -0,0 +1,270 @@ +apiVersion: monitoring.coreos.com/v1 +kind: 
PrometheusRule +metadata: + name: ntfy-postgresql-18-alert-rules + namespace: ntfy + labels: + app.kubernetes.io/name: ntfy-postgresql-18-alert-rules + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm +spec: + groups: + - name: cloudnative-pg/ntfy-postgresql-18 + rules: + - alert: CNPGClusterBackendsWaitingWarning + annotations: + summary: CNPG Cluster a backend is waiting for longer than 5 minutes. + description: |- + Pod {{ $labels.pod }} + has been waiting for longer than 5 minutes + expr: | + cnpg_backends_waiting_total{namespace="ntfy"} > 300 + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterDatabaseDeadlockConflictsWarning + annotations: + summary: CNPG Cluster has over 10 deadlock conflicts. + description: |- + There are over 10 deadlock conflicts in + {{ $labels.pod }} + expr: | + cnpg_pg_stat_database_deadlocks{namespace="ntfy"} > 10 + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterHACritical + annotations: + summary: CNPG Cluster has no standby replicas! + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has no ready standby replicas. Your cluster at a severe + risk of data loss and downtime if the primary instance fails. + + The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint + will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main. + + This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less + instances. The replaced instance may need some time to catch-up with the cluster primary instance. + + This alarm will be always trigger if your cluster is configured to run with only 1 instance.
In this + case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="ntfy"} - cnpg_pg_replication_is_wal_receiver_up{namespace="ntfy"}) < 1 + for: 5m + labels: + severity: critical + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterHAWarning + annotations: + summary: CNPG Cluster less than 2 standby replicas. + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has only {{ $value }} standby replicas, putting + your cluster at risk if another instance fails. The cluster is still able to operate normally, although + the `-ro` and `-r` endpoints operate at reduced capacity. + + This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may + need some time to catch-up with the cluster primary instance. + + This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. + In this case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="ntfy"} - cnpg_pg_replication_is_wal_receiver_up{namespace="ntfy"}) < 2 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterHighConnectionsCritical + annotations: + summary: CNPG Instance maximum number of connections critical! + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" instance {{ $labels.pod }} is using {{ $value }}% of + the maximum number of connections.
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md + expr: | + sum by (pod) (cnpg_backends_total{namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 95 + for: 5m + labels: + severity: critical + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterHighConnectionsWarning + annotations: + summary: CNPG Instance is approaching the maximum number of connections. + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" instance {{ $labels.pod }} is using {{ $value }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md + expr: | + sum by (pod) (cnpg_backends_total{namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 80 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterHighReplicationLag + annotations: + summary: CNPG Cluster high replication lag + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" is experiencing a high replication lag of + {{ $value }}ms. + + High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md + expr: | + max(cnpg_pg_replication_lag{namespace="ntfy",pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) * 1000 > 1000 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterInstancesOnSameNode + annotations: + summary: CNPG Cluster instances are located on the same node. + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" has {{ $value }} + instances on the same node {{ $labels.node }}. + + A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md + expr: | + count by (node) (kube_pod_info{namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) > 1 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterLongRunningTransactionWarning + annotations: + summary: CNPG Cluster query is taking longer than 5 minutes. + description: |- + CloudNativePG Cluster Pod {{ $labels.pod }} + is taking more than 5 minutes (300 seconds) for a query. + expr: |- + cnpg_backends_max_tx_duration_seconds{namespace="ntfy"} > 300 + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterLowDiskSpaceCritical + annotations: + summary: CNPG Instance is running out of disk space! + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" is running extremely low on disk space. Check attached PVCs!
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.9 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-wal$"} / kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-wal$"})) > 0.9 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"} + ) > 0.9 + for: 5m + labels: + severity: critical + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterLowDiskSpaceWarning + annotations: + summary: CNPG Instance is running out of disk space. + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" is running low on disk space. Check attached PVCs.
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.7 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-wal$"} / kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-wal$"})) > 0.7 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="ntfy", persistentvolumeclaim=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"} + ) > 0.7 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterOffline + annotations: + summary: CNPG Cluster has no running instances! + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" has no ready instances. + + Having an offline cluster means your applications will not be able to access the database, leading to + potential service disruption and/or data loss.
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md + expr: | + (count(cnpg_collector_up{namespace="ntfy",pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"}) OR on() vector(0)) == 0 + for: 5m + labels: + severity: critical + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterPGDatabaseXidAgeWarning + annotations: + summary: CNPG Cluster has a number of transactions from the frozen XID to the current one. + description: |- + Over 300,000,000 transactions from frozen xid + on pod {{ $labels.pod }} + expr: | + cnpg_pg_database_xid_age{namespace="ntfy"} > 300000000 + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterPGReplicationWarning + annotations: + summary: CNPG Cluster standby is lagging behind the primary. + description: |- + Standby is lagging behind by over 300 seconds (5 minutes) + expr: | + cnpg_pg_replication_lag{namespace="ntfy"} > 300 + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterReplicaFailingReplicationWarning + annotations: + summary: CNPG Cluster has a replica is failing to replicate. + description: |- + Replica {{ $labels.pod }} + is failing to replicate + expr: | + cnpg_pg_replication_in_recovery{namespace="ntfy"} > cnpg_pg_replication_is_wal_receiver_up{namespace="ntfy"} + for: 1m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster + - alert: CNPGClusterZoneSpreadWarning + annotations: + summary: CNPG Cluster instances in the same zone. + description: |- + CloudNativePG Cluster "ntfy/ntfy-postgresql-18-cluster" has instances in the same availability zone. + + A disaster in one availability zone will lead to a potential service disruption and/or data loss. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md + expr: | + 3 > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="ntfy", pod=~"ntfy-postgresql-18-cluster-([1-9][0-9]*)$"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 + for: 5m + labels: + severity: warning + namespace: ntfy + cnpg_cluster: ntfy-postgresql-18-cluster diff --git a/clusters/cl01tl/manifests/ntfy/ScheduledBackup-ntfy-postgresql-18-scheduled-backup-live-backup.yaml b/clusters/cl01tl/manifests/ntfy/ScheduledBackup-ntfy-postgresql-18-scheduled-backup-live-backup.yaml new file mode 100644 index 000000000..2371378aa --- /dev/null +++ b/clusters/cl01tl/manifests/ntfy/ScheduledBackup-ntfy-postgresql-18-scheduled-backup-live-backup.yaml @@ -0,0 +1,24 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: "ntfy-postgresql-18-scheduled-backup-live-backup" + namespace: ntfy + labels: + app.kubernetes.io/name: "ntfy-postgresql-18-scheduled-backup-live-backup" + helm.sh/chart: postgres-18-cluster-7.11.2 + app.kubernetes.io/instance: ntfy + app.kubernetes.io/part-of: ntfy + app.kubernetes.io/version: "7.11.2" + app.kubernetes.io/managed-by: Helm +spec: + immediate: true + suspend: false + schedule: "0 15 14 * * *" + backupOwnerReference: self + cluster: + name: ntfy-postgresql-18-cluster + method: plugin + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + parameters: + barmanObjectName: "ntfy-postgresql-18-backup-garage-local" diff --git a/clusters/cl01tl/manifests/rclone/CronJob-rclone-ntfy-attachments.yaml b/clusters/cl01tl/manifests/rclone/CronJob-rclone-ntfy-attachments.yaml new file mode 100644 index 000000000..5a70281a7 --- /dev/null +++ b/clusters/cl01tl/manifests/rclone/CronJob-rclone-ntfy-attachments.yaml @@ -0,0 +1,107 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: rclone-ntfy-attachments + labels: + 
app.kubernetes.io/controller: ntfy-attachments + app.kubernetes.io/instance: rclone + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: rclone + helm.sh/chart: rclone-4.6.2 + namespace: rclone +spec: + suspend: false + concurrencyPolicy: Forbid + startingDeadlineSeconds: 90 + timeZone: America/Chicago + schedule: "10 0 * * *" + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + parallelism: 1 + backoffLimit: 3 + template: + metadata: + labels: + app.kubernetes.io/controller: ntfy-attachments + app.kubernetes.io/instance: rclone + app.kubernetes.io/name: rclone + spec: + enableServiceLinks: false + serviceAccountName: default + automountServiceAccountToken: true + hostIPC: false + hostNetwork: false + hostPID: false + dnsPolicy: ClusterFirst + restartPolicy: Never + containers: + - args: + - sync + - src:ntfy-attachments + - dest:ntfy-attachments + - --s3-no-check-bucket + - --verbose + env: + - name: RCLONE_S3_PROVIDER + value: Other + - name: RCLONE_CONFIG_SRC_TYPE + value: s3 + - name: RCLONE_CONFIG_SRC_PROVIDER + value: Other + - name: RCLONE_CONFIG_SRC_ENV_AUTH + value: "false" + - name: RCLONE_CONFIG_SRC_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: ACCESS_KEY_ID + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_SRC_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: ACCESS_SECRET_KEY + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_SRC_REGION + valueFrom: + secretKeyRef: + key: ACCESS_REGION + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_SRC_ENDPOINT + valueFrom: + secretKeyRef: + key: SRC_ENDPOINT + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_SRC_FORCE_PATH_STYLE + value: "true" + - name: RCLONE_CONFIG_DEST_TYPE + value: s3 + - name: RCLONE_CONFIG_DEST_PROVIDER + value: Other + - name: RCLONE_CONFIG_DEST_ENV_AUTH + value: "false" + - name: RCLONE_CONFIG_DEST_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: ACCESS_KEY_ID + name:
garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_DEST_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: ACCESS_SECRET_KEY + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_DEST_REGION + valueFrom: + secretKeyRef: + key: ACCESS_REGION + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_DEST_ENDPOINT + valueFrom: + secretKeyRef: + key: DEST_ENDPOINT + name: garage-ntfy-attachments-secret + - name: RCLONE_CONFIG_DEST_FORCE_PATH_STYLE + value: "true" + image: rclone/rclone:1.73.3 + imagePullPolicy: IfNotPresent + name: sync diff --git a/clusters/cl01tl/manifests/rclone/ExternalSecret-garage-ntfy-attachments-secret.yaml b/clusters/cl01tl/manifests/rclone/ExternalSecret-garage-ntfy-attachments-secret.yaml new file mode 100644 index 000000000..41b2b6259 --- /dev/null +++ b/clusters/cl01tl/manifests/rclone/ExternalSecret-garage-ntfy-attachments-secret.yaml @@ -0,0 +1,49 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: garage-ntfy-attachments-secret + namespace: rclone + labels: + app.kubernetes.io/name: garage-ntfy-attachments-secret + app.kubernetes.io/instance: rclone + app.kubernetes.io/part-of: rclone +spec: + secretStoreRef: + kind: ClusterSecretStore + name: vault + data: + - secretKey: ACCESS_KEY_ID + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/ntfy-attachments + metadataPolicy: None + property: ACCESS_KEY_ID + - secretKey: ACCESS_REGION + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/ntfy-attachments + metadataPolicy: None + property: ACCESS_REGION + - secretKey: ACCESS_SECRET_KEY + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/ntfy-attachments + metadataPolicy: None + property: ACCESS_SECRET_KEY + - secretKey: SRC_ENDPOINT + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/config/local + metadataPolicy: None + property:
ENDPOINT + - secretKey: DEST_ENDPOINT + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/config/remote + metadataPolicy: None + property: ENDPOINT