diff --git a/clusters/cl01tl/manifests/blocky/ConfigMap-blocky.yaml b/clusters/cl01tl/manifests/blocky/ConfigMap-blocky.yaml index d97e98a05..0caabbfd1 100644 --- a/clusters/cl01tl/manifests/blocky/ConfigMap-blocky.yaml +++ b/clusters/cl01tl/manifests/blocky/ConfigMap-blocky.yaml @@ -129,7 +129,6 @@ data: ollama IN CNAME traefik-cl01tl omni-tools IN CNAME traefik-cl01tl overseerr IN CNAME traefik-cl01tl - pgadmin IN CNAME traefik-cl01tl photoview IN CNAME traefik-cl01tl plex IN CNAME traefik-cl01tl postiz IN CNAME traefik-cl01tl diff --git a/clusters/cl01tl/manifests/blocky/Deployment-blocky.yaml b/clusters/cl01tl/manifests/blocky/Deployment-blocky.yaml index f1eac31f0..70aef687f 100644 --- a/clusters/cl01tl/manifests/blocky/Deployment-blocky.yaml +++ b/clusters/cl01tl/manifests/blocky/Deployment-blocky.yaml @@ -22,7 +22,7 @@ spec: template: metadata: annotations: - checksum/configMaps: a0d9f2795d2d2013805f4996524f6a29bfa2576e1c3633068b6f011e9648b2e0 + checksum/configMaps: ca1ae3096857637a277dd8fc4536a8a2e830f60b13a7737ce35e587dd1c0f162 labels: app.kubernetes.io/controller: main app.kubernetes.io/instance: blocky diff --git a/clusters/cl01tl/manifests/gatus/ConfigMap-gatus.yaml b/clusters/cl01tl/manifests/gatus/ConfigMap-gatus.yaml index 51bb9de19..052750547 100644 --- a/clusters/cl01tl/manifests/gatus/ConfigMap-gatus.yaml +++ b/clusters/cl01tl/manifests/gatus/ConfigMap-gatus.yaml @@ -411,15 +411,6 @@ data: interval: 30s name: garage url: https://garage-webui.alexlebens.net - - alerts: - - type: ntfy - conditions: - - '[STATUS] == 200' - - '[CERTIFICATE_EXPIRATION] > 240h' - group: core - interval: 30s - name: pgadmin - url: https://pgadmin.alexlebens.net - alerts: - type: ntfy conditions: diff --git a/clusters/cl01tl/manifests/gatus/Deployment-gatus.yaml b/clusters/cl01tl/manifests/gatus/Deployment-gatus.yaml index 70c193134..7b053ce26 100644 --- a/clusters/cl01tl/manifests/gatus/Deployment-gatus.yaml +++ b/clusters/cl01tl/manifests/gatus/Deployment-gatus.yaml @@ 
-26,7 +26,7 @@ spec: app.kubernetes.io/name: gatus app.kubernetes.io/instance: gatus annotations: - checksum/config: 0a23a7fc337d39aea7dea5f7cb496e32467bf2856b90e3a0283ee49684422298 + checksum/config: 447b5241411f1849b091a8e97b007a6284e71b8990f97973979189590e17bbdf spec: serviceAccountName: default automountServiceAccountToken: false diff --git a/clusters/cl01tl/manifests/homepage/ConfigMap-homepage.yaml b/clusters/cl01tl/manifests/homepage/ConfigMap-homepage.yaml index 34ba695b2..a59d90b70 100644 --- a/clusters/cl01tl/manifests/homepage/ConfigMap-homepage.yaml +++ b/clusters/cl01tl/manifests/homepage/ConfigMap-homepage.yaml @@ -440,12 +440,6 @@ data: href: https://garage-ui-ps10rp.boreal-beaufort.ts.net siteMonitor: https://garage-ui-ps10rp.boreal-beaufort.ts.net statusStyle: dot - - Database: - icon: sh-pgadmin-light.webp - description: PGAdmin - href: https://pgadmin.alexlebens.net - siteMonitor: http://pgadmin.pgadmin:80 - statusStyle: dot - Database: icon: sh-whodb.webp description: WhoDB diff --git a/clusters/cl01tl/manifests/homepage/Deployment-homepage.yaml b/clusters/cl01tl/manifests/homepage/Deployment-homepage.yaml index ed8558e89..687c3cefc 100644 --- a/clusters/cl01tl/manifests/homepage/Deployment-homepage.yaml +++ b/clusters/cl01tl/manifests/homepage/Deployment-homepage.yaml @@ -24,7 +24,7 @@ spec: template: metadata: annotations: - checksum/configMaps: 45df1f089469334856418ad71a92e7aab49a18e2a0f222c66fcaebce3eac35c8 + checksum/configMaps: c16ee2840048690a8d35c3758e398da07fd7be50b7e003b5eae19bddc1b39a52 checksum/secrets: d3ba83f111cd32f92c909268c55ad8bbd4f9e299b74b35b33c1a011180d8b378 labels: app.kubernetes.io/controller: main diff --git a/clusters/cl01tl/manifests/outline/Cluster-outline-postgresql-18-cluster.yaml b/clusters/cl01tl/manifests/outline/Cluster-outline-postgresql-18-cluster.yaml new file mode 100644 index 000000000..3c7e72fad --- /dev/null +++ b/clusters/cl01tl/manifests/outline/Cluster-outline-postgresql-18-cluster.yaml @@ -0,0 +1,79 @@ 
+apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: outline-postgresql-18-cluster + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline + app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + instances: 3 + imageName: "ghcr.io/cloudnative-pg/postgresql:18.1-standard-trixie" + imagePullPolicy: IfNotPresent + postgresUID: 26 + postgresGID: 26 + plugins: + - name: barman-cloud.cloudnative-pg.io + enabled: true + isWALArchiver: false + parameters: + barmanObjectName: "outline-postgresql-18-external-backup" + serverName: "outline-postgresql-18-backup-1" + - name: barman-cloud.cloudnative-pg.io + enabled: true + isWALArchiver: true + parameters: + barmanObjectName: "outline-postgresql-18-garage-local-backup" + serverName: "outline-postgresql-18-backup-1" + storage: + size: 10Gi + storageClass: local-path + walStorage: + size: 2Gi + storageClass: local-path + resources: + limits: + hugepages-2Mi: 256Mi + requests: + cpu: 100m + memory: 256Mi + affinity: + enablePodAntiAffinity: true + topologyKey: kubernetes.io/hostname + primaryUpdateMethod: switchover + primaryUpdateStrategy: unsupervised + logLevel: info + enableSuperuserAccess: false + enablePDB: true + postgresql: + parameters: + hot_standby_feedback: "on" + max_slot_wal_keep_size: 2000MB + shared_buffers: 128MB + monitoring: + enablePodMonitor: true + disableDefaultQueries: false + bootstrap: + initdb: + import: + source: + externalCluster: importSource + type: microservice + databases: + - app + schemaOnly: false + externalClusters: + - name: importSource + connectionParameters: + host: "outline-postgresql-17-cluster-rw" + port: "5432" + user: "app" + dbname: "app" + sslmode: "disable" + password: + name: outline-postgresql-17-cluster-app + key: password diff --git 
a/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret-garage.yaml b/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret-garage.yaml new file mode 100644 index 000000000..179121f9f --- /dev/null +++ b/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret-garage.yaml @@ -0,0 +1,35 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: outline-postgresql-18-cluster-backup-secret-garage + namespace: outline + labels: + app.kubernetes.io/name: outline-postgresql-18-cluster-backup-secret-garage + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline +spec: + secretStoreRef: + kind: ClusterSecretStore + name: vault + data: + - secretKey: ACCESS_KEY_ID + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_KEY_ID + - secretKey: ACCESS_SECRET_KEY + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_SECRET_KEY + - secretKey: ACCESS_REGION + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /garage/home-infra/postgres-backups + metadataPolicy: None + property: ACCESS_REGION diff --git a/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret.yaml b/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret.yaml new file mode 100644 index 000000000..823f9d233 --- /dev/null +++ b/clusters/cl01tl/manifests/outline/ExternalSecret-outline-postgresql-18-cluster-backup-secret.yaml @@ -0,0 +1,28 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: outline-postgresql-18-cluster-backup-secret + namespace: outline + labels: + app.kubernetes.io/name: outline-postgresql-18-cluster-backup-secret + 
app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline +spec: + secretStoreRef: + kind: ClusterSecretStore + name: vault + data: + - secretKey: ACCESS_KEY_ID + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /digital-ocean/home-infra/postgres-backups + metadataPolicy: None + property: access + - secretKey: ACCESS_SECRET_KEY + remoteRef: + conversionStrategy: Default + decodingStrategy: None + key: /digital-ocean/home-infra/postgres-backups + metadataPolicy: None + property: secret diff --git a/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-external-backup.yaml b/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-external-backup.yaml new file mode 100644 index 000000000..11a0e15fb --- /dev/null +++ b/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-external-backup.yaml @@ -0,0 +1,24 @@ +apiVersion: barmancloud.cnpg.io/v1 +kind: ObjectStore +metadata: + name: "outline-postgresql-18-external-backup" + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline + app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + retentionPolicy: 30d + configuration: + destinationPath: s3://postgres-backups-ce540ddf106d186bbddca68a/cl01tl/outline/outline-postgresql-18-cluster + endpointURL: https://nyc3.digitaloceanspaces.com + s3Credentials: + accessKeyId: + name: outline-postgresql-18-cluster-backup-secret + key: ACCESS_KEY_ID + secretAccessKey: + name: outline-postgresql-18-cluster-backup-secret + key: ACCESS_SECRET_KEY diff --git a/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-garage-local-backup.yaml b/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-garage-local-backup.yaml new file mode 100644 index 000000000..7f53f5fdd --- /dev/null +++ 
b/clusters/cl01tl/manifests/outline/ObjectStore-outline-postgresql-18-garage-local-backup.yaml @@ -0,0 +1,27 @@ +apiVersion: barmancloud.cnpg.io/v1 +kind: ObjectStore +metadata: + name: "outline-postgresql-18-garage-local-backup" + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline + app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + retentionPolicy: 3d + configuration: + destinationPath: s3://postgres-backups/cl01tl/outline/outline-postgresql-18-cluster + endpointURL: http://garage-main.garage:3900 + s3Credentials: + accessKeyId: + name: outline-postgresql-18-cluster-backup-secret-garage + key: ACCESS_KEY_ID + secretAccessKey: + name: outline-postgresql-18-cluster-backup-secret-garage + key: ACCESS_SECRET_KEY + region: + name: outline-postgresql-18-cluster-backup-secret-garage + key: ACCESS_REGION diff --git a/clusters/cl01tl/manifests/outline/PrometheusRule-outline-postgresql-18-alert-rules.yaml b/clusters/cl01tl/manifests/outline/PrometheusRule-outline-postgresql-18-alert-rules.yaml new file mode 100644 index 000000000..d966090c8 --- /dev/null +++ b/clusters/cl01tl/manifests/outline/PrometheusRule-outline-postgresql-18-alert-rules.yaml @@ -0,0 +1,270 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: outline-postgresql-18-alert-rules + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline + app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + groups: + - name: cloudnative-pg/outline-postgresql-18 + rules: + - alert: CNPGClusterBackendsWaitingWarning + annotations: + summary: CNPG Cluster a backend is waiting for longer than 5 minutes. 
+ description: |- + Pod {{ $labels.pod }} + has been waiting for longer than 5 minutes + expr: | + cnpg_backends_waiting_total > 300 + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterDatabaseDeadlockConflictsWarning + annotations: + summary: CNPG Cluster has over 10 deadlock conflicts. + description: |- + There are over 10 deadlock conflicts in + {{ $labels.pod }} + expr: | + cnpg_pg_stat_database_deadlocks > 10 + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterHACritical + annotations: + summary: CNPG Cluster has no standby replicas! + description: |- + CloudNativePG Cluster "{{`{{`}} $labels.job {{`}}`}}" has no ready standby replicas. Your cluster is at a severe + risk of data loss and downtime if the primary instance fails. + + The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint + will fail. The `-r` endpoint is operating at reduced capacity and all traffic is being served by the main. + + This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or fewer + instances. The replaced instance may need some time to catch-up with the cluster primary instance. + + This alarm will always trigger if your cluster is configured to run with only 1 instance. In this + case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="outline"} - cnpg_pg_replication_is_wal_receiver_up{namespace="outline"}) < 1 + for: 5m + labels: + severity: critical + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterHAWarning + annotations: + summary: CNPG Cluster has fewer than 2 standby replicas. 
+ description: |- + CloudNativePG Cluster "{{`{{`}} $labels.job {{`}}`}}" has only {{`{{`}} $value {{`}}`}} standby replicas, putting + your cluster at risk if another instance fails. The cluster is still able to operate normally, although + the `-ro` and `-r` endpoints operate at reduced capacity. + + This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may + need some time to catch-up with the cluster primary instance. + + This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. + In this case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="outline"} - cnpg_pg_replication_is_wal_receiver_up{namespace="outline"}) < 2 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterHighConnectionsCritical + annotations: + summary: CNPG Instance maximum number of connections critical! + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md + expr: | + sum by (pod) (cnpg_backends_total{namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 95 + for: 5m + labels: + severity: critical + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterHighConnectionsWarning + annotations: + summary: CNPG Instance is approaching the maximum number of connections. 
+ description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md + expr: | + sum by (pod) (cnpg_backends_total{namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 80 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterHighReplicationLag + annotations: + summary: CNPG Cluster high replication lag + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" is experiencing a high replication lag of + {{`{{`}} $value {{`}}`}}ms. + + High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md + expr: | + max(cnpg_pg_replication_lag{namespace="outline",pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) * 1000 > 1000 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterInstancesOnSameNode + annotations: + summary: CNPG Cluster instances are located on the same node. + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" has {{`{{`}} $value {{`}}`}} + instances on the same node {{`{{`}} $labels.node {{`}}`}}. + + A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md + expr: | + count by (node) (kube_pod_info{namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) > 1 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterLongRunningTransactionWarning + annotations: + summary: CNPG Cluster query is taking longer than 5 minutes. + description: |- + CloudNativePG Cluster Pod {{ $labels.pod }} + is taking more than 5 minutes (300 seconds) for a query. + expr: |- + cnpg_backends_max_tx_duration_seconds > 300 + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterLowDiskSpaceCritical + annotations: + summary: CNPG Instance is running out of disk space! + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" is running extremely low on disk space. Check attached PVCs! 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.9 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.9 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"} + ) > 0.9 + for: 5m + labels: + severity: critical + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterLowDiskSpaceWarning + annotations: + summary: CNPG Instance is running out of disk space. + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" is running low on disk space. Check attached PVCs. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.7 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.7 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="outline", persistentvolumeclaim=~"outline-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"} + ) > 0.7 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterOffline + annotations: + summary: CNPG Cluster has no running instances! + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" has no ready instances. + + Having an offline cluster means your applications will not be able to access the database, leading to + potential service disruption and/or data loss. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md + expr: | + (count(cnpg_collector_up{namespace="outline",pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"}) OR on() vector(0)) == 0 + for: 5m + labels: + severity: critical + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterPGDatabaseXidAgeWarning + annotations: + summary: CNPG Cluster has a number of transactions from the frozen XID to the current one. + description: |- + Over 300,000,000 transactions from frozen xid + on pod {{ $labels.pod }} + expr: | + cnpg_pg_database_xid_age > 300000000 + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterPGReplicationWarning + annotations: + summary: CNPG Cluster standby is lagging behind the primary. + description: |- + Standby is lagging behind by over 300 seconds (5 minutes) + expr: | + cnpg_pg_replication_lag > 300 + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterReplicaFailingReplicationWarning + annotations: + summary: CNPG Cluster has a replica that is failing to replicate. + description: |- + Replica {{ $labels.pod }} + is failing to replicate + expr: | + cnpg_pg_replication_in_recovery > cnpg_pg_replication_is_wal_receiver_up + for: 1m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster + - alert: CNPGClusterZoneSpreadWarning + annotations: + summary: CNPG Cluster has instances in the same zone. + description: |- + CloudNativePG Cluster "outline/outline-postgresql-18-cluster" has instances in the same availability zone. + + A disaster in one availability zone will lead to a potential service disruption and/or data loss. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md + expr: | + 3 > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="outline", pod=~"outline-postgresql-18-cluster-([1-9][0-9]*)$"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 + for: 5m + labels: + severity: warning + namespace: outline + cnpg_cluster: outline-postgresql-18-cluster diff --git a/clusters/cl01tl/manifests/outline/RedisReplication-redis-replication-outline.yaml b/clusters/cl01tl/manifests/outline/RedisReplication-redis-replication-outline.yaml index 9950e4415..b73317956 100644 --- a/clusters/cl01tl/manifests/outline/RedisReplication-redis-replication-outline.yaml +++ b/clusters/cl01tl/manifests/outline/RedisReplication-redis-replication-outline.yaml @@ -13,7 +13,7 @@ spec: runAsUser: 1000 fsGroup: 1000 kubernetesConfig: - image: quay.io/opstree/redis:v8.0.3 + image: quay.io/opstree/redis:v8.4.0 imagePullPolicy: IfNotPresent resources: requests: @@ -29,4 +29,4 @@ spec: storage: 1Gi redisExporter: enabled: true - image: quay.io/opstree/redis-exporter:v1.48.0 + image: quay.io/opstree/redis-exporter:v1.80.1 diff --git a/clusters/cl01tl/manifests/outline/RedisSentinel-redis-sentinel-outline.yaml b/clusters/cl01tl/manifests/outline/RedisSentinel-redis-sentinel-outline.yaml new file mode 100644 index 000000000..eb0a20d24 --- /dev/null +++ b/clusters/cl01tl/manifests/outline/RedisSentinel-redis-sentinel-outline.yaml @@ -0,0 +1,23 @@ +apiVersion: redis.redis.opstreelabs.in/v1beta2 +kind: RedisSentinel +metadata: + name: redis-sentinel-outline + namespace: outline + labels: + app.kubernetes.io/name: redis-sentinel-outline + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline +spec: + clusterSize: 3 + podSecurityContext: + runAsUser: 1000 + fsGroup: 1000 + redisSentinelConfig: + redisReplicationName: redis-replication-outline + kubernetesConfig: + 
image: quay.io/opstree/redis-sentinel:v8.4.0 + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 10m + memory: 128Mi diff --git a/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-daily-backup-scheduled-backup.yaml b/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-daily-backup-scheduled-backup.yaml new file mode 100644 index 000000000..6e03a1ec5 --- /dev/null +++ b/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-daily-backup-scheduled-backup.yaml @@ -0,0 +1,24 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: "outline-postgresql-18-daily-backup-scheduled-backup" + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline + app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + immediate: true + suspend: false + schedule: "0 0 0 * * *" + backupOwnerReference: self + cluster: + name: outline-postgresql-18-cluster + method: plugin + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + parameters: + barmanObjectName: "outline-postgresql-18-external-backup" diff --git a/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-live-backup-scheduled-backup.yaml b/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-live-backup-scheduled-backup.yaml new file mode 100644 index 000000000..6e5dabd19 --- /dev/null +++ b/clusters/cl01tl/manifests/outline/ScheduledBackup-outline-postgresql-18-live-backup-scheduled-backup.yaml @@ -0,0 +1,24 @@ +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: "outline-postgresql-18-live-backup-scheduled-backup" + namespace: outline + labels: + helm.sh/chart: postgres-18-cluster-6.16.1 + app.kubernetes.io/name: outline-postgresql-18 + app.kubernetes.io/instance: outline + app.kubernetes.io/part-of: outline 
+ app.kubernetes.io/version: "6.16.1" + app.kubernetes.io/managed-by: Helm +spec: + immediate: true + suspend: false + schedule: "0 0 0 * * *" + backupOwnerReference: self + cluster: + name: outline-postgresql-18-cluster + method: plugin + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + parameters: + barmanObjectName: "outline-postgresql-18-garage-local-backup" diff --git a/clusters/cl01tl/manifests/pgadmin/Deployment-pgadmin.yaml b/clusters/cl01tl/manifests/pgadmin/Deployment-pgadmin.yaml deleted file mode 100644 index 32877508f..000000000 --- a/clusters/cl01tl/manifests/pgadmin/Deployment-pgadmin.yaml +++ /dev/null @@ -1,84 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pgadmin - labels: - app.kubernetes.io/controller: main - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: pgadmin - helm.sh/chart: pgadmin4-4.5.0 - namespace: pgadmin -spec: - revisionHistoryLimit: 3 - replicas: 1 - strategy: - type: Recreate - selector: - matchLabels: - app.kubernetes.io/controller: main - app.kubernetes.io/name: pgadmin - app.kubernetes.io/instance: pgadmin - template: - metadata: - labels: - app.kubernetes.io/controller: main - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/name: pgadmin - spec: - enableServiceLinks: false - serviceAccountName: default - automountServiceAccountToken: true - hostIPC: false - hostNetwork: false - hostPID: false - dnsPolicy: ClusterFirst - initContainers: - - command: - - /bin/sh - - -ec - - | - /bin/chown -R 5050:5050 /var/lib/pgadmin - image: busybox:1.37.0 - imagePullPolicy: IfNotPresent - name: init-chmod-data - resources: - requests: - cpu: 10m - memory: 128Mi - securityContext: - runAsUser: 0 - volumeMounts: - - mountPath: /var/lib/pgadmin - name: data - containers: - - env: - - name: PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION - value: "False" - - name: PGADMIN_DEFAULT_EMAIL - value: alexanderlebens@gmail.com - - name: PGADMIN_DEFAULT_PASSWORD - 
valueFrom: - secretKeyRef: - key: pgadmin-password - name: pgadmin-password-secret - envFrom: - - secretRef: - name: pgadmin-env-secret - image: dpage/pgadmin4:9.11 - imagePullPolicy: IfNotPresent - name: main - resources: - requests: - cpu: 10m - memory: 256Mi - securityContext: - runAsGroup: 5050 - runAsUser: 5050 - volumeMounts: - - mountPath: /var/lib/pgadmin - name: data - volumes: - - name: data - persistentVolumeClaim: - claimName: pgadmin4-data diff --git a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-data-backup-secret.yaml b/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-data-backup-secret.yaml deleted file mode 100644 index 6716d94b6..000000000 --- a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-data-backup-secret.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: pgadmin-data-backup-secret - namespace: pgadmin - labels: - app.kubernetes.io/name: pgadmin-data-backup-secret - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/part-of: pgadmin -spec: - secretStoreRef: - kind: ClusterSecretStore - name: vault - target: - template: - mergePolicy: Merge - engineVersion: v2 - data: - RESTIC_REPOSITORY: "{{ .BUCKET_ENDPOINT }}/pgadmin/pgadmin-data" - data: - - secretKey: BUCKET_ENDPOINT - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/volsync/restic/config - metadataPolicy: None - property: S3_BUCKET_ENDPOINT - - secretKey: RESTIC_PASSWORD - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/volsync/restic/config - metadataPolicy: None - property: RESTIC_PASSWORD - - secretKey: AWS_DEFAULT_REGION - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/volsync/restic/config - metadataPolicy: None - property: AWS_DEFAULT_REGION - - secretKey: AWS_ACCESS_KEY_ID - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /digital-ocean/home-infra/volsync-backups 
- metadataPolicy: None - property: access_key - - secretKey: AWS_SECRET_ACCESS_KEY - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /digital-ocean/home-infra/volsync-backups - metadataPolicy: None - property: secret_key diff --git a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-env-secret.yaml b/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-env-secret.yaml deleted file mode 100644 index 520910d39..000000000 --- a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-env-secret.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: pgadmin-env-secret - namespace: pgadmin - labels: - app.kubernetes.io/name: pgadmin-env-secret - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/part-of: pgadmin -spec: - secretStoreRef: - kind: ClusterSecretStore - name: vault - data: - - secretKey: PGADMIN_CONFIG_AUTHENTICATION_SOURCES - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/pgadmin/env - metadataPolicy: None - property: PGADMIN_CONFIG_AUTHENTICATION_SOURCES - - secretKey: PGADMIN_CONFIG_OAUTH2_AUTO_CREATE_USER - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/pgadmin/env - metadataPolicy: None - property: PGADMIN_CONFIG_OAUTH2_AUTO_CREATE_USER - - secretKey: PGADMIN_CONFIG_OAUTH2_CONFIG - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/pgadmin/env - metadataPolicy: None - property: PGADMIN_CONFIG_OAUTH2_CONFIG diff --git a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-password-secret.yaml b/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-password-secret.yaml deleted file mode 100644 index 79c5fe575..000000000 --- a/clusters/cl01tl/manifests/pgadmin/ExternalSecret-pgadmin-password-secret.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: pgadmin-password-secret - namespace: pgadmin - 
labels: - app.kubernetes.io/name: pgadmin-password-secret - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/part-of: pgadmin -spec: - secretStoreRef: - kind: ClusterSecretStore - name: vault - data: - - secretKey: pgadmin-password - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: /cl01tl/pgadmin/auth - metadataPolicy: None - property: pgadmin-password diff --git a/clusters/cl01tl/manifests/pgadmin/HTTPRoute-http-route-pgadmin.yaml b/clusters/cl01tl/manifests/pgadmin/HTTPRoute-http-route-pgadmin.yaml deleted file mode 100644 index 3f5236055..000000000 --- a/clusters/cl01tl/manifests/pgadmin/HTTPRoute-http-route-pgadmin.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: http-route-pgadmin - namespace: pgadmin - labels: - app.kubernetes.io/name: http-route-pgadmin - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/part-of: pgadmin -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: traefik-gateway - namespace: traefik - hostnames: - - pgadmin.alexlebens.net - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - group: '' - kind: Service - name: pgadmin - port: 80 - weight: 100 diff --git a/clusters/cl01tl/manifests/pgadmin/PersistentVolumeClaim-pgadmin4-data.yaml b/clusters/cl01tl/manifests/pgadmin/PersistentVolumeClaim-pgadmin4-data.yaml deleted file mode 100644 index 68bf5cbb3..000000000 --- a/clusters/cl01tl/manifests/pgadmin/PersistentVolumeClaim-pgadmin4-data.yaml +++ /dev/null @@ -1,19 +0,0 @@ -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: pgadmin4-data - labels: - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: pgadmin - helm.sh/chart: pgadmin4-4.5.0 - annotations: - helm.sh/resource-policy: keep - namespace: pgadmin -spec: - accessModes: - - "ReadWriteOnce" - resources: - requests: - storage: "5Gi" - storageClassName: "ceph-block" diff 
--git a/clusters/cl01tl/manifests/pgadmin/ReplicationSource-pgadmin-data-backup-source.yaml b/clusters/cl01tl/manifests/pgadmin/ReplicationSource-pgadmin-data-backup-source.yaml deleted file mode 100644 index e574eb98a..000000000 --- a/clusters/cl01tl/manifests/pgadmin/ReplicationSource-pgadmin-data-backup-source.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: volsync.backube/v1alpha1 -kind: ReplicationSource -metadata: - name: pgadmin-data-backup-source - namespace: pgadmin - labels: - app.kubernetes.io/name: pgadmin-data-backup-source - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/part-of: pgadmin -spec: - sourcePVC: pgadmin-data - trigger: - schedule: 0 4 * * * - restic: - pruneIntervalDays: 7 - repository: pgadmin-data-backup-secret - retain: - hourly: 1 - daily: 3 - weekly: 2 - monthly: 2 - yearly: 4 - moverSecurityContext: - runAsUser: 5050 - runAsGroup: 5050 - copyMethod: Snapshot - storageClassName: ceph-block - volumeSnapshotClassName: ceph-blockpool-snapshot diff --git a/clusters/cl01tl/manifests/pgadmin/Service-pgadmin.yaml b/clusters/cl01tl/manifests/pgadmin/Service-pgadmin.yaml deleted file mode 100644 index 2bd06eda8..000000000 --- a/clusters/cl01tl/manifests/pgadmin/Service-pgadmin.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: pgadmin - labels: - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: pgadmin - app.kubernetes.io/service: pgadmin - helm.sh/chart: pgadmin4-4.5.0 - namespace: pgadmin -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 80 - protocol: TCP - name: http - selector: - app.kubernetes.io/controller: main - app.kubernetes.io/instance: pgadmin - app.kubernetes.io/name: pgadmin