# infrastructure/clusters/cl01tl/manifests/directus/Cluster-directus-postgresql-18-cluster.yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: directus-postgresql-18-cluster
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: directus-postgresql-18
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
spec:
instances: 3
imageName: "ghcr.io/cloudnative-pg/postgresql:18.3-standard-trixie"
imagePullPolicy: IfNotPresent
postgresUID: 26
postgresGID: 26
storage:
size: 10Gi
storageClass: local-path
walStorage:
size: 2Gi
storageClass: local-path
resources:
limits:
hugepages-2Mi: 256Mi
requests:
cpu: 100m
memory: 256Mi
affinity:
enablePodAntiAffinity: true
topologyKey: kubernetes.io/hostname
primaryUpdateMethod: switchover
primaryUpdateStrategy: unsupervised
logLevel: info
enableSuperuserAccess: false
enablePDB: true
postgresql:
parameters:
hot_standby_feedback: "on"
max_slot_wal_keep_size: 2000MB
shared_buffers: 128MB
monitoring:
enablePodMonitor: true
disableDefaultQueries: false
plugins:
- name: barman-cloud.cloudnative-pg.io
enabled: true
isWALArchiver: true
parameters:
barmanObjectName: "directus-postgresql-18-backup-garage-local"
serverName: "directus-postgresql-18-backup-1"
bootstrap:
recovery:
database: app
source: directus-postgresql-18-backup-1
externalClusters:
- name: directus-postgresql-18-backup-1
plugin:
name: barman-cloud.cloudnative-pg.io
enabled: true
isWALArchiver: false
parameters:
barmanObjectName: "directus-postgresql-18-recovery"
serverName: directus-postgresql-18-backup-1
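# The cluster above bootstraps by restoring from the external cluster
# "directus-postgresql-18-backup-1" through the barman-cloud plugin, then resumes
# WAL archiving to the "directus-postgresql-18-backup-garage-local" object store.
# A minimal sketch for checking that recovery finished, assuming the kubectl-cnpg
# plugin is installed:
#
#   kubectl cnpg status directus-postgresql-18-cluster -n directus
#   kubectl get cluster directus-postgresql-18-cluster -n directus \
#     -o jsonpath='{.status.phase}'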
---
apiVersion: v1
kind: ConfigMap
metadata:
name: directus-valkey-init-scripts
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
data:
init.sh: |-
#!/bin/sh
set -eu
# Default config paths
VALKEY_CONFIG=${VALKEY_CONFIG_PATH:-/data/conf/valkey.conf}
LOGFILE="/data/init.log"
DATA_DIR="/data/conf"
# Logging function (outputs to stderr and file)
log() {
echo "$(date) $1" | tee -a "$LOGFILE" >&2
}
# Function to get password for a user
# Usage: get_user_password <username> [password_key]
# Returns: password via stdout, exits with error if not found
get_user_password() {
username="$1"
password_key="${2:-$username}"
password=""
# Try to get password from existing secret first (priority)
if [ -f "/valkey-users-secret/$password_key" ]; then
password=$(cat "/valkey-users-secret/$password_key")
log "Using password from existing secret for user $username"
elif [ -f "/valkey-auth-secret/${username}-password" ]; then
# Fallback to inline password
password=$(cat "/valkey-auth-secret/${username}-password")
log "Using inline password for user $username"
else
log "ERROR: No password found for user $username"
return 1
fi
echo "$password"
}
# Clean old log if requested
if [ "${KEEP_OLD_LOGS:-false}" != "true" ]; then
rm -f "$LOGFILE"
fi
if [ -f "$LOGFILE" ]; then
log "Detected restart of this instance ($HOSTNAME)"
fi
log "Creating configuration in $DATA_DIR..."
mkdir -p "$DATA_DIR"
rm -f "$VALKEY_CONFIG"
# Base valkey.conf
log "Generating base valkey.conf"
{
echo "port 6379"
echo "protected-mode no"
echo "bind * -::*"
echo "dir /data"
} >>"$VALKEY_CONFIG"
# Create secure directory for ACL file
log "Creating /etc/valkey directory for ACL file"
mkdir -p /etc/valkey
# Set aclfile path in valkey.conf
echo "aclfile /etc/valkey/users.acl" >>"$VALKEY_CONFIG"
# Remove or reset existing ACL file if present (it may be read-only from previous run)
log "Preparing ACL file at /etc/valkey/users.acl"
if [ -f /etc/valkey/users.acl ]; then
log "Removing existing read-only users.acl file"
chmod 0600 /etc/valkey/users.acl
rm -f /etc/valkey/users.acl
fi
# Create ACL file with secure permissions
touch /etc/valkey/users.acl
chmod 0600 /etc/valkey/users.acl
# Generate ACL entries for each user
log "Generating ACL entries for users"
# User: default
PASSWORD=$(get_user_password "default" "default") || exit 1
# Hash the password and write the ACL entry (printf rather than echo -n, which is not portable across /bin/sh implementations)
PASSHASH=$(printf '%s' "$PASSWORD" | sha256sum | cut -f 1 -d " ")
echo "user default on #$PASSHASH ~* &* +@all" >> /etc/valkey/users.acl
# Set final permissions
chmod 0400 /etc/valkey/users.acl
log "ACL file created with 0400 permissions"
# Replica mode configuration
log "Configuring replication mode"
# Use POD_INDEX from Kubernetes metadata
POD_INDEX=${POD_INDEX:-0}
IS_MASTER=false
# Check if this is pod-0 (master)
if [ "$POD_INDEX" = "0" ]; then
IS_MASTER=true
log "This pod (index $POD_INDEX) is configured as MASTER"
else
log "This pod (index $POD_INDEX) is configured as REPLICA"
fi
# Configure replica settings
if [ "$IS_MASTER" = "false" ]; then
MASTER_HOST="directus-valkey-0.directus-valkey-headless.directus.svc.cluster.local"
MASTER_PORT="6379"
log "Configuring replica to follow master at $MASTER_HOST:$MASTER_PORT"
{
echo ""
echo "# Replica Configuration"
echo "replicaof $MASTER_HOST $MASTER_PORT"
echo "replica-announce-ip directus-valkey-$POD_INDEX.directus-valkey-headless.directus.svc.cluster.local"
echo ""
echo "# Master authentication"
} >>"$VALKEY_CONFIG"
# Get the password for the replication user
REPL_PASSWORD=$(get_user_password "default" "default") || exit 1
# Write masterauth configuration
echo "masterauth $REPL_PASSWORD" >>"$VALKEY_CONFIG"
echo "masteruser default" >>"$VALKEY_CONFIG"
log "Configured masterauth with user default"
fi
# Append extra configs if present
if [ -f /usr/local/etc/valkey/valkey.conf ]; then
log "Appending /usr/local/etc/valkey/valkey.conf"
cat /usr/local/etc/valkey/valkey.conf >>"$VALKEY_CONFIG"
fi
if [ -d /extravalkeyconfigs ]; then
log "Appending files in /extravalkeyconfigs/"
cat /extravalkeyconfigs/* >>"$VALKEY_CONFIG"
fi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: directus
labels:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: directus
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus
helm.sh/chart: directus-4.6.2
namespace: directus
spec:
revisionHistoryLimit: 3
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app.kubernetes.io/controller: main
app.kubernetes.io/name: directus
app.kubernetes.io/instance: directus
template:
metadata:
labels:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: directus
app.kubernetes.io/name: directus
spec:
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
hostIPC: false
hostNetwork: false
hostPID: false
dnsPolicy: ClusterFirst
containers:
- env:
- name: PUBLIC_URL
value: https://directus.alexlebens.net
- name: WEBSOCKETS_ENABLED
value: "true"
- name: ADMIN_EMAIL
valueFrom:
secretKeyRef:
key: admin-email
name: directus-config
- name: ADMIN_PASSWORD
valueFrom:
secretKeyRef:
key: admin-password
name: directus-config
- name: SECRET
valueFrom:
secretKeyRef:
key: secret
name: directus-config
- name: KEY
valueFrom:
secretKeyRef:
key: key
name: directus-config
- name: DB_CLIENT
value: postgres
- name: DB_HOST
valueFrom:
secretKeyRef:
key: host
name: directus-postgresql-18-cluster-app
- name: DB_DATABASE
valueFrom:
secretKeyRef:
key: dbname
name: directus-postgresql-18-cluster-app
- name: DB_PORT
valueFrom:
secretKeyRef:
key: port
name: directus-postgresql-18-cluster-app
- name: DB_USER
valueFrom:
secretKeyRef:
key: user
name: directus-postgresql-18-cluster-app
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
key: password
name: directus-postgresql-18-cluster-app
- name: SYNCHRONIZATION_STORE
value: redis
- name: CACHE_ENABLED
value: "true"
- name: CACHE_STORE
value: redis
- name: REDIS_ENABLED
value: "true"
- name: REDIS_HOST
value: directus-valkey
- name: REDIS_USERNAME
valueFrom:
secretKeyRef:
key: user
name: directus-valkey-config
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
key: password
name: directus-valkey-config
- name: STORAGE_LOCATIONS
value: s3
- name: STORAGE_S3_DRIVER
value: s3
- name: STORAGE_S3_KEY
valueFrom:
secretKeyRef:
key: ACCESS_KEY_ID
name: directus-bucket-garage
- name: STORAGE_S3_SECRET
valueFrom:
secretKeyRef:
key: ACCESS_SECRET_KEY
name: directus-bucket-garage
- name: STORAGE_S3_REGION
valueFrom:
secretKeyRef:
key: ACCESS_REGION
name: directus-bucket-garage
- name: STORAGE_S3_BUCKET
value: directus-assets
- name: STORAGE_S3_ENDPOINT
value: http://garage-main.garage:3900
- name: STORAGE_S3_FORCE_PATH_STYLE
value: "true"
- name: AUTH_PROVIDERS
value: AUTHENTIK
- name: AUTH_AUTHENTIK_DRIVER
value: openid
- name: AUTH_AUTHENTIK_CLIENT_ID
valueFrom:
secretKeyRef:
key: OIDC_CLIENT_ID
name: directus-oidc-secret
- name: AUTH_AUTHENTIK_CLIENT_SECRET
valueFrom:
secretKeyRef:
key: OIDC_CLIENT_SECRET
name: directus-oidc-secret
- name: AUTH_AUTHENTIK_SCOPE
value: openid profile email
- name: AUTH_AUTHENTIK_ISSUER_URL
value: https://authentik.alexlebens.net/application/o/directus/.well-known/openid-configuration
- name: AUTH_AUTHENTIK_IDENTIFIER_KEY
value: email
- name: AUTH_AUTHENTIK_ALLOW_PUBLIC_REGISTRATION
value: "true"
- name: AUTH_AUTHENTIK_LABEL
value: Authentik
- name: TELEMETRY
value: "false"
- name: METRICS_ENABLED
value: "true"
- name: METRICS_TOKENS
valueFrom:
secretKeyRef:
key: metric-token
name: directus-metric-token
image: directus/directus:11.16.1
imagePullPolicy: IfNotPresent
name: main
resources:
requests:
cpu: 10m
memory: 256Mi
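# The Deployment wires Directus to the CNPG cluster (DB_* keys come from the
# auto-generated "-app" secret), the Valkey primary Service, and Garage S3
# storage. A quick smoke test once the pod is ready; /server/health is Directus'
# built-in health endpoint (assumed from the Directus docs, verify for your
# version):
#
#   kubectl -n directus port-forward deploy/directus 8055:8055 &
#   curl -s http://localhost:8055/server/health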
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-bucket-garage
namespace: directus
labels:
app.kubernetes.io/name: directus-bucket-garage
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/directus-assets
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: ACCESS_SECRET_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/directus-assets
metadataPolicy: None
property: ACCESS_SECRET_KEY
- secretKey: ACCESS_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/directus-assets
metadataPolicy: None
property: ACCESS_REGION
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-config
namespace: directus
labels:
app.kubernetes.io/name: directus-config
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: admin-email
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/config
metadataPolicy: None
property: admin-email
- secretKey: admin-password
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/config
metadataPolicy: None
property: admin-password
- secretKey: secret
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/config
metadataPolicy: None
property: secret
- secretKey: key
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/config
metadataPolicy: None
property: key
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-metric-token
namespace: directus
labels:
app.kubernetes.io/name: directus-metric-token
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: metric-token
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/metrics
metadataPolicy: None
property: metric-token
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-oidc-secret
namespace: directus
labels:
app.kubernetes.io/name: directus-oidc-secret
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: OIDC_CLIENT_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /authentik/oidc/directus
metadataPolicy: None
property: client
- secretKey: OIDC_CLIENT_SECRET
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /authentik/oidc/directus
metadataPolicy: None
property: secret
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-postgresql-18-backup-garage-local-secret
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus-postgresql-18-backup-garage-local-secret
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: ACCESS_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: ACCESS_SECRET_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-postgresql-18-recovery-secret
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus-postgresql-18-recovery-secret
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: ACCESS_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: ACCESS_SECRET_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: directus-valkey-config
namespace: directus
labels:
app.kubernetes.io/name: directus-valkey-config
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: default
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/valkey
metadataPolicy: None
property: password
- secretKey: user
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/valkey
metadataPolicy: None
property: user
- secretKey: password
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/directus/valkey
metadataPolicy: None
property: password
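# Note that directus-valkey-config maps the same Vault property (password) to
# two secret keys: "default", which the init script reads from
# /valkey-users-secret to build the ACL entry for the default user, and
# "password", which Directus consumes as REDIS_PASSWORD. This keeps the server
# ACL and the client in sync from a single Vault value.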
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: directus
labels:
app.kubernetes.io/instance: directus
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus
helm.sh/chart: directus-4.6.2
namespace: directus
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: traefik-gateway
namespace: traefik
hostnames:
- "directus.alexlebens.net"
rules:
- backendRefs:
- group: ""
kind: Service
name: directus
namespace: directus
port: 80
weight: 100
matches:
- path:
type: PathPrefix
value: /
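# The HTTPRoute attaches to the shared Traefik Gateway in the traefik namespace
# and sends all paths for directus.alexlebens.net to the directus Service. A
# sketch for confirming the Gateway controller accepted the route:
#
#   kubectl -n directus get httproute directus \
#     -o jsonpath='{.status.parents[0].conditions}'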
---
apiVersion: barmancloud.cnpg.io/v1
kind: ObjectStore
metadata:
name: directus-postgresql-18-backup-garage-local
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus-postgresql-18-backup-garage-local
spec:
retentionPolicy: 7d
instanceSidecarConfiguration:
env:
- name: AWS_REQUEST_CHECKSUM_CALCULATION
value: when_required
- name: AWS_RESPONSE_CHECKSUM_VALIDATION
value: when_required
configuration:
destinationPath: s3://postgres-backups/cl01tl/directus/directus-postgresql-18-cluster
endpointURL: http://garage-main.garage:3900
s3Credentials:
accessKeyId:
name: directus-postgresql-18-backup-garage-local-secret
key: ACCESS_KEY_ID
secretAccessKey:
name: directus-postgresql-18-backup-garage-local-secret
key: ACCESS_SECRET_KEY
region:
name: directus-postgresql-18-backup-garage-local-secret
key: ACCESS_REGION
---
apiVersion: barmancloud.cnpg.io/v1
kind: ObjectStore
metadata:
name: "directus-postgresql-18-recovery"
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: directus-postgresql-18
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: "directus-postgresql-18-recovery"
spec:
configuration:
destinationPath: s3://postgres-backups/cl01tl/directus/directus-postgresql-18-cluster
endpointURL: http://garage-main.garage:3900
wal:
compression: snappy
maxParallel: 1
data:
compression: snappy
jobs: 1
s3Credentials:
accessKeyId:
name: directus-postgresql-18-recovery-secret
key: ACCESS_KEY_ID
secretAccessKey:
name: directus-postgresql-18-recovery-secret
key: ACCESS_SECRET_KEY
region:
name: directus-postgresql-18-recovery-secret
key: ACCESS_REGION
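# Both ObjectStores point at the same destinationPath: the backup store is
# written to by the live cluster's WAL archiver, while the recovery store is
# only read by the bootstrap stanza. A sketch for listing the archived backups,
# assuming the barman-cloud CLI and the credentials from the secret are
# available locally:
#
#   barman-cloud-backup-list \
#     --endpoint-url http://garage-main.garage:3900 \
#     s3://postgres-backups/cl01tl/directus/directus-postgresql-18-cluster \
#     directus-postgresql-18-backup-1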
---
apiVersion: v1
kind: Pod
metadata:
name: directus-valkey-test-auth-existing
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": test
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
restartPolicy: Never
containers:
- name: test-auth
image: "valkey/valkey:9.0.3"
command:
- sh
- -c
- |
set -e
echo "Testing authentication with usersExistingSecret..."
TLS_FLAGS=""
# Test basic connection (no auth - will fail if auth is properly configured)
PING_RESULT=$(valkey-cli -h directus-valkey -p 6379 $TLS_FLAGS PING 2>&1 || true)
if [ "$PING_RESULT" = "PONG" ]; then
echo "✗ Authentication test failed: server allows unauthenticated access"
exit 1
fi
echo "✓ Authentication is enforced (unauthenticated access denied)"
echo "✓ Received expected error: $PING_RESULT"
echo "⚠ Manual verification recommended for usersExistingSecret configuration"
exit 0
volumeMounts:
- name: valkey-users-secret
mountPath: /valkey-users-secret
readOnly: true
volumes:
- name: valkey-users-secret
secret:
secretName: directus-valkey-config
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: directus-postgresql-18-alert-rules
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: directus-postgresql-18
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
spec:
groups:
- name: cloudnative-pg/directus-postgresql-18
rules:
- alert: CNPGClusterBackendsWaitingWarning
annotations:
summary: CNPG Cluster has a backend waiting for longer than 5 minutes.
description: |-
Pod {{ $labels.pod }}
has been waiting for longer than 5 minutes
expr: |
cnpg_backends_waiting_total{namespace="directus"} > 300
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterDatabaseDeadlockConflictsWarning
annotations:
summary: CNPG Cluster has over 10 deadlock conflicts.
description: |-
There are over 10 deadlock conflicts in
{{ $labels.pod }}
expr: |
cnpg_pg_stat_database_deadlocks{namespace="directus"} > 10
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterHACritical
annotations:
summary: CNPG Cluster has no standby replicas!
description: |-
CloudNativePG Cluster "{{`{{`}} $labels.job {{`}}`}}" has no ready standby replicas. Your cluster at a severe
risk of data loss and downtime if the primary instance fails.
The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint
will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main.
This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less
instances. The replaced instance may need some time to catch-up with the cluster primary instance.
This alarm will be always trigger if your cluster is configured to run with only 1 instance. In this
case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="directus"} - cnpg_pg_replication_is_wal_receiver_up{namespace="directus"}) < 1
for: 5m
labels:
severity: critical
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterHAWarning
annotations:
summary: CNPG Cluster has fewer than 2 standby replicas.
description: |-
CloudNativePG Cluster "{{ $labels.job }}" has only {{ $value }} standby replicas, putting
your cluster at risk if another instance fails. The cluster is still able to operate normally, although
the `-ro` and `-r` endpoints operate at reduced capacity.
This can happen during a normal failover or automated minor version upgrade. The replaced instance may
need some time to catch up with the cluster primary instance.
This alarm will be constantly triggered if your cluster is configured to run with fewer than 3 instances.
In this case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="directus"} - cnpg_pg_replication_is_wal_receiver_up{namespace="directus"}) < 2
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterHighConnectionsCritical
annotations:
summary: CNPG Instance maximum number of connections critical!
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 95
for: 5m
labels:
severity: critical
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterHighConnectionsWarning
annotations:
summary: CNPG Instance is approaching the maximum number of connections.
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 80
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterHighReplicationLag
annotations:
summary: CNPG Cluster high replication lag
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" is experiencing a high replication lag of
{{`{{`}} $value {{`}}`}}ms.
High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
expr: |
max(cnpg_pg_replication_lag{namespace="directus",pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) * 1000 > 1000
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterInstancesOnSameNode
annotations:
summary: CNPG Cluster instances are located on the same node.
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" has {{`{{`}} $value {{`}}`}}
instances on the same node {{`{{`}} $labels.node {{`}}`}}.
A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
expr: |
count by (node) (kube_pod_info{namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) > 1
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterLongRunningTransactionWarning
annotations:
summary: CNPG Cluster has a query running for longer than 5 minutes.
description: |-
CloudNativePG Cluster Pod {{ $labels.pod }}
is taking more than 5 minutes (300 seconds) for a query.
expr: |-
cnpg_backends_max_tx_duration_seconds{namespace="directus"} > 300
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterLowDiskSpaceCritical
annotations:
summary: CNPG Instance is running out of disk space!
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" is running extremely low on disk space. Check attached PVCs!
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.9 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.9 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}
) > 0.9
for: 5m
labels:
severity: critical
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterLowDiskSpaceWarning
annotations:
summary: CNPG Instance is running out of disk space.
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" is running low on disk space. Check attached PVCs.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.7 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.7 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="directus", persistentvolumeclaim=~"directus-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}
) > 0.7
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterOffline
annotations:
summary: CNPG Cluster has no running instances!
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" has no ready instances.
Having an offline cluster means your applications will not be able to access the database, leading to
potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
expr: |
(count(cnpg_collector_up{namespace="directus",pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"}) OR on() vector(0)) == 0
for: 5m
labels:
severity: critical
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterPGDatabaseXidAgeWarning
annotations:
summary: CNPG Cluster has a high number of transactions since the frozen XID.
description: |-
Over 300,000,000 transactions from frozen xid
on pod {{ $labels.pod }}
expr: |
cnpg_pg_database_xid_age{namespace="directus"} > 300000000
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterPGReplicationWarning
annotations:
summary: CNPG Cluster standby is lagging behind the primary.
description: |-
Standby is lagging behind by over 300 seconds (5 minutes)
expr: |
cnpg_pg_replication_lag{namespace="directus"} > 300
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterReplicaFailingReplicationWarning
annotations:
summary: CNPG Cluster has a replica that is failing to replicate.
description: |-
Replica {{ $labels.pod }}
is failing to replicate
expr: |
cnpg_pg_replication_in_recovery{namespace="directus"} > cnpg_pg_replication_is_wal_receiver_up{namespace="directus"}
for: 1m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
- alert: CNPGClusterZoneSpreadWarning
annotations:
summary: CNPG Cluster has instances in the same availability zone.
description: |-
CloudNativePG Cluster "directus/directus-postgresql-18-cluster" has instances in the same availability zone.
A disaster in one availability zone will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
expr: |
3 > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="directus", pod=~"directus-postgresql-18-cluster-([1-9][0-9]*)$"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
for: 5m
labels:
severity: warning
namespace: directus
cnpg_cluster: directus-postgresql-18-cluster
---
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
name: "directus-postgresql-18-scheduled-backup-live-backup"
namespace: directus
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/instance: directus
app.kubernetes.io/part-of: directus
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: "directus-postgresql-18-scheduled-backup-live-backup"
spec:
immediate: true
suspend: false
schedule: "0 15 14 * * *"
backupOwnerReference: self
cluster:
name: directus-postgresql-18-cluster
method: plugin
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
parameters:
barmanObjectName: "directus-postgresql-18-backup-garage-local"
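# Note: CNPG ScheduledBackup schedules use six cron fields (seconds first), so
# "0 15 14 * * *" fires daily at 14:15:00. A minimal sketch of an on-demand
# backup through the same plugin and object store (the Backup name is
# hypothetical):
#
#   apiVersion: postgresql.cnpg.io/v1
#   kind: Backup
#   metadata:
#     name: directus-postgresql-18-manual-backup
#     namespace: directus
#   spec:
#     cluster:
#       name: directus-postgresql-18-cluster
#     method: plugin
#     pluginConfiguration:
#       name: barman-cloud.cloudnative-pg.io
#       parameters:
#         barmanObjectName: directus-postgresql-18-backup-garage-local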
---
apiVersion: v1
kind: Service
metadata:
name: directus-valkey-headless
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: headless
spec:
type: ClusterIP
clusterIP: None
publishNotReadyAddresses: true
ports:
- name: tcp
port: 6379
targetPort: tcp
protocol: TCP
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
---
apiVersion: v1
kind: Service
metadata:
name: directus-valkey-read
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: read
spec:
type: ClusterIP
ports:
- name: tcp
port: 6379
targetPort: tcp
protocol: TCP
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
---
apiVersion: v1
kind: Service
metadata:
name: directus-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: primary
spec:
type: ClusterIP
ports:
- port: 6379
targetPort: tcp
protocol: TCP
name: tcp
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
statefulset.kubernetes.io/pod-name: directus-valkey-0
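# This primary Service pins writes to pod 0 via the stable pod-name label,
# matching the init script's convention that pod index 0 is the master. There
# is no automatic failover: if directus-valkey-0 is lost, writes stall until
# the StatefulSet recreates it.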
---
apiVersion: v1
kind: Service
metadata:
name: directus
labels:
app.kubernetes.io/instance: directus
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus
app.kubernetes.io/service: directus
helm.sh/chart: directus-4.6.2
namespace: directus
spec:
type: ClusterIP
ports:
- port: 80
targetPort: 8055
protocol: TCP
name: http
selector:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: directus
app.kubernetes.io/name: directus
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: directus-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
automountServiceAccountToken: false
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: directus
labels:
app.kubernetes.io/instance: directus
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: directus
helm.sh/chart: directus-4.6.2
namespace: directus
spec:
jobLabel: directus
namespaceSelector:
matchNames:
- directus
selector:
matchLabels:
app.kubernetes.io/instance: directus
app.kubernetes.io/name: directus
endpoints:
- bearerTokenSecret:
key: metric-token
name: directus-metric-token
interval: 30s
path: /metrics
port: http
scrapeTimeout: 15s
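# The ServiceMonitor authenticates with the same token that Directus loads into
# METRICS_TOKENS, so Prometheus can scrape /metrics while unauthenticated
# requests are rejected.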
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: directus-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
spec:
serviceName: directus-valkey-headless
replicas: 3
podManagementPolicy: OrderedReady
selector:
matchLabels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
volumeClaimTemplates:
- metadata:
name: valkey-data
spec:
accessModes:
- ReadWriteOnce
storageClassName: "ceph-block"
resources:
requests:
storage: "1Gi"
template:
metadata:
labels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: directus
annotations:
checksum/initconfig: "6307ecb287c2f05dc09ba3cf7cdfd155"
spec:
automountServiceAccountToken: false
serviceAccountName: directus-valkey
securityContext:
fsGroup: 1000
runAsGroup: 1000
runAsUser: 1000
initContainers:
- name: directus-valkey-init
image: docker.io/valkey/valkey:9.0.3
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
command: ["/scripts/init.sh"]
env:
- name: POD_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
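# apps.kubernetes.io/pod-index is set by the StatefulSet controller on
# Kubernetes 1.28+; the init script falls back to index 0 when the label
# (and therefore this env var) is empty.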
volumeMounts:
- name: valkey-data
mountPath: /data
- name: scripts
mountPath: /scripts
- name: valkey-acl
mountPath: /etc/valkey
- name: valkey-users-secret
mountPath: /valkey-users-secret
readOnly: true
containers:
- name: directus-valkey
image: docker.io/valkey/valkey:9.0.3
imagePullPolicy: IfNotPresent
command: ["valkey-server"]
args: ["/data/conf/valkey.conf"]
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
env:
- name: POD_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
- name: VALKEY_LOGLEVEL
value: "notice"
ports:
- name: tcp
containerPort: 6379
protocol: TCP
startupProbe:
exec:
command: ["sh", "-c", "valkey-cli ping"]
livenessProbe:
exec:
command: ["sh", "-c", "valkey-cli ping"]
resources:
requests:
cpu: 10m
memory: 128Mi
volumeMounts:
- name: valkey-data
mountPath: /data
- name: valkey-acl
mountPath: /etc/valkey
volumes:
- name: scripts
configMap:
name: directus-valkey-init-scripts
defaultMode: 0555
- name: valkey-acl
emptyDir:
medium: Memory
- name: valkey-users-secret
secret:
secretName: directus-valkey-config
defaultMode: 0400
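# A sketch for verifying replication after rollout, reading the default user's
# password from the directus-valkey-config secret:
#
#   PASSWORD=$(kubectl -n directus get secret directus-valkey-config \
#     -o jsonpath='{.data.password}' | base64 -d)
#   kubectl -n directus exec directus-valkey-0 -- \
#     valkey-cli --user default --pass "$PASSWORD" INFO replication
#
# With all three pods healthy, pod 0 should report role:master and
# connected_slaves:2.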