Files
infrastructure/clusters/cl01tl/manifests/immich/Cluster-immich-postgresql-18-cluster.yaml

1457 lines
47 KiB
YAML

apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: immich-postgresql-18-cluster
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
spec:
instances: 3
imageName: "ghcr.io/tensorchord/cloudnative-vectorchord:18.0-0.5.3"
imagePullPolicy: IfNotPresent
postgresUID: 26
postgresGID: 26
storage:
size: 10Gi
storageClass: local-path
walStorage:
size: 2Gi
storageClass: local-path
resources:
limits:
hugepages-2Mi: 256Mi
requests:
cpu: 100m
memory: 256Mi
affinity:
enablePodAntiAffinity: true
topologyKey: kubernetes.io/hostname
primaryUpdateMethod: switchover
primaryUpdateStrategy: unsupervised
logLevel: info
enableSuperuserAccess: false
enablePDB: true
postgresql:
shared_preload_libraries:
- vchord.so
parameters:
hot_standby_feedback: "on"
max_slot_wal_keep_size: 2000MB
shared_buffers: 256MB
monitoring:
enablePodMonitor: true
disableDefaultQueries: false
plugins:
- name: barman-cloud.cloudnative-pg.io
enabled: true
isWALArchiver: true
parameters:
barmanObjectName: "immich-postgresql-18-backup-garage-local"
serverName: "immich-postgresql-18-backup-1"
bootstrap:
recovery:
database: app
source: immich-postgresql-18-backup-1
externalClusters:
- name: immich-postgresql-18-backup-1
plugin:
name: barman-cloud.cloudnative-pg.io
enabled: true
isWALArchiver: false
parameters:
barmanObjectName: "immich-postgresql-18-recovery"
serverName: immich-postgresql-18-backup-1
---
apiVersion: v1
kind: ConfigMap
metadata:
name: immich-valkey-init-scripts
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
data:
init.sh: |-
#!/bin/sh
set -eu
# Default config paths
VALKEY_CONFIG=${VALKEY_CONFIG_PATH:-/data/conf/valkey.conf}
LOGFILE="/data/init.log"
DATA_DIR="/data/conf"
# Logging function (outputs to stderr and file)
log() {
echo "$(date) $1" | tee -a "$LOGFILE" >&2
}
# Clean old log if requested
if [ "${KEEP_OLD_LOGS:-false}" != "true" ]; then
rm -f "$LOGFILE"
fi
if [ -f "$LOGFILE" ]; then
log "Detected restart of this instance ($HOSTNAME)"
fi
log "Creating configuration in $DATA_DIR..."
mkdir -p "$DATA_DIR"
rm -f "$VALKEY_CONFIG"
# Base valkey.conf
log "Generating base valkey.conf"
{
echo "port 6379"
echo "protected-mode no"
echo "bind * -::*"
echo "dir /data"
} >>"$VALKEY_CONFIG"
# Replica mode configuration
log "Configuring replication mode"
# Use POD_INDEX from Kubernetes metadata
POD_INDEX=${POD_INDEX:-0}
IS_MASTER=false
# Check if this is pod-0 (master)
if [ "$POD_INDEX" = "0" ]; then
IS_MASTER=true
log "This pod (index $POD_INDEX) is configured as MASTER"
else
log "This pod (index $POD_INDEX) is configured as REPLICA"
fi
# Configure replica settings
if [ "$IS_MASTER" = "false" ]; then
MASTER_HOST="immich-valkey-0.immich-valkey-headless.immich.svc.cluster.local"
MASTER_PORT="6379"
log "Configuring replica to follow master at $MASTER_HOST:$MASTER_PORT"
{
echo ""
echo "# Replica Configuration"
echo "replicaof $MASTER_HOST $MASTER_PORT"
echo "replica-announce-ip immich-valkey-$POD_INDEX.immich-valkey-headless.immich.svc.cluster.local"
} >>"$VALKEY_CONFIG"
fi
# Append extra configs if present
if [ -f /usr/local/etc/valkey/valkey.conf ]; then
log "Appending /usr/local/etc/valkey/valkey.conf"
cat /usr/local/etc/valkey/valkey.conf >>"$VALKEY_CONFIG"
fi
if [ -d /extravalkeyconfigs ]; then
log "Appending files in /extravalkeyconfigs/"
cat /extravalkeyconfigs/* >>"$VALKEY_CONFIG"
fi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: immich
labels:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: immich
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich
helm.sh/chart: immich-4.6.2
namespace: immich
spec:
revisionHistoryLimit: 3
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app.kubernetes.io/controller: main
app.kubernetes.io/name: immich
app.kubernetes.io/instance: immich
template:
metadata:
labels:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: immich
app.kubernetes.io/name: immich
spec:
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
hostIPC: false
hostNetwork: false
hostPID: false
dnsPolicy: ClusterFirst
containers:
- env:
- name: TZ
value: US/Central
- name: IMMICH_TELEMETRY_INCLUDE
value: all
- name: IMMICH_CONFIG_FILE
value: /config/immich.json
- name: REDIS_HOSTNAME
value: immich-valkey
- name: DB_VECTOR_EXTENSION
value: vectorchord
- name: DB_HOSTNAME
valueFrom:
secretKeyRef:
key: host
name: immich-postgresql-18-cluster-app
- name: DB_DATABASE_NAME
valueFrom:
secretKeyRef:
key: dbname
name: immich-postgresql-18-cluster-app
- name: DB_PORT
valueFrom:
secretKeyRef:
key: port
name: immich-postgresql-18-cluster-app
- name: DB_USERNAME
valueFrom:
secretKeyRef:
key: user
name: immich-postgresql-18-cluster-app
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
key: password
name: immich-postgresql-18-cluster-app
image: ghcr.io/immich-app/immich-server:v2.5.6
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /api/server/ping
port: 2283
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 1
name: main
readinessProbe:
failureThreshold: 3
httpGet:
path: /api/server/ping
port: 2283
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 1
resources:
limits:
gpu.intel.com/i915: 1
requests:
cpu: 10m
gpu.intel.com/i915: 1
memory: 512Mi
startupProbe:
failureThreshold: 30
httpGet:
path: /api/server/ping
port: 2283
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 1
volumeMounts:
- mountPath: /config/immich.json
mountPropagation: None
name: config
readOnly: true
subPath: immich.json
- mountPath: /usr/src/app/upload
name: data
volumes:
- name: config
secret:
secretName: immich-config-secret
- name: data
persistentVolumeClaim:
claimName: immich
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-backup-secret-external
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup-secret-external
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
target:
template:
mergePolicy: Merge
engineVersion: v2
data:
RESTIC_REPOSITORY: "{{ .BUCKET_ENDPOINT }}/immich/immich"
data:
- secretKey: BUCKET_ENDPOINT
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/digital-ocean
metadataPolicy: None
property: BUCKET_ENDPOINT
- secretKey: RESTIC_PASSWORD
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/digital-ocean
metadataPolicy: None
property: RESTIC_PASSWORD
- secretKey: AWS_DEFAULT_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /digital-ocean/home-infra/volsync-backups
metadataPolicy: None
property: AWS_DEFAULT_REGION
- secretKey: AWS_ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /digital-ocean/home-infra/volsync-backups
metadataPolicy: None
property: AWS_ACCESS_KEY_ID
- secretKey: AWS_SECRET_ACCESS_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /digital-ocean/home-infra/volsync-backups
metadataPolicy: None
property: AWS_SECRET_ACCESS_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-backup-secret-local
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup-secret-local
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
target:
template:
mergePolicy: Merge
engineVersion: v2
data:
RESTIC_REPOSITORY: "{{ .BUCKET_ENDPOINT }}/immich/immich"
data:
- secretKey: BUCKET_ENDPOINT
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/garage-local
metadataPolicy: None
property: BUCKET_ENDPOINT
- secretKey: RESTIC_PASSWORD
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/garage-local
metadataPolicy: None
property: RESTIC_PASSWORD
- secretKey: AWS_DEFAULT_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: AWS_ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: AWS_SECRET_ACCESS_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-backup-secret-remote
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup-secret-remote
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
target:
template:
mergePolicy: Merge
engineVersion: v2
data:
RESTIC_REPOSITORY: "{{ .BUCKET_ENDPOINT }}/immich/immich"
data:
- secretKey: BUCKET_ENDPOINT
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/garage-remote
metadataPolicy: None
property: BUCKET_ENDPOINT
- secretKey: RESTIC_PASSWORD
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /volsync/restic/garage-remote
metadataPolicy: None
property: RESTIC_PASSWORD
- secretKey: AWS_DEFAULT_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: AWS_ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: AWS_SECRET_ACCESS_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/volsync-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-config-secret
namespace: immich
labels:
app.kubernetes.io/name: immich-config-secret
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: immich.json
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /cl01tl/immich/config
metadataPolicy: None
property: immich.json
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-postgresql-18-backup-garage-local-secret
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-postgresql-18-backup-garage-local-secret
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: ACCESS_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: ACCESS_SECRET_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: immich-postgresql-18-recovery-secret
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-postgresql-18-recovery-secret
spec:
secretStoreRef:
kind: ClusterSecretStore
name: vault
data:
- secretKey: ACCESS_REGION
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_REGION
- secretKey: ACCESS_KEY_ID
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_KEY_ID
- secretKey: ACCESS_SECRET_KEY
remoteRef:
conversionStrategy: Default
decodingStrategy: None
key: /garage/home-infra/postgres-backups
metadataPolicy: None
property: ACCESS_SECRET_KEY
---
apiVersion: gateway.networking.k8s.io/v1alpha2
kind: HTTPRoute
metadata:
name: immich
labels:
app.kubernetes.io/instance: immich
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich
helm.sh/chart: immich-4.6.2
namespace: immich
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: traefik-gateway
namespace: traefik
hostnames:
- "immich.alexlebens.net"
rules:
- backendRefs:
- group: ""
kind: Service
name: immich
namespace: immich
port: 2283
weight: 100
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: barmancloud.cnpg.io/v1
kind: ObjectStore
metadata:
name: immich-postgresql-18-backup-garage-local
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-postgresql-18-backup-garage-local
spec:
retentionPolicy: 7d
instanceSidecarConfiguration:
env:
- name: AWS_REQUEST_CHECKSUM_CALCULATION
value: when_required
- name: AWS_RESPONSE_CHECKSUM_VALIDATION
value: when_required
configuration:
destinationPath: s3://postgres-backups/cl01tl/immich/immich-postgresql-18-cluster
endpointURL: http://garage-main.garage:3900
s3Credentials:
accessKeyId:
name: immich-postgresql-18-backup-garage-local-secret
key: ACCESS_KEY_ID
secretAccessKey:
name: immich-postgresql-18-backup-garage-local-secret
key: ACCESS_SECRET_KEY
region:
name: immich-postgresql-18-backup-garage-local-secret
key: ACCESS_REGION
---
apiVersion: barmancloud.cnpg.io/v1
kind: ObjectStore
metadata:
name: "immich-postgresql-18-recovery"
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: "immich-postgresql-18-recovery"
spec:
configuration:
destinationPath: s3://postgres-backups/cl01tl/immich/immich-postgresql-18-cluster
endpointURL: http://garage-main.garage:3900
wal:
compression: snappy
maxParallel: 1
data:
compression: snappy
jobs: 1
s3Credentials:
accessKeyId:
name: immich-postgresql-18-recovery-secret
key: ACCESS_KEY_ID
secretAccessKey:
name: immich-postgresql-18-recovery-secret
key: ACCESS_SECRET_KEY
region:
name: immich-postgresql-18-recovery-secret
key: ACCESS_REGION
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: immich
labels:
app.kubernetes.io/instance: immich
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich
helm.sh/chart: immich-4.6.2
annotations:
helm.sh/resource-policy: keep
namespace: immich
spec:
accessModes:
- "ReadWriteOnce"
resources:
requests:
storage: "50Gi"
storageClassName: "ceph-block"
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/part-of: valkey
app.kubernetes.io/component: podmonitor
spec:
podMetricsEndpoints:
- port: metrics
interval: 30s
namespaceSelector:
matchNames:
- immich
selector:
matchLabels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: immich-postgresql-18-alert-rules
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
spec:
groups:
- name: cloudnative-pg/immich-postgresql-18
rules:
- alert: CNPGClusterBackendsWaitingWarning
annotations:
summary: CNPG Cluster a backend is waiting for longer than 5 minutes.
description: |-
Pod {{ $labels.pod }}
has been waiting for longer than 5 minutes
expr: |
cnpg_backends_waiting_total{namespace="immich"} > 300
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterDatabaseDeadlockConflictsWarning
annotations:
summary: CNPG Cluster has over 10 deadlock conflicts.
description: |-
There are over 10 deadlock conflicts in
{{ $labels.pod }}
expr: |
cnpg_pg_stat_database_deadlocks{namespace="immich"} > 10
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterHACritical
annotations:
summary: CNPG Cluster has no standby replicas!
description: |-
CloudNativePG Cluster "{{`{{`}} $labels.job {{`}}`}}" has no ready standby replicas. Your cluster at a severe
risk of data loss and downtime if the primary instance fails.
The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint
will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main.
This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less
instances. The replaced instance may need some time to catch-up with the cluster primary instance.
This alarm will be always trigger if your cluster is configured to run with only 1 instance. In this
case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="immich"} - cnpg_pg_replication_is_wal_receiver_up{namespace="immich"}) < 1
for: 5m
labels:
severity: critical
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterHAWarning
annotations:
summary: CNPG Cluster less than 2 standby replicas.
description: |-
CloudNativePG Cluster "{{`{{`}} $labels.job {{`}}`}}" has only {{`{{`}} $value {{`}}`}} standby replicas, putting
your cluster at risk if another instance fails. The cluster is still able to operate normally, although
the `-ro` and `-r` endpoints operate at reduced capacity.
This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may
need some time to catch-up with the cluster primary instance.
This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances.
In this case you may want to silence it.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
expr: |
max by (job) (cnpg_pg_replication_streaming_replicas{namespace="immich"} - cnpg_pg_replication_is_wal_receiver_up{namespace="immich"}) < 2
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterHighConnectionsCritical
annotations:
summary: CNPG Instance maximum number of connections critical!
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 95
for: 5m
labels:
severity: critical
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterHighConnectionsWarning
annotations:
summary: CNPG Instance is approaching the maximum number of connections.
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" instance {{`{{`}} $labels.pod {{`}}`}} is using {{`{{`}} $value {{`}}`}}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
expr: |
sum by (pod) (cnpg_backends_total{namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) * 100 > 80
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterHighReplicationLag
annotations:
summary: CNPG Cluster high replication lag
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" is experiencing a high replication lag of
{{`{{`}} $value {{`}}`}}ms.
High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
expr: |
max(cnpg_pg_replication_lag{namespace="immich",pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) * 1000 > 1000
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterInstancesOnSameNode
annotations:
summary: CNPG Cluster instances are located on the same node.
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" has {{`{{`}} $value {{`}}`}}
instances on the same node {{`{{`}} $labels.node {{`}}`}}.
A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
expr: |
count by (node) (kube_pod_info{namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) > 1
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterLongRunningTransactionWarning
annotations:
summary: CNPG Cluster query is taking longer than 5 minutes.
description: |-
CloudNativePG Cluster Pod {{ $labels.pod }}
is taking more than 5 minutes (300 seconds) for a query.
expr: |-
cnpg_backends_max_tx_duration_seconds{namespace="immich"} > 300
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterLowDiskSpaceCritical
annotations:
summary: CNPG Instance is running out of disk space!
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" is running extremely low on disk space. Check attached PVCs!
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.9 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.9 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}
) > 0.9
for: 5m
labels:
severity: critical
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterLowDiskSpaceWarning
annotations:
summary: CNPG Instance is running out of disk space.
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" is running low on disk space. Check attached PVCs.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"} / kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"})) > 0.7 OR
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-wal"} / kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-wal"})) > 0.7 OR
max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
/
sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="immich", persistentvolumeclaim=~"immich-postgresql-18-cluster-([1-9][0-9]*)$-tbs.*"})
*
on(namespace, persistentvolumeclaim) group_left(volume)
kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}
) > 0.7
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterOffline
annotations:
summary: CNPG Cluster has no running instances!
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" has no ready instances.
Having an offline cluster means your applications will not be able to access the database, leading to
potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
expr: |
(count(cnpg_collector_up{namespace="immich",pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"}) OR on() vector(0)) == 0
for: 5m
labels:
severity: critical
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterPGDatabaseXidAgeWarning
annotations:
summary: CNPG Cluster has a number of transactions from the frozen XID to the current one.
description: |-
Over 300,000,000 transactions from frozen xid
on pod {{ $labels.pod }}
expr: |
cnpg_pg_database_xid_age{namespace="immich"} > 300000000
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterPGReplicationWarning
annotations:
summary: CNPG Cluster standby is lagging behind the primary.
description: |-
Standby is lagging behind by over 300 seconds (5 minutes)
expr: |
cnpg_pg_replication_lag{namespace="immich"} > 300
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterReplicaFailingReplicationWarning
annotations:
summary: CNPG Cluster has a replica is failing to replicate.
description: |-
Replica {{ $labels.pod }}
is failing to replicate
expr: |
cnpg_pg_replication_in_recovery{namespace="immich"} > cnpg_pg_replication_is_wal_receiver_up{namespace="immich"}
for: 1m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
- alert: CNPGClusterZoneSpreadWarning
annotations:
summary: CNPG Cluster instances in the same zone.
description: |-
CloudNativePG Cluster "immich/immich-postgresql-18-cluster" has instances in the same availability zone.
A disaster in one availability zone will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
expr: |
3 > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="immich", pod=~"immich-postgresql-18-cluster-([1-9][0-9]*)$"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
for: 5m
labels:
severity: warning
namespace: immich
cnpg_cluster: immich-postgresql-18-cluster
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/part-of: valkey
spec:
groups:
- name: immich-valkey
rules:
- alert: ValkeyDown
annotations:
description: Valkey instance {{ $labels.instance }} is down.
summary: Valkey instance {{ $labels.instance }} down
expr: |
redis_up{service="immich-valkey-metrics"} == 0
for: 2m
labels:
severity: error
- alert: ValkeyMemoryHigh
annotations:
description: |
Valkey instance {{ $labels.instance }} is using {{ $value }}% of its available memory.
summary: Valkey instance {{ $labels.instance }} is using too much memory
expr: |
redis_memory_used_bytes{service="immich-valkey-metrics"} * 100
/
redis_memory_max_bytes{service="immich-valkey-metrics"}
> 90 <= 100
for: 2m
labels:
severity: error
- alert: ValkeyKeyEviction
annotations:
description: |
Valkey instance {{ $labels.instance }} has evicted {{ $value }} keys in the last 5 minutes.
summary: Valkey instance {{ $labels.instance }} has evicted keys
expr: |
increase(redis_evicted_keys_total{service="immich-valkey-metrics"}[5m]) > 0
for: 1s
labels:
severity: error
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
name: immich-backup-source-external
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup
spec:
sourcePVC: immich
trigger:
schedule: 24 10 * * *
restic:
pruneIntervalDays: 7
repository: immich-backup-secret-external
retain:
daily: 7
hourly: 0
monthly: 3
weekly: 4
yearly: 1
copyMethod: Snapshot
storageClassName: ceph-block
volumeSnapshotClassName: ceph-blockpool-snapshot
cacheCapacity: 10Gi
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
name: immich-backup-source-local
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup
spec:
sourcePVC: immich
trigger:
schedule: 24 8 * * *
restic:
pruneIntervalDays: 7
repository: immich-backup-secret-local
retain:
daily: 7
hourly: 0
monthly: 3
weekly: 4
yearly: 1
copyMethod: Snapshot
storageClassName: ceph-block
volumeSnapshotClassName: ceph-blockpool-snapshot
cacheCapacity: 10Gi
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
name: immich-backup-source-remote
namespace: immich
labels:
helm.sh/chart: volsync-target-data-0.8.0
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "0.8.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich-backup
spec:
sourcePVC: immich
trigger:
schedule: 24 9 * * *
restic:
pruneIntervalDays: 7
repository: immich-backup-secret-remote
retain:
daily: 7
hourly: 0
monthly: 3
weekly: 4
yearly: 1
copyMethod: Snapshot
storageClassName: ceph-block
volumeSnapshotClassName: ceph-blockpool-snapshot
cacheCapacity: 10Gi
---
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
name: "immich-postgresql-18-scheduled-backup-live-backup"
namespace: immich
labels:
helm.sh/chart: postgres-18-cluster-7.9.1
app.kubernetes.io/name: immich-postgresql-18
app.kubernetes.io/instance: immich
app.kubernetes.io/part-of: immich
app.kubernetes.io/version: "7.9.1"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: "immich-postgresql-18-scheduled-backup-live-backup"
spec:
immediate: true
suspend: false
schedule: "0 40 14 * * *"
backupOwnerReference: self
cluster:
name: immich-postgresql-18-cluster
method: plugin
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
parameters:
barmanObjectName: "immich-postgresql-18-backup-garage-local"
---
apiVersion: v1
kind: Service
metadata:
name: immich-valkey-headless
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: headless
spec:
type: ClusterIP
clusterIP: None
publishNotReadyAddresses: true
ports:
- name: tcp
port: 6379
targetPort: tcp
protocol: TCP
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
---
apiVersion: v1
kind: Service
metadata:
name: immich-valkey-metrics
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: valkey
annotations:
spec:
type: ClusterIP
ports:
- name: metrics
port: 9121
protocol: TCP
targetPort: metrics
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
---
apiVersion: v1
kind: Service
metadata:
name: immich-valkey-read
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: read
spec:
type: ClusterIP
ports:
- name: tcp
port: 6379
targetPort: tcp
protocol: TCP
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
---
apiVersion: v1
kind: Service
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: primary
spec:
type: ClusterIP
ports:
- port: 6379
targetPort: tcp
protocol: TCP
name: tcp
selector:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
statefulset.kubernetes.io/pod-name: immich-valkey-0
---
apiVersion: v1
kind: Service
metadata:
name: immich
labels:
app.kubernetes.io/instance: immich
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich
app.kubernetes.io/service: immich
helm.sh/chart: immich-4.6.2
namespace: immich
spec:
type: ClusterIP
ports:
- port: 2283
targetPort: 2283
protocol: TCP
name: http
- port: 8081
targetPort: 8081
protocol: TCP
name: metrics-api
- port: 8082
targetPort: 8082
protocol: TCP
name: metrics-ms
selector:
app.kubernetes.io/controller: main
app.kubernetes.io/instance: immich
app.kubernetes.io/name: immich
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
automountServiceAccountToken: false
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/part-of: valkey
app.kubernetes.io/component: service-monitor
spec:
endpoints:
- port: metrics
interval: 30s
namespaceSelector:
matchNames:
- immich
selector:
matchLabels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/component: metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: immich
labels:
app.kubernetes.io/instance: immich
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: immich
helm.sh/chart: immich-4.6.2
namespace: immich
spec:
jobLabel: immich
namespaceSelector:
matchNames:
- immich
selector:
matchLabels:
app.kubernetes.io/instance: immich
app.kubernetes.io/name: immich
endpoints:
- interval: 3m
path: /metrics
port: metrics-api
scrapeTimeout: 1m
- interval: 3m
path: /metrics
port: metrics-ms
scrapeTimeout: 1m
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: immich-valkey
labels:
helm.sh/chart: valkey-0.9.3
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
app.kubernetes.io/version: "9.0.3"
app.kubernetes.io/managed-by: Helm
spec:
serviceName: immich-valkey-headless
replicas: 3
podManagementPolicy: OrderedReady
selector:
matchLabels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
volumeClaimTemplates:
- metadata:
name: valkey-data
spec:
accessModes:
- ReadWriteOnce
storageClassName: "ceph-block"
resources:
requests:
storage: "1Gi"
template:
metadata:
labels:
app.kubernetes.io/name: valkey
app.kubernetes.io/instance: immich
annotations:
checksum/initconfig: "56fd0449d1eea259ddd955cd82dc8344"
spec:
automountServiceAccountToken: false
serviceAccountName: immich-valkey
securityContext:
fsGroup: 1000
runAsGroup: 1000
runAsUser: 1000
initContainers:
- name: immich-valkey-init
image: docker.io/valkey/valkey:9.0.3
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
command: ["/scripts/init.sh"]
env:
- name: POD_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
volumeMounts:
- name: valkey-data
mountPath: /data
- name: scripts
mountPath: /scripts
containers:
- name: immich-valkey
image: docker.io/valkey/valkey:9.0.3
imagePullPolicy: IfNotPresent
command: ["valkey-server"]
args: ["/data/conf/valkey.conf"]
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
env:
- name: POD_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
- name: VALKEY_LOGLEVEL
value: "notice"
ports:
- name: tcp
containerPort: 6379
protocol: TCP
startupProbe:
exec:
command: ["sh", "-c", "valkey-cli ping"]
livenessProbe:
exec:
command: ["sh", "-c", "valkey-cli ping"]
resources:
requests:
cpu: 10m
memory: 128Mi
volumeMounts:
- name: valkey-data
mountPath: /data
- name: metrics
image: ghcr.io/oliver006/redis_exporter:v1.82.0
imagePullPolicy: "IfNotPresent"
ports:
- name: metrics
containerPort: 9121
startupProbe:
tcpSocket:
port: metrics
livenessProbe:
tcpSocket:
port: metrics
readinessProbe:
httpGet:
path: /
port: metrics
resources:
requests:
cpu: 10m
memory: 64M
env:
- name: REDIS_ALIAS
value: immich-valkey
volumes:
- name: scripts
configMap:
name: immich-valkey-init-scripts
defaultMode: 0555