--- # Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: kube-prometheus-stack-kubernetes-apps namespace: kube-prometheus-stack labels: app: kube-prometheus-stack app.kubernetes.io/managed-by: Helm app.kubernetes.io/instance: kube-prometheus-stack app.kubernetes.io/version: "79.11.0" app.kubernetes.io/part-of: kube-prometheus-stack chart: kube-prometheus-stack-79.11.0 release: "kube-prometheus-stack" heritage: "Helm" spec: groups: - name: kubernetes-apps rules: - alert: KubePodCrashLooping annotations: description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{ $labels.cluster }}.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping summary: Pod is crash looping. expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 for: 15m labels: severity: warning - alert: KubePodNotReady annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready summary: Pod has been in a non-ready state for more than 15 minutes. expr: |- sum by (namespace, pod, cluster) ( max by (namespace, pod, cluster) ( kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"} ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) ) > 0 for: 15m labels: severity: warning - alert: KubeDeploymentGenerationMismatch annotations: description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch summary: Deployment generation mismatch due to possible roll-back expr: |- kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} != kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} for: 15m labels: severity: warning - alert: KubeDeploymentReplicasMismatch annotations: description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch summary: Deployment has not matched the expected number of replicas. expr: |- ( kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} > kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} ) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) == 0 ) for: 15m labels: severity: warning - alert: KubeDeploymentRolloutStuck annotations: description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck summary: Deployment rollout is not progressing. expr: |- kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} != 0 for: 15m labels: severity: warning - alert: KubeStatefulSetReplicasMismatch annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch summary: StatefulSet has not matched the expected number of replicas. expr: |- ( kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} != kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} ) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) == 0 ) for: 15m labels: severity: warning - alert: KubeStatefulSetGenerationMismatch annotations: description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch summary: StatefulSet generation mismatch due to possible roll-back expr: |- kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} != kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} for: 15m labels: severity: warning - alert: KubeStatefulSetUpdateNotRolledOut annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout summary: StatefulSet update has not been rolled out. expr: |- ( max by (namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} unless kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} ) * on (namespace, statefulset, job, cluster) ( kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} ) ) and on (namespace, statefulset, job, cluster) ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) == 0 ) for: 15m labels: severity: warning - alert: KubeDaemonSetRolloutStuck annotations: description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck summary: DaemonSet rollout is stuck. expr: |- ( ( kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} ) or ( kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} != 0 ) or ( kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} ) or ( kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} ) ) and ( changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) == 0 ) for: 15m labels: severity: warning - alert: KubeContainerWaiting annotations: description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}") on cluster {{ $labels.cluster }}.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"} > 0 for: 1h labels: severity: warning - alert: KubeDaemonSetNotScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled on cluster {{ $labels.cluster }}.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. expr: |- kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 for: 10m labels: severity: warning - alert: KubeDaemonSetMisScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run on cluster {{ $labels.cluster }}.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0 for: 15m labels: severity: warning - alert: KubeJobNotCompleted annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted summary: Job did not complete in time expr: |- time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} and kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 labels: severity: warning - alert: KubeJobFailed annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed summary: Job failed to complete. expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 for: 15m labels: severity: warning - alert: KubeHpaReplicasMismatch annotations: description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch summary: HPA has not matched desired number of replicas. expr: |- (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} != kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} > kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} < kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 for: 15m labels: severity: warning - alert: KubeHpaMaxedOut annotations: description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout summary: HPA is running at max replicas expr: |- kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} for: 15m labels: severity: warning - alert: KubePdbNotEnoughHealthyPods annotations: description: PDB {{ $labels.cluster }}/{{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods summary: PDB does not have enough healthy pods. expr: |- ( kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~".*"} - kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~".*"} ) > 0 for: 15m labels: severity: warning