---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml
#
# PrometheusRule defining the "kubernetes-resources" alert group: cluster-level
# CPU/memory overcommit, namespace ResourceQuota usage, and container CPU
# throttling alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-resources
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.11.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.11.0
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: kubernetes-resources
    rules:
    # Warns when summed pod CPU requests exceed what the cluster can allocate.
    # First branch: clusters with fewer than 3 control-plane nodes, compared
    # against total allocatable CPU. Second branch: requests compared against
    # allocatable CPU minus the largest single node, skipping clusters where
    # that remainder is not positive.
    - alert: KubeCPUOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node failure.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        # Non-HA clusters.
        (
          (
            sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
            -
            sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) > 0
          )
          and
          count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
        )
        or
        # HA clusters.
        (
          sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
          -
          (
            # Skip clusters with only one allocatable node.
            (
              sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
              -
              max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
            ) > 0
          ) > 0
        )
      for: 10m
      labels:
        severity: warning
    # Memory counterpart of KubeCPUOvercommit: same two-branch structure, using
    # the memory request recording rule and resource="memory" allocatable.
    - alert: KubeMemoryOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        # Non-HA clusters.
        (
          (
            sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
            -
            sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"}) > 0
          )
          and
          count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
        )
        or
        # HA clusters.
        (
          sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
          -
          (
            # Skip clusters with only one allocatable node.
            (
              sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
              -
              max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
            ) > 0
          ) > 0
        )
      for: 10m
      labels:
        severity: warning
    # Warns when the per-cluster sum of hard CPU quotas is more than 1.5x the
    # cluster's allocatable CPU. `min without(resource)` collapses the "cpu" and
    # "requests.cpu" series of a quota into a single (smallest) value.
    - alert: KubeCPUQuotaOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        sum by (cluster) (
          min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})
        )
        /
        sum by (cluster) (
          kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}
        ) > 1.5
      for: 5m
      labels:
        severity: warning
    # Memory counterpart of KubeCPUQuotaOvercommit: hard memory quotas summed per
    # cluster, divided by allocatable memory, threshold 1.5.
    - alert: KubeMemoryQuotaOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        sum by (cluster) (
          min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})
        )
        /
        sum by (cluster) (
          kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}
        ) > 1.5
      for: 5m
      labels:
        severity: warning
    # Informational: used/hard quota ratio is above 0.9 and below 1. Quotas whose
    # hard limit is not greater than 0 are excluded from the denominator.
    - alert: KubeQuotaAlmostFull
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
        summary: Namespace quota is going to be full.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        )
        > 0.9 < 1
      for: 15m
      labels:
        severity: info
    # Informational: used/hard quota ratio is exactly 1 (same ratio expression as
    # KubeQuotaAlmostFull, different comparison).
    - alert: KubeQuotaFullyUsed
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
        summary: Namespace quota is fully used.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        )
        == 1
      for: 15m
      labels:
        severity: info
    # Warning: used/hard quota ratio is greater than 1 (same ratio expression as
    # the two quota alerts above).
    - alert: KubeQuotaExceeded
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
        summary: Namespace quota has exceeded the limits.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        )
        > 1
      for: 15m
      labels:
        severity: warning
    # Informational: per container, the 5m increase of throttled CFS periods
    # divided by the 5m increase of total CFS periods exceeds 25/100.
    # `topk(1, ...)` keeps a single series per
    # (cluster, namespace, pod, container, instance) before the division.
    - alert: CPUThrottlingHigh
      annotations:
        description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }} on cluster {{ $labels.cluster }}.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
        summary: Processes experience elevated CPU throttling.
      expr: |-
        sum without (id, metrics_path, name, image, endpoint, job, node) (
          topk by (cluster, namespace, pod, container, instance) (1,
            increase(
              container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }
            [5m])
          )
        )
        / on (cluster, namespace, pod, container, instance) group_left
        sum without (id, metrics_path, name, image, endpoint, job, node) (
          topk by (cluster, namespace, pod, container, instance) (1,
            increase(
              container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }
            [5m])
          )
        )
        > ( 25 / 100 )
      for: 15m
      labels:
        severity: info