Files
infrastructure/clusters/cl01tl/manifests/kube-prometheus-stack/PrometheusRule-kube-prometheus-stack-kubernetes-system-apiserver.yaml

81 lines
4.6 KiB
YAML

---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kube-prometheus-stack-kubernetes-system-apiserver
namespace: kube-prometheus-stack
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/version: "79.11.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-79.11.0
release: "kube-prometheus-stack"
heritage: "Helm"
spec:
groups:
- name: kubernetes-system-apiserver
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |-
histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
and
on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
for: 5m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |-
histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
and
on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
for: 5m
labels:
severity: critical
- alert: KubeAggregatedAPIErrors
annotations:
description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
expr: sum by (cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
for: 10m
labels:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m on cluster {{ $labels.cluster }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
- alert: KubeAPIDown
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="apiserver"})
for: 15m
labels:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests on cluster {{ $labels.cluster }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
expr: sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by (cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning