This PR contains newly rendered Kubernetes manifests automatically generated by the CI workflow. Reviewed-on: #2174 Co-authored-by: gitea-bot <gitea-bot@alexlebens.net> Co-committed-by: gitea-bot <gitea-bot@alexlebens.net>
5729 lines
236 KiB
YAML
5729 lines
236 KiB
YAML
---
# Source: kube-prometheus-stack/templates/namespace.yaml
# Namespace that holds the monitoring stack. All three Pod Security Admission
# modes (audit, enforce, warn) are set to the "privileged" level.
apiVersion: v1
kind: Namespace
metadata:
  name: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: kube-prometheus-stack
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/enforce: privileged
    pod-security.kubernetes.io/warn: privileged
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml
# ServiceAccount for kube-state-metrics. Token automounting is enabled.
apiVersion: v1
kind: ServiceAccount
automountServiceAccountToken: true
metadata:
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
  name: kube-prometheus-stack-kube-state-metrics
  namespace: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/serviceaccount.yaml
# ServiceAccount for node-exporter. Token automounting is disabled.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-prometheus-stack-prometheus-node-exporter
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: prometheus-node-exporter-4.49.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "1.10.2"
    release: kube-prometheus-stack
automountServiceAccountToken: false
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/alertmanager/serviceaccount.yaml
# ServiceAccount for Alertmanager. Token automounting is enabled.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-prometheus-stack-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/name: kube-prometheus-stack-alertmanager
    app.kubernetes.io/component: alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
automountServiceAccountToken: true
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/serviceaccount.yaml
# ServiceAccount for the Prometheus operator. Token automounting is enabled.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-prometheus-stack-operator
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
automountServiceAccountToken: true
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/serviceaccount.yaml
# ServiceAccount for Prometheus. Token automounting is enabled.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-prometheus-stack-prometheus
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/name: kube-prometheus-stack-prometheus
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
automountServiceAccountToken: true
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/alertmanager/secret.yaml
# Secret carrying the Alertmanager configuration file.
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-kube-prometheus-stack-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
data:
  # Base64-encoded payload (values under `data` are base64 by Kubernetes
  # convention); it decodes to a YAML document that begins with `global:`.
  # Do not reformat or re-wrap: the string must stay byte-identical.
  alertmanager.yaml: "Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KaW5oaWJpdF9ydWxlczoKLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIC0gYWxlcnRuYW1lCiAgc291cmNlX21hdGNoZXJzOgogIC0gc2V2ZXJpdHkgPSBjcml0aWNhbAogIHRhcmdldF9tYXRjaGVyczoKICAtIHNldmVyaXR5ID1+IHdhcm5pbmd8aW5mbwotIGVxdWFsOgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBzb3VyY2VfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IHdhcm5pbmcKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIHNvdXJjZV9tYXRjaGVyczoKICAtIGFsZXJ0bmFtZSA9IEluZm9JbmhpYml0b3IKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBhbGVydG5hbWUgPSBJbmZvSW5oaWJpdG9yCnJlY2VpdmVyczoKLSBuYW1lOiBwdXNob3ZlcgogIHB1c2hvdmVyX2NvbmZpZ3M6CiAgLSBzZW5kX3Jlc29sdmVkOiB0cnVlCiAgICB0b2tlbl9maWxlOiAvZXRjL2FsZXJ0bWFuYWdlci9zZWNyZXRzL2FsZXJ0bWFuYWdlci1jb25maWctc2VjcmV0L3B1c2hvdmVyX3Rva2VuCiAgICB1c2VyX2tleV9maWxlOiAvZXRjL2FsZXJ0bWFuYWdlci9zZWNyZXRzL2FsZXJ0bWFuYWdlci1jb25maWctc2VjcmV0L3B1c2hvdmVyX3VzZXJfa2V5Ci0gbmFtZTogbnRmeQogIHdlYmhvb2tfY29uZmlnczoKICAtIGh0dHBfY29uZmlnOgogICAgICBiYXNpY19hdXRoOgogICAgICAgIHBhc3N3b3JkX2ZpbGU6IC9ldGMvYWxlcnRtYW5hZ2VyL3NlY3JldHMvYWxlcnRtYW5hZ2VyLWNvbmZpZy1zZWNyZXQvbnRmeV9wYXNzd29yZAogICAgICAgIHVzZXJuYW1lOiBudGZ5LWFsZXJ0bWFuYWdlcgogICAgdXJsOiBodHRwOi8vbnRmeS1hbGVydG1hbmFnZXIua3ViZS1wcm9tZXRoZXVzLXN0YWNrOjgwCnJvdXRlOgogIGdyb3VwX2J5OgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBncm91cF9pbnRlcnZhbDogNW0KICBncm91cF93YWl0OiAzMHMKICByZWNlaXZlcjogbnRmeQogIHJlcGVhdF9pbnRlcnZhbDogMjRoCiAgcm91dGVzOgogIC0gZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgICBncm91cF93YWl0OiAxMHMKICAgIHJlY2VpdmVyOiBudGZ5CiAgICByZXBlYXRfaW50ZXJ2YWw6IDI0aAp0ZW1wbGF0ZXM6Ci0gL2V0Yy9hbGVydG1hbmFnZXIvY29uZmlnLyoudG1wbA=="
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml
# Cluster-wide read access for kube-state-metrics: every rule grants only
# "list" and "watch" on a single resource type.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
  name: kube-prometheus-stack-kube-state-metrics
rules:
  - apiGroups: ["certificates.k8s.io"]
    resources:
      - certificatesigningrequests
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["list", "watch"]

  - apiGroups: ["batch"]
    resources:
      - cronjobs
    verbs: ["list", "watch"]

  - apiGroups: ["apps"]
    resources:
      - daemonsets
    verbs: ["list", "watch"]

  - apiGroups: ["apps"]
    resources:
      - deployments
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - endpoints
    verbs: ["list", "watch"]

  - apiGroups: ["autoscaling"]
    resources:
      - horizontalpodautoscalers
    verbs: ["list", "watch"]

  - apiGroups: ["networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["list", "watch"]

  - apiGroups: ["batch"]
    resources:
      - jobs
    verbs: ["list", "watch"]

  - apiGroups: ["coordination.k8s.io"]
    resources:
      - leases
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - limitranges
    verbs: ["list", "watch"]

  - apiGroups: ["admissionregistration.k8s.io"]
    resources:
      - mutatingwebhookconfigurations
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - namespaces
    verbs: ["list", "watch"]

  - apiGroups: ["networking.k8s.io"]
    resources:
      - networkpolicies
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - nodes
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - persistentvolumeclaims
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - persistentvolumes
    verbs: ["list", "watch"]

  - apiGroups: ["policy"]
    resources:
      - poddisruptionbudgets
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - pods
    verbs: ["list", "watch"]

  - apiGroups: ["apps"]
    resources:
      - replicasets
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - replicationcontrollers
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - resourcequotas
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - secrets
    verbs: ["list", "watch"]

  - apiGroups: [""]
    resources:
      - services
    verbs: ["list", "watch"]

  - apiGroups: ["apps"]
    resources:
      - statefulsets
    verbs: ["list", "watch"]

  - apiGroups: ["storage.k8s.io"]
    resources:
      - storageclasses
    verbs: ["list", "watch"]

  - apiGroups: ["admissionregistration.k8s.io"]
    resources:
      - validatingwebhookconfigurations
    verbs: ["list", "watch"]

  - apiGroups: ["storage.k8s.io"]
    resources:
      - volumeattachments
    verbs: ["list", "watch"]
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/aggregate-clusterroles.yaml
# Read-only access to the monitoring.coreos.com custom resources. The
# rbac.authorization.k8s.io/aggregate-to-* labels mark this role for
# aggregation into the admin, edit and view ClusterRoles.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: kube-prometheus-stack-prometheus-crd-view
  labels:
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
rules:
  - apiGroups: ["monitoring.coreos.com"]
    resources:
      - "alertmanagers"
      - "alertmanagerconfigs"
      - "podmonitors"
      - "probes"
      - "prometheuses"
      - "prometheusagents"
      - "prometheusrules"
      - "scrapeconfigs"
      - "servicemonitors"
    verbs: ["get", "list", "watch"]
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/aggregate-clusterroles.yaml
# Read/write access to the monitoring.coreos.com custom resources. The
# rbac.authorization.k8s.io/aggregate-to-* labels mark this role for
# aggregation into the edit and admin ClusterRoles (not view).
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: kube-prometheus-stack-prometheus-crd-edit
  labels:
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
rules:
  - apiGroups: ["monitoring.coreos.com"]
    resources:
      - "alertmanagers"
      - "alertmanagerconfigs"
      - "podmonitors"
      - "probes"
      - "prometheuses"
      - "prometheusagents"
      - "prometheusrules"
      - "scrapeconfigs"
      - "servicemonitors"
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/clusterrole.yaml
# Cluster-wide permissions for the Prometheus operator.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-prometheus-stack-operator
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
rules:
  # All verbs on the monitoring.coreos.com custom resources and their
  # finalizers/status subresources.
  - apiGroups:
      - monitoring.coreos.com
    resources:
      - alertmanagers
      - alertmanagers/finalizers
      - alertmanagers/status
      - alertmanagerconfigs
      - prometheuses
      - prometheuses/finalizers
      - prometheuses/status
      - prometheusagents
      - prometheusagents/finalizers
      - prometheusagents/status
      - thanosrulers
      - thanosrulers/finalizers
      - thanosrulers/status
      - scrapeconfigs
      - scrapeconfigs/status
      - servicemonitors
      - servicemonitors/status
      - podmonitors
      - podmonitors/status
      - probes
      - probes/status
      - prometheusrules
    verbs:
      - '*'
  # All verbs on StatefulSets.
  - apiGroups:
      - apps
    resources:
      - statefulsets
    verbs:
      - '*'
  # All verbs on ConfigMaps and Secrets, cluster-wide.
  - apiGroups:
      - ""
    resources:
      - configmaps
      - secrets
    verbs:
      - '*'
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - list
      - delete
  - apiGroups:
      - ""
    resources:
      - services
      - services/finalizers
      - endpoints
    verbs:
      - get
      - create
      - update
      - delete
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - namespaces
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
      - events.k8s.io
    resources:
      - events
    verbs:
      - patch
      - create
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
    verbs:
      - get
  - apiGroups:
      - discovery.k8s.io
    resources:
      - endpointslices
    verbs:
      - get
      - create
      - list
      - watch
      - update
      - delete
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/clusterrole.yaml
# Cluster-wide read permissions for the Prometheus server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-prometheus-stack-prometheus
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
rules:
  # These permissions (to examine all namespaces) are not in the kube-prometheus repo.
  # They're grabbed from https://github.com/prometheus/prometheus/blob/master/documentation/examples/rbac-setup.yml
  # kube-prometheus deliberately defaults to a more restrictive setup that is not appropriate for our general audience.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: ["discovery.k8s.io"]
    resources:
      - endpointslices
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - "networking.k8s.io"
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Non-resource endpoints: GET only.
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml
# Binds the kube-state-metrics ClusterRole to the kube-state-metrics
# ServiceAccount in the kube-prometheus-stack namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
  name: kube-prometheus-stack-kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-prometheus-stack-kube-state-metrics
subjects:
  - kind: ServiceAccount
    name: kube-prometheus-stack-kube-state-metrics
    namespace: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/clusterrolebinding.yaml
# Binds the operator ClusterRole to the operator ServiceAccount in the
# kube-prometheus-stack namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-prometheus-stack-operator
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-prometheus-stack-operator
subjects:
  - kind: ServiceAccount
    name: kube-prometheus-stack-operator
    namespace: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/clusterrolebinding.yaml
# Binds the Prometheus ClusterRole to the Prometheus ServiceAccount in the
# kube-prometheus-stack namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-prometheus-stack-prometheus
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-prometheus-stack-prometheus
subjects:
  - kind: ServiceAccount
    name: kube-prometheus-stack-prometheus
    namespace: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml
# ClusterIP Service exposing kube-state-metrics on port 8080 (named "http").
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-kube-state-metrics
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
  # Rendered as a bare (null) key by the template; written as an explicit
  # empty map so the intent is unambiguous.
  annotations: {}
spec:
  type: "ClusterIP"
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: http
  selector:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/service.yaml
# ClusterIP Service exposing node-exporter on port 9100 ("http-metrics").
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-prometheus-node-exporter
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: prometheus-node-exporter-4.49.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "1.10.2"
    release: kube-prometheus-stack
    jobLabel: node-exporter
  annotations:
    prometheus.io/scrape: "true"
spec:
  type: ClusterIP
  ports:
    - port: 9100
      targetPort: 9100
      protocol: TCP
      name: http-metrics
  selector:
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: kube-prometheus-stack
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/alertmanager/service.yaml
# ClusterIP Service for Alertmanager: web port 9093 plus the
# "reloader-web" port 8080.
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-alertmanager
    self-monitor: "true"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  ports:
    - name: http-web
      port: 9093
      targetPort: 9093
      protocol: TCP
    - name: reloader-web
      appProtocol: http
      port: 8080
      targetPort: reloader-web
  selector:
    app.kubernetes.io/name: alertmanager
    alertmanager: kube-prometheus-stack-alertmanager
  sessionAffinity: None
  type: "ClusterIP"
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/core-dns/service.yaml
# Headless Service (clusterIP: None) in kube-system that selects pods
# labelled k8s-app=kube-dns and exposes their metrics port 9153.
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-coredns
  labels:
    app: kube-prometheus-stack-coredns
    jobLabel: coredns
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
  namespace: kube-system
spec:
  clusterIP: None
  ports:
    - name: http-metrics
      port: 9153
      protocol: TCP
      targetPort: 9153
  selector:
    k8s-app: kube-dns
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/kube-etcd/service.yaml
# Headless Service (clusterIP: None) in kube-system exposing port 2381 for
# etcd metrics scraping.
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-kube-etcd
  labels:
    app: kube-prometheus-stack-kube-etcd
    jobLabel: kube-etcd
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
  namespace: kube-system
spec:
  clusterIP: None
  ports:
    - name: http-metrics
      port: 2381
      protocol: TCP
      targetPort: 2381
  selector:
    # NOTE(review): this selects kube-controller-manager pods, not etcd pods.
    # Presumably intentional (to resolve control-plane node addresses when
    # etcd does not run as a pod) — confirm against the Helm values.
    k8s-app: kube-controller-manager
  type: ClusterIP
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/service.yaml
# ClusterIP Service exposing the Prometheus operator on port 443, forwarded
# to the container port named "https".
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-operator
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
spec:
  ports:
    - name: https
      port: 443
      targetPort: https
  selector:
    app: kube-prometheus-stack-operator
    release: "kube-prometheus-stack"
  type: "ClusterIP"
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/service.yaml
# ClusterIP Service for Prometheus: web port 9090 plus the "reloader-web"
# port 8080. Not-ready pod addresses are not published.
apiVersion: v1
kind: Service
metadata:
  name: kube-prometheus-stack-prometheus
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-prometheus
    self-monitor: "true"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  ports:
    - name: http-web
      port: 9090
      targetPort: 9090
    - name: reloader-web
      appProtocol: http
      port: 8080
      targetPort: reloader-web
  publishNotReadyAddresses: false
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: kube-prometheus-stack-prometheus
  sessionAffinity: None
  type: "ClusterIP"
---
# Source: kube-prometheus-stack/charts/ntfy-alertmanager/templates/common.yaml
# ClusterIP Service for ntfy-alertmanager: port 80 forwards to container
# port 8080.
apiVersion: v1
kind: Service
metadata:
  name: ntfy-alertmanager
  labels:
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: kube-prometheus-stack
    app.kubernetes.io/service: ntfy-alertmanager
    helm.sh/chart: ntfy-alertmanager-4.4.0
  namespace: kube-prometheus-stack
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 8080
      protocol: TCP
      name: http
  selector:
    app.kubernetes.io/controller: main
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/name: kube-prometheus-stack
---
# Source: kube-prometheus-stack/templates/service.yaml
# ExternalName Service annotated with a Tailscale tailnet FQDN.
apiVersion: v1
kind: Service
metadata:
  name: node-ps10rp
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: node-ps10rp
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
  annotations:
    tailscale.com/tailnet-fqdn: node-exporter-ps10rp.boreal-beaufort.ts.net
spec:
  # NOTE(review): "placeholder" is presumably overwritten by a controller
  # acting on the tailscale.com/tailnet-fqdn annotation — confirm.
  externalName: placeholder
  type: ExternalName
---
# Source: kube-prometheus-stack/templates/service.yaml
# ExternalName Service annotated with a Tailscale tailnet FQDN.
apiVersion: v1
kind: Service
metadata:
  name: garage-ps10rp
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: garage-ps10rp
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
  annotations:
    tailscale.com/tailnet-fqdn: garage-ps10rp.boreal-beaufort.ts.net
spec:
  # NOTE(review): "placeholder" is presumably overwritten by a controller
  # acting on the tailscale.com/tailnet-fqdn annotation — confirm.
  externalName: placeholder
  type: ExternalName
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/daemonset.yaml
# DaemonSet running node-exporter on Linux nodes. The pod uses the host
# network and host PID namespace and mounts the host's /proc, /sys and /
# read-only so the exporter can report host-level metrics.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-prometheus-stack-prometheus-node-exporter
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: prometheus-node-exporter-4.49.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "1.10.2"
    release: kube-prometheus-stack
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-node-exporter
      app.kubernetes.io/instance: kube-prometheus-stack
  revisionHistoryLimit: 10
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
      labels:
        helm.sh/chart: prometheus-node-exporter-4.49.1
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/component: metrics
        app.kubernetes.io/part-of: prometheus-node-exporter
        app.kubernetes.io/name: prometheus-node-exporter
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/version: "1.10.2"
        release: kube-prometheus-stack
        jobLabel: node-exporter
    spec:
      automountServiceAccountToken: false
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: kube-prometheus-stack-prometheus-node-exporter
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.10.2
          imagePullPolicy: IfNotPresent
          args:
            # Paths point at the host filesystems mounted under /host below.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            - --path.udev.data=/host/root/run/udev/data
            # $(HOST_IP) is expanded from the env entry below.
            - --web.listen-address=[$(HOST_IP)]:9100
            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/containerd/.+|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs|erofs)$
          securityContext:
            readOnlyRootFilesystem: true
          env:
            - name: HOST_IP
              value: 0.0.0.0
          ports:
            - name: http-metrics
              containerPort: 9100
              protocol: TCP
          livenessProbe:
            failureThreshold: 3
            httpGet:
              # Rendered as a bare (null) key; written as an explicit empty list.
              httpHeaders: []
              path: /
              port: http-metrics
              scheme: HTTP
            initialDelaySeconds: 0
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          readinessProbe:
            failureThreshold: 3
            httpGet:
              # Rendered as a bare (null) key; written as an explicit empty list.
              httpHeaders: []
              path: /
              port: http-metrics
              scheme: HTTP
            initialDelaySeconds: 0
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
      hostNetwork: true
      hostPID: true
      hostIPC: false
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions sit in one term: skip nodes whose compute
              # type is "fargate" and nodes whose type is "virtual-kubelet".
              - matchExpressions:
                  - key: eks.amazonaws.com/compute-type
                    operator: NotIn
                    values:
                      - fargate
                  - key: type
                    operator: NotIn
                    values:
                      - virtual-kubelet
      nodeSelector:
        kubernetes.io/os: linux
      tolerations:
        # Tolerates every NoSchedule taint regardless of key.
        - effect: NoSchedule
          operator: Exists
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml
# Single-replica Deployment of kube-state-metrics. Runs as non-root
# (uid/gid 65534) with a read-only root filesystem, all capabilities dropped
# and the RuntimeDefault seccomp profile.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-prometheus-stack-kube-state-metrics
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
      app.kubernetes.io/instance: kube-prometheus-stack
  replicas: 1
  strategy:
    type: RollingUpdate
  revisionHistoryLimit: 10
  template:
    metadata:
      labels:
        helm.sh/chart: kube-state-metrics-6.4.1
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/component: metrics
        app.kubernetes.io/part-of: kube-state-metrics
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/version: "2.17.0"
        release: kube-prometheus-stack
    spec:
      automountServiceAccountToken: true
      hostNetwork: false
      serviceAccountName: kube-prometheus-stack-kube-state-metrics
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      dnsPolicy: ClusterFirst
      containers:
        - name: kube-state-metrics
          args:
            - --port=8080
            # The resource list matches the resources granted in the
            # kube-state-metrics ClusterRole.
            - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments
          imagePullPolicy: IfNotPresent
          image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.17.0
          ports:
            - containerPort: 8080
              name: http
          livenessProbe:
            failureThreshold: 3
            httpGet:
              # Rendered as a bare (null) key; written as an explicit empty list.
              httpHeaders: []
              path: /livez
              port: 8080
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            failureThreshold: 3
            httpGet:
              # Rendered as a bare (null) key; written as an explicit empty list.
              httpHeaders: []
              path: /readyz
              # NOTE(review): probes 8081 although only 8080 is declared under
              # ports — presumably a separate telemetry listener; confirm.
              port: 8081
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          # No requests or limits are set.
          resources: {}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml
# Deployment running a single Prometheus Operator replica. The container
# serves HTTPS on port 10250 using the certificate mounted at /cert.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-prometheus-stack-operator
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
spec:
  replicas: 1
  revisionHistoryLimit: 10
  # The selector uses the `app` and `release` labels, both present on the
  # pod template below.
  selector:
    matchLabels:
      app: kube-prometheus-stack-operator
      release: "kube-prometheus-stack"
  template:
    metadata:
      labels:
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/version: "79.7.1"
        app.kubernetes.io/part-of: kube-prometheus-stack
        chart: kube-prometheus-stack-79.7.1
        release: "kube-prometheus-stack"
        heritage: "Helm"
        app: kube-prometheus-stack-operator
        app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
        app.kubernetes.io/component: prometheus-operator
    spec:
      containers:
        - name: kube-prometheus-stack
          image: "quay.io/prometheus-operator/prometheus-operator:v0.86.2"
          imagePullPolicy: "IfNotPresent"
          args:
            - --kubelet-service=kube-system/kube-prometheus-stack-kubelet
            - --kubelet-endpoints=true
            - --kubelet-endpointslice=false
            - --localhost=127.0.0.1
            # Config-reloader image tag matches the operator image tag (v0.86.2).
            - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.86.2
            # NOTE(review): presumably 0 means "do not set" for the reloader
            # sidecar requests/limits — confirm against the operator flags.
            - --config-reloader-cpu-request=0
            - --config-reloader-cpu-limit=0
            - --config-reloader-memory-request=0
            - --config-reloader-memory-limit=0
            - --thanos-default-base-image=quay.io/thanos/thanos:v0.40.1
            - --secret-field-selector=type!=kubernetes.io/dockercfg,type!=kubernetes.io/service-account-token,type!=helm.sh/release.v1
            # TLS serving: cert/key paths live under the tls-secret mount.
            - --web.enable-tls=true
            - --web.cert-file=/cert/cert
            - --web.key-file=/cert/key
            - --web.listen-address=:10250
            - --web.tls-min-version=VersionTLS13
          ports:
            # Same port as --web.listen-address above.
            - containerPort: 10250
              name: https
          env:
            - name: GOGC
              value: "30"
          # No resource requests or limits are set.
          resources:
            {}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          volumeMounts:
            - name: tls-secret
              mountPath: /cert
              readOnly: true
          # Readiness and liveness both probe /healthz over HTTPS on the
          # named `https` port, with identical timings.
          readinessProbe:
            httpGet:
              path: /healthz
              port: https
              scheme: HTTPS
            initialDelaySeconds: 0
            periodSeconds: 10
            timeoutSeconds: 1
            successThreshold: 1
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /healthz
              port: https
              scheme: HTTPS
            initialDelaySeconds: 0
            periodSeconds: 10
            timeoutSeconds: 1
            successThreshold: 1
            failureThreshold: 3
      volumes:
        - name: tls-secret
          secret:
            # 420 is the decimal form of octal 0644.
            defaultMode: 420
            secretName: kube-prometheus-stack-admission
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      serviceAccountName: kube-prometheus-stack-operator
      automountServiceAccountToken: true
      terminationGracePeriodSeconds: 30
|
|
---
|
|
# Source: kube-prometheus-stack/charts/ntfy-alertmanager/templates/common.yaml
---
# Deployment running a single ntfy-alertmanager container whose
# configuration file is mounted read-only from a Secret.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ntfy-alertmanager
  labels:
    app.kubernetes.io/controller: main
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: kube-prometheus-stack
    helm.sh/chart: ntfy-alertmanager-4.4.0
  namespace: kube-prometheus-stack
spec:
  revisionHistoryLimit: 3
  replicas: 1
  # Recreate: the old pod is terminated before the replacement is created.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/controller: main
      app.kubernetes.io/name: kube-prometheus-stack
      app.kubernetes.io/instance: kube-prometheus-stack
  template:
    metadata:
      labels:
        app.kubernetes.io/controller: main
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/name: kube-prometheus-stack
    spec:
      enableServiceLinks: false
      # NOTE(review): the namespace's `default` ServiceAccount is used and its
      # token is auto-mounted — confirm the workload needs API access.
      serviceAccountName: default
      automountServiceAccountToken: true
      hostIPC: false
      hostNetwork: false
      hostPID: false
      dnsPolicy: ClusterFirst
      containers:
        - image: xenrox/ntfy-alertmanager:0.5.0
          imagePullPolicy: IfNotPresent
          name: main
          volumeMounts:
            # subPath mounts only the `config` key of the Secret volume at
            # the given file path.
            - mountPath: /etc/ntfy-alertmanager/config
              mountPropagation: None
              name: config
              readOnly: true
              subPath: config
      volumes:
        - name: config
          secret:
            secretName: ntfy-alertmanager-config-secret
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/alertmanager/alertmanager.yaml
# Alertmanager custom resource (monitoring.coreos.com/v1) declaring a
# single-replica Alertmanager v0.29.0 instance.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: kube-prometheus-stack-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  image: "quay.io/prometheus/alertmanager:v0.29.0"
  imagePullPolicy: "IfNotPresent"
  version: v0.29.0
  replicas: 1
  listenLocal: false
  serviceAccountName: kube-prometheus-stack-alertmanager
  automountServiceAccountToken: true
  # NOTE(review): this is a cluster-internal http URL, while this file also
  # routes alertmanager.alexlebens.net to this service — confirm which URL
  # should appear in generated links.
  externalUrl: http://kube-prometheus-stack-alertmanager.kube-prometheus-stack:9093
  paused: false
  logFormat: "logfmt"
  logLevel: "info"
  retention: "120h"
  # Extra Secrets made available to the Alertmanager pods.
  secrets:
    - alertmanager-config-secret
  # Empty selectors.
  alertmanagerConfigSelector: {}
  alertmanagerConfigNamespaceSelector:
    {}
  routePrefix: "/"
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
    seccompProfile:
      type: RuntimeDefault
  # Soft (preferred) anti-affinity keyed on the node hostname.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            topologyKey: kubernetes.io/hostname
            labelSelector:
              matchExpressions:
                - {key: app.kubernetes.io/name, operator: In, values: [alertmanager]}
                - {key: alertmanager, operator: In, values: [kube-prometheus-stack-alertmanager]}
  portName: http-web
|
|
---
|
|
# Source: kube-prometheus-stack/templates/external-secret.yaml
# ExternalSecret reading three values from the `vault` ClusterSecretStore:
# two Pushover credentials and the ntfy password.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: alertmanager-config-secret
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: alertmanager-config-secret
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: vault
  # Each entry maps one remote property (remoteRef.key + property) to one
  # key (secretKey) of the resulting Secret.
  data:
    - secretKey: pushover_token
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /pushover/key
        metadataPolicy: None
        property: alertmanager_key
    - secretKey: pushover_user_key
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /pushover/key
        metadataPolicy: None
        property: user_key
    - secretKey: ntfy_password
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /cl01tl/kube-prometheus-stack/ntfy-alertmanager
        metadataPolicy: None
        property: ntfy_password
|
|
---
|
|
# Source: kube-prometheus-stack/templates/external-secret.yaml
# ExternalSecret reading the `metric` property of /garage/token from the
# `vault` ClusterSecretStore into the Secret key `token`.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: garage-metric-secret
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: garage-metric-secret
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: vault
  data:
    - secretKey: token
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /garage/token
        metadataPolicy: None
        property: metric
|
|
---
|
|
# Source: kube-prometheus-stack/templates/external-secret.yaml
# ExternalSecret reading the ntfy password and the ntfy-alertmanager config
# from the `vault` ClusterSecretStore. Both entries use the same remote key.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: ntfy-alertmanager-config-secret
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: ntfy-alertmanager-config-secret
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: vault
  data:
    - secretKey: ntfy_password
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /cl01tl/kube-prometheus-stack/ntfy-alertmanager
        metadataPolicy: None
        property: ntfy_password
    # `config` is the key the ntfy-alertmanager Deployment mounts via subPath.
    - secretKey: config
      remoteRef:
        conversionStrategy: Default
        decodingStrategy: None
        key: /cl01tl/kube-prometheus-stack/ntfy-alertmanager
        metadataPolicy: None
        property: config
|
|
---
|
|
# Source: kube-prometheus-stack/templates/http-route.yaml
# HTTPRoute attaching prometheus.alexlebens.net to the traefik Gateway and
# forwarding every path to the prometheus-operated Service on port 9090.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: http-route-prometheus
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: http-route-prometheus
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  # The parent Gateway lives in a different namespace (traefik).
  parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: traefik-gateway
      namespace: traefik
  hostnames:
    - prometheus.alexlebens.net
  rules:
    - matches:
        # PathPrefix "/" matches every request path.
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        # The empty group denotes the core API group (Service).
        - group: ''
          kind: Service
          name: prometheus-operated
          port: 9090
          weight: 100
|
|
---
|
|
# Source: kube-prometheus-stack/templates/http-route.yaml
# HTTPRoute attaching alertmanager.alexlebens.net to the traefik Gateway and
# forwarding every path to the kube-prometheus-stack-alertmanager Service on
# port 9093.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: http-route-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: http-route-alertmanager
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  # The parent Gateway lives in a different namespace (traefik).
  parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: traefik-gateway
      namespace: traefik
  hostnames:
    - alertmanager.alexlebens.net
  rules:
    - matches:
        # PathPrefix "/" matches every request path.
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        # The empty group denotes the core API group (Service).
        - group: ''
          kind: Service
          name: kube-prometheus-stack-alertmanager
          port: 9093
          weight: 100
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
# Mutating admission webhook that sends CREATE/UPDATE requests for
# PrometheusRule objects to the operator Service's mutate endpoint.
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
  name: kube-prometheus-stack-admission
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    app: kube-prometheus-stack-admission
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator-webhook
webhooks:
  - name: prometheusrulemutate.monitoring.coreos.com
    # Ignore: a webhook call failure does not reject the API request.
    failurePolicy: Ignore
    rules:
      - apiGroups:
          - monitoring.coreos.com
        apiVersions:
          - "*"
        resources:
          - prometheusrules
        operations:
          - CREATE
          - UPDATE
    clientConfig:
      service:
        namespace: kube-prometheus-stack
        name: kube-prometheus-stack-operator
        path: /admission-prometheusrules/mutate
    timeoutSeconds: 10
    admissionReviewVersions: ["v1", "v1beta1"]
    sideEffects: None
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/prometheus.yaml
# Prometheus custom resource (monitoring.coreos.com/v1) declaring a
# single-replica, single-shard Prometheus v3.7.3 with 30d retention and a
# 250Gi persistent volume claim.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: kube-prometheus-stack-prometheus
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  automountServiceAccountToken: true
  # Alerts go to the Alertmanager Service in this namespace, on its
  # `http-web` port.
  alerting:
    alertmanagers:
      - namespace: kube-prometheus-stack
        name: kube-prometheus-stack-alertmanager
        port: http-web
        pathPrefix: "/"
        apiVersion: v2
  image: "quay.io/prometheus/prometheus:v3.7.3"
  imagePullPolicy: "IfNotPresent"
  version: v3.7.3
  # Same hostname as the HTTPRoute for Prometheus in this file.
  externalUrl: "https://prometheus.alexlebens.net"
  paused: false
  replicas: 1
  shards: 1
  logLevel: "info"
  logFormat: logfmt
  listenLocal: false
  enableOTLPReceiver: false
  enableAdminAPI: false
  scrapeInterval: 30s
  retention: "30d"
  tsdb:
    outOfOrderTimeWindow: 0s
  walCompression: true
  routePrefix: "/"
  serviceAccountName: kube-prometheus-stack-prometheus
  # Empty selectors for ServiceMonitors, PodMonitors, rules and scrape
  # configs; only probeSelector filters on a label (release).
  serviceMonitorSelector: {}
  serviceMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  podMonitorNamespaceSelector: {}
  probeSelector:
    matchLabels:
      release: "kube-prometheus-stack"
  probeNamespaceSelector: {}
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
    seccompProfile:
      type: RuntimeDefault
  ruleNamespaceSelector: {}
  ruleSelector: {}
  scrapeConfigSelector: {}
  scrapeConfigNamespaceSelector: {}
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 250Gi
        storageClassName: synology-iscsi-delete
  # Soft (preferred) anti-affinity keyed on the node hostname.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            topologyKey: kubernetes.io/hostname
            labelSelector:
              matchExpressions:
                - {key: app.kubernetes.io/name, operator: In, values: [prometheus]}
                - {key: app.kubernetes.io/instance, operator: In, values: [kube-prometheus-stack-prometheus]}
  portName: http-web
  hostNetwork: false
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
# PrometheusRule with alerts about the Alertmanager itself: config reloads,
# cluster membership, notification failures, config drift, downtime and
# crash loops. Every expr selects job="kube-prometheus-stack-alertmanager"
# in the kube-prometheus-stack namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-alertmanager.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: alertmanager.rules
      rules:
        # Fires when the last config reload has been unsuccessful for the
        # whole 5m window, sustained for 10m.
        - alert: AlertmanagerFailedReload
          annotations:
            description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
            summary: Reloading an Alertmanager configuration has failed.
          expr: |-
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[5m]) == 0
          for: 10m
          labels:
            severity: critical
        # Fires when an instance reports fewer cluster members than the
        # number of instances counted for the same namespace/service/cluster.
        - alert: AlertmanagerMembersInconsistent
          annotations:
            description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
            summary: A member of an Alertmanager cluster has not found all other cluster members.
          expr: |-
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[5m])
            < on (namespace,service,cluster) group_left
              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[5m]))
          for: 15m
          labels:
            severity: critical
        # Per-instance notification failure ratio above 1% over 15m.
        - alert: AlertmanagerFailedToSendAlerts
          annotations:
            description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
            summary: An Alertmanager instance failed to send notifications.
          expr: |-
            (
              rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[15m])
            /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[15m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        # Critical variant: integration=~`.*` matches every integration, and
        # min by (...) takes the lowest failure ratio across instances.
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
          expr: |-
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack", integration=~`.*`}[15m])
            /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack", integration=~`.*`}[15m])
            )
            > 0.01
          for: 5m
          labels:
            severity: critical
        # Warning variant: integration!~`.*` excludes every integration
        # value, so as rendered this selector matches no series.
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
          expr: |-
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack", integration!~`.*`}[15m])
            /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack", integration!~`.*`}[15m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        # Fires when more than one distinct config hash is seen within one
        # namespace/service/cluster group.
        - alert: AlertmanagerConfigInconsistent
          annotations:
            description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
            summary: Alertmanager instances within the same cluster have different configurations.
          expr: |-
            count by (namespace,service,cluster) (
              count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"})
            )
            != 1
          for: 20m
          labels:
            severity: critical
        # Fraction of instances with average `up` below 0.5 over 5m is at
        # least one half.
        - alert: AlertmanagerClusterDown
          annotations:
            description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
            summary: Half or more of the Alertmanager instances within the same cluster are down.
          expr: |-
            (
              count by (namespace,service,cluster) (
                avg_over_time(up{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[5m]) < 0.5
              )
            /
              count by (namespace,service,cluster) (
                up{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical
        # Fraction of instances whose process start time changed more than
        # 4 times in 10m is at least one half.
        - alert: AlertmanagerClusterCrashlooping
          annotations:
            description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
            summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
          expr: |-
            (
              count by (namespace,service,cluster) (
                changes(process_start_time_seconds{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}[10m]) > 4
              )
            /
              count by (namespace,service,cluster) (
                up{job="kube-prometheus-stack-alertmanager",container="alertmanager",namespace="kube-prometheus-stack"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml
# PrometheusRule with a single alert on config-reloader sidecars whose last
# reload was unsuccessful.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-config-reloaders
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: config-reloaders
      rules:
        # namespace=~".+" selects series from any non-empty namespace.
        - alert: ConfigReloaderSidecarErrors
          annotations:
            # The blank line inside the single-quoted scalar yields a line
            # break in the resulting string.
            description: 'Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.

              As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
            summary: config-reloader sidecar has not had a successful reload for 10m
          expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
          for: 10m
          labels:
            severity: warning
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: kube-prometheus-stack-etcd
|
|
namespace: kube-prometheus-stack
|
|
labels:
|
|
app: kube-prometheus-stack
|
|
|
|
app.kubernetes.io/managed-by: Helm
|
|
app.kubernetes.io/instance: kube-prometheus-stack
|
|
app.kubernetes.io/version: "79.7.1"
|
|
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
chart: kube-prometheus-stack-79.7.1
|
|
release: "kube-prometheus-stack"
|
|
heritage: "Helm"
|
|
spec:
|
|
groups:
|
|
- name: etcd
|
|
rules:
|
|
- alert: etcdMembersDown
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
|
|
summary: etcd cluster members are down.
|
|
expr: |-
|
|
max without (endpoint) (
|
|
sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0)
|
|
or
|
|
count without (To) (
|
|
sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
|
)
|
|
)
|
|
> 0
|
|
for: 20m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdInsufficientMembers
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
|
|
summary: etcd cluster has insufficient number of members.
|
|
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance, pod) < ((count(up{job=~".*etcd.*"}) without (instance, pod) + 1) / 2)
|
|
for: 3m
|
|
labels:
|
|
severity: critical
|
|
- alert: etcdNoLeader
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
|
summary: etcd cluster has no leader.
|
|
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
- alert: etcdHighNumberOfLeaderChanges
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
|
summary: etcd cluster has high number of leader changes.
|
|
expr: increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster has high number of failed grpc requests.
|
|
expr: |-
|
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
|
/
|
|
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
|
> 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster has high number of failed grpc requests.
|
|
expr: |-
|
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
|
/
|
|
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
|
> 5
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: etcdGRPCRequestsSlow
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
|
|
summary: etcd grpc requests are slow
|
|
expr: |-
|
|
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
|
> 0.15
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: etcdMemberCommunicationSlow
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster member communication is slow.
|
|
expr: |-
|
|
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
|
> 0.15
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdHighNumberOfFailedProposals
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster has high number of proposal failures.
|
|
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdHighFsyncDurations
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
|
expr: |-
|
|
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
|
> 0.5
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: etcdHighFsyncDurations
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
|
expr: |-
|
|
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
|
> 1
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: etcdHighCommitDurations
|
|
annotations:
|
|
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
|
summary: etcd cluster 99th percentile commit durations are too high.
|
|
expr: |-
|
|
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
|
> 0.25
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
# Critical: the latest database size is more than 95% of the latest backend
# quota (both taken with last_over_time over 5m) and has been for 10m.
- alert: etcdDatabaseQuotaLowSpace
  annotations:
    description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
    summary: etcd cluster database is running full.
  expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
  for: 10m
  labels:
    severity: critical
# Warning: a linear projection of the database size, fitted on the last 4h and
# extended 4*60*60 seconds ahead, exceeds the backend quota for 10m.
- alert: etcdExcessiveDatabaseGrowth
  annotations:
    description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
    summary: etcd cluster database growing very fast.
  expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
  for: 10m
  labels:
    severity: warning
# Warning: in-use size is under half of the total database size, and the
# in-use size itself is above 104857600 bytes (100 MiB), for 10m.
- alert: etcdDatabaseHighFragmentationRatio
  annotations:
    description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
    runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
    summary: etcd database size in use is less than 50% of the actual allocated storage.
  expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
  for: 10m
  labels:
    severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml
# General-purpose alerts: scrape target health plus two always-on helper
# alerts (Watchdog, InfoInhibitor) that carry severity "none".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-general.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: general.rules
    rules:
    # Fires when more than 10% of the targets sharing a
    # (cluster, job, namespace, service) tuple report up == 0 for 10m.
    - alert: TargetDown
      annotations:
        description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
        summary: One or more targets are unreachable.
      expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
      for: 10m
      labels:
        severity: warning
    # vector(1) always yields a sample, so this alert is permanently firing.
    - alert: Watchdog
      annotations:
        description: |
          This is an alert meant to ensure that the entire alerting pipeline is functional.
          This alert is always firing, therefore it should always be firing in Alertmanager
          and always fire against a receiver. There are integrations with various notification
          mechanisms that send a notification when this alert is not firing. For example the
          "DeadMansSnitch" integration in PagerDuty.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
        summary: An alert that should always be firing to certify that Alertmanager is working properly.
      expr: vector(1)
      labels:
        severity: none
    # Present for every severity="info" alert unless a firing warning/critical
    # alert (other than InfoInhibitor itself) exists in the same namespace.
    - alert: InfoInhibitor
      annotations:
        description: |
          This is an alert that is used to inhibit info alerts.
          By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
          other alerts.
          This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
          severity of 'warning' or 'critical' starts firing on the same namespace.
          This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
        summary: Info-level alert inhibition.
      expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
      labels:
        severity: none
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
# Recording rules for per-container CPU usage from the kubelet cAdvisor
# endpoint, enriched with the node label taken from kube_pod_info.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_cpu_usage_seconds_total
    rules:
    # rate() variant over a 5m window. topk(1, ...) keeps a single
    # kube_pod_info series per pod for the group_left(node) join.
    - expr: |-
        sum by (cluster, namespace, pod, container) (
          rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
          1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m
    # Same shape as the rule above, using irate() instead of rate().
    - expr: |-
        sum by (cluster, namespace, pod, container) (
          irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
          1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml
# Recording rule: container_memory_cache from the kubelet cAdvisor endpoint,
# with the node label attached from kube_pod_info.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-memory-cache
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_memory_cache
    rules:
    # topk(1, ...) keeps a single kube_pod_info series per pod for the join.
    - expr: |-
        container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
        * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
          max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_memory_cache
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml
# Recording rule: container_memory_rss from the kubelet cAdvisor endpoint,
# with the node label attached from kube_pod_info.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-memory-rss
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_memory_rss
    rules:
    # topk(1, ...) keeps a single kube_pod_info series per pod for the join.
    - expr: |-
        container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
        * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
          max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_memory_rss
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml
# Recording rule: container_memory_swap from the kubelet cAdvisor endpoint,
# with the node label attached from kube_pod_info.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-memory-swap
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_memory_swap
    rules:
    # topk(1, ...) keeps a single kube_pod_info series per pod for the join.
    - expr: |-
        container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
        * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
          max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_memory_swap
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml
# Recording rule: container_memory_working_set_bytes from the kubelet cAdvisor
# endpoint, with the node label attached from kube_pod_info.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-memory-working-set-by
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_memory_working_set_bytes
    rules:
    # topk(1, ...) keeps a single kube_pod_info series per pod for the join.
    - expr: |-
        container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
        * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
          max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
        )
      record: node_namespace_pod_container:container_memory_working_set_bytes
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml
# Recording rules for container resource requests and limits (memory and CPU),
# restricted to pods whose kube_pod_status_phase is Pending or Running.
# Each resource/kind pair gets a per-container series and a per-namespace sum.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.container-resource
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.container_resource
    rules:
    # Memory requests of containers in Pending/Running pods.
    - expr: |-
        kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
        group_left() max by (namespace, pod, cluster) (
          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
        )
      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
    # Memory requests summed per (namespace, cluster).
    - expr: |-
        sum by (namespace, cluster) (
          sum by (namespace, pod, cluster) (
            max by (namespace, pod, container, cluster) (
              kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
            ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
              kube_pod_status_phase{phase=~"Pending|Running"} == 1
            )
          )
        )
      record: namespace_memory:kube_pod_container_resource_requests:sum
    # CPU requests of containers in Pending/Running pods.
    - expr: |-
        kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
        group_left() max by (namespace, pod, cluster) (
          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
        )
      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
    # CPU requests summed per (namespace, cluster).
    - expr: |-
        sum by (namespace, cluster) (
          sum by (namespace, pod, cluster) (
            max by (namespace, pod, container, cluster) (
              kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
            ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
              kube_pod_status_phase{phase=~"Pending|Running"} == 1
            )
          )
        )
      record: namespace_cpu:kube_pod_container_resource_requests:sum
    # Memory limits of containers in Pending/Running pods.
    - expr: |-
        kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
        group_left() max by (namespace, pod, cluster) (
          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
        )
      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
    # Memory limits summed per (namespace, cluster).
    - expr: |-
        sum by (namespace, cluster) (
          sum by (namespace, pod, cluster) (
            max by (namespace, pod, container, cluster) (
              kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
            ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
              kube_pod_status_phase{phase=~"Pending|Running"} == 1
            )
          )
        )
      record: namespace_memory:kube_pod_container_resource_limits:sum
    # CPU limits of containers in Pending/Running pods.
    - expr: |-
        kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
        group_left() max by (namespace, pod, cluster) (
          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
        )
      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
    # CPU limits summed per (namespace, cluster).
    - expr: |-
        sum by (namespace, cluster) (
          sum by (namespace, pod, cluster) (
            max by (namespace, pod, container, cluster) (
              kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
            ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
              kube_pod_status_phase{phase=~"Pending|Running"} == 1
            )
          )
        )
      record: namespace_cpu:kube_pod_container_resource_limits:sum
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml
# Recording rules that all write namespace_workload_pod:kube_pod_owner:relabel,
# mapping each pod to a "workload" label derived from its owner. The first
# seven rules set workload_type through a static label; the last rule derives
# it from owner_kind inside the expression.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-k8s.rules.pod-owner
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: k8s.rules.pod_owner
    rules:
    # Pods owned by a ReplicaSet whose own owner_kind is empty: the workload is
    # the ReplicaSet name.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
              "replicaset", "$1", "owner_name", "(.*)"
            ) * on (cluster, replicaset, namespace) group_left(owner_name) topk by (cluster, replicaset, namespace) (
              1, max by (cluster, replicaset, namespace, owner_name) (
                kube_replicaset_owner{job="kube-state-metrics", owner_kind=""}
              )
            ),
            "workload", "$1", "replicaset", "(.*)"
          )
        )
      labels:
        workload_type: replicaset
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods owned by a ReplicaSet that is owned by a Deployment: the workload is
    # the ReplicaSet's owner_name.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
              "replicaset", "$1", "owner_name", "(.*)"
            ) * on (replicaset, namespace, cluster) group_left(owner_name) topk by (cluster, replicaset, namespace) (
              1, max by (cluster, replicaset, namespace, owner_name) (
                kube_replicaset_owner{job="kube-state-metrics", owner_kind="Deployment"}
              )
            ),
            "workload", "$1", "owner_name", "(.*)"
          )
        )
      labels:
        workload_type: deployment
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods owned directly by a DaemonSet.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
        )
      labels:
        workload_type: daemonset
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods owned directly by a StatefulSet.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
          "workload", "$1", "owner_name", "(.*)")
        )
      labels:
        workload_type: statefulset
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods owned by a Job whose kube_job_owner owner_kind is "Pod" or empty:
    # the workload is the Job name.
    - expr: |-
        group by (cluster, namespace, workload, pod) (
          label_join(
            group by (cluster, namespace, job_name, pod, owner_name) (
              label_join(
                kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}
              , "job_name", "", "owner_name")
            )
            * on (cluster, namespace, job_name) group_left()
            group by (cluster, namespace, job_name) (
              kube_job_owner{job="kube-state-metrics", owner_kind=~"Pod|"}
            )
          , "workload", "", "owner_name")
        )
      labels:
        workload_type: job
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods with no owner: the workload is the pod name.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="", owner_name=""},
          "workload", "$1", "pod", "(.+)")
        )
      labels:
        workload_type: barepod
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Pods whose owner_kind is "Node": the workload is the pod name.
    - expr: |-
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="Node"},
          "workload", "$1", "pod", "(.+)")
        )
      labels:
        workload_type: staticpod
      record: namespace_workload_pod:kube_pod_owner:relabel
    # Remaining owner kinds. workload and workload_type are copied from the
    # owner's owner_name / owner_kind for: Jobs with a non-Pod owner,
    # ReplicaSets with a non-Deployment owner, and pods whose owner_kind is
    # none of the kinds handled by the rules above.
    - expr: |-
        group by (cluster, namespace, workload, workload_type, pod) (
          label_join(
            label_join(
              group by (cluster, namespace, job_name, pod) (
                label_join(
                  kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}
                , "job_name", "", "owner_name")
              )
              * on (cluster, namespace, job_name) group_left(owner_kind, owner_name)
              group by (cluster, namespace, job_name, owner_kind, owner_name) (
                kube_job_owner{job="kube-state-metrics", owner_kind!="Pod", owner_kind!=""}
              )
            , "workload", "", "owner_name")
          , "workload_type", "", "owner_kind")

          OR

          label_replace(
            label_replace(
              label_replace(
                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}
                , "replicaset", "$1", "owner_name", "(.+)"
              )
              * on (cluster, namespace, replicaset) group_left(owner_kind, owner_name)
              group by (cluster, namespace, replicaset, owner_kind, owner_name) (
                kube_replicaset_owner{job="kube-state-metrics", owner_kind!="Deployment", owner_kind!=""}
              )
            , "workload", "$1", "owner_name", "(.+)")
            OR
            label_replace(
              group by (cluster, namespace, pod, owner_name, owner_kind) (
                kube_pod_owner{job="kube-state-metrics", owner_kind!="ReplicaSet", owner_kind!="DaemonSet", owner_kind!="StatefulSet", owner_kind!="Job", owner_kind!="Node", owner_kind!=""}
              )
              , "workload", "$1", "owner_name", "(.+)"
            )
          , "workload_type", "$1", "owner_kind", "(.+)")
        )
      record: namespace_workload_pod:kube_pod_owner:relabel
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml
# Recording rules that build 30-day API server availability figures from
# hourly increases of request counts and SLI duration histogram buckets.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-apiserver-availability.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  # This group sets an explicit interval of 3m.
  - interval: 3m
    name: kube-apiserver-availability.rules
    rules:
    # 30d request total: average of the 1h increase over 30d, times 24 * 30.
    - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
      record: code_verb:apiserver_request_total:increase30d
    # Collapse individual verbs into verb="read" (LIST, GET).
    - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
      labels:
        verb: read
      record: code:apiserver_request_total:increase30d
    # Collapse individual verbs into verb="write" (POST, PUT, PATCH, DELETE).
    - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
      labels:
        verb: write
      record: code:apiserver_request_total:increase30d
    # Hourly and 30d increases of the SLI duration buckets; the le="+Inf"
    # bucket is reused as the request count.
    - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
      record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
    - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
      record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
    - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"})
      record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
    - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"})
      record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
    # Overall availability: 1 - (slow writes + slow reads + 5xx) / all requests.
    # Latency cut-offs used below: 1 for writes and resource-scoped reads,
    # 5 for namespace-scoped reads, 30 for cluster-scoped reads.
    - expr: |-
        1 - (
          (
            # write too slow
            sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
            -
            sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0))
          ) +
          (
            # read too slow
            sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
            -
            (
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0))
              +
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0))
              +
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0))
            )
          ) +
          # errors
          sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
        )
        /
        sum by (cluster) (code:apiserver_request_total:increase30d)
      labels:
        verb: all
      record: apiserver_request:availability30d
    # Read-only availability (LIST, GET).
    - expr: |-
        1 - (
          sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
          -
          (
            # too slow
            sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0))
            +
            sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0))
            +
            sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0))
          )
          +
          # errors
          sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
        )
        /
        sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
      labels:
        verb: read
      record: apiserver_request:availability30d
    # Write-only availability (POST, PUT, PATCH, DELETE).
    - expr: |-
        1 - (
          (
            # too slow
            sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
            -
            sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0))
          )
          +
          # errors
          sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
        )
        /
        sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
      labels:
        verb: write
      record: apiserver_request:availability30d
    # 5m request rates per (cluster, code, resource), split into read/write.
    - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
      labels:
        verb: read
      record: code_resource:apiserver_request_total:rate5m
    - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
      labels:
        verb: write
      record: code_resource:apiserver_request_total:rate5m
    # Hourly request increases, one rule per status-code class (2xx..5xx),
    # all written to the same record name.
    - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
      record: code_verb:apiserver_request_total:increase1h
    - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
      record: code_verb:apiserver_request_total:increase1h
    - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
      record: code_verb:apiserver_request_total:increase1h
    - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
      record: code_verb:apiserver_request_total:increase1h
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: kube-prometheus-stack-kube-apiserver-burnrate.rules
|
|
namespace: kube-prometheus-stack
|
|
labels:
|
|
app: kube-prometheus-stack
|
|
|
|
app.kubernetes.io/managed-by: Helm
|
|
app.kubernetes.io/instance: kube-prometheus-stack
|
|
app.kubernetes.io/version: "79.7.1"
|
|
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
chart: kube-prometheus-stack-79.7.1
|
|
release: "kube-prometheus-stack"
|
|
heritage: "Helm"
|
|
spec:
|
|
groups:
|
|
- name: kube-apiserver-burnrate.rules
|
|
rules:
|
|
- expr: |-
|
|
(
|
|
(
|
|
# too slow
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
|
|
-
|
|
(
|
|
(
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1d]))
|
|
or
|
|
vector(0)
|
|
)
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1d]))
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1d]))
|
|
)
|
|
)
|
|
+
|
|
# errors
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
|
)
|
|
/
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
|
labels:
|
|
verb: read
|
|
record: apiserver_request:burnrate1d
|
|
- expr: |-
|
|
(
|
|
(
|
|
# too slow
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
|
|
-
|
|
(
|
|
(
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1h]))
|
|
or
|
|
vector(0)
|
|
)
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1h]))
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1h]))
|
|
)
|
|
)
|
|
+
|
|
# errors
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
|
)
|
|
/
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
|
labels:
|
|
verb: read
|
|
record: apiserver_request:burnrate1h
|
|
- expr: |-
|
|
(
|
|
(
|
|
# too slow
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
|
|
-
|
|
(
|
|
(
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[2h]))
|
|
or
|
|
vector(0)
|
|
)
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[2h]))
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[2h]))
|
|
)
|
|
)
|
|
+
|
|
# errors
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
|
)
|
|
/
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
|
labels:
|
|
verb: read
|
|
record: apiserver_request:burnrate2h
|
|
- expr: |-
|
|
(
|
|
(
|
|
# too slow
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
|
|
-
|
|
(
|
|
(
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[30m]))
|
|
or
|
|
vector(0)
|
|
)
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[30m]))
|
|
+
|
|
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[30m]))
|
|
)
|
|
)
|
|
+
|
|
# errors
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
|
)
|
|
/
|
|
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
|
labels:
|
|
verb: read
|
|
record: apiserver_request:burnrate30m
|
|
# Read (LIST|GET) burn rate over a 3d window, per cluster:
# (requests outside their scope's latency bucket [resource le=1, namespace le=5,
# cluster le=30] + 5xx responses) / all read requests.
- record: apiserver_request:burnrate3d
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
        -
        (
          (
            sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[3d]))
            or
            vector(0)
          )
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[3d]))
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[3d]))
        )
      )
      +
      # errors
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
  labels:
    verb: read
|
|
# Read (LIST|GET) burn rate over a 5m window, per cluster:
# (requests outside their scope's latency bucket [resource le=1, namespace le=5,
# cluster le=30] + 5xx responses) / all read requests.
- record: apiserver_request:burnrate5m
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
        -
        (
          (
            sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[5m]))
            or
            vector(0)
          )
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[5m]))
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[5m]))
        )
      )
      +
      # errors
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
  labels:
    verb: read
|
|
# Read (LIST|GET) burn rate over a 6h window, per cluster:
# (requests outside their scope's latency bucket [resource le=1, namespace le=5,
# cluster le=30] + 5xx responses) / all read requests.
- record: apiserver_request:burnrate6h
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
        -
        (
          (
            sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[6h]))
            or
            vector(0)
          )
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[6h]))
          +
          sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[6h]))
        )
      )
      +
      # errors
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
  labels:
    verb: read
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 1d window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate1d
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1d]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 1h window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate1h
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1h]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 2h window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate2h
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[2h]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 30m window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate30m
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[30m]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 3d window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate3d
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[3d]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 5m window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate5m
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[5m]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
  labels:
    verb: write
|
|
# Write (POST|PUT|PATCH|DELETE) burn rate over a 6h window, per cluster:
# (requests outside the le=1 latency bucket + 5xx responses) / all write requests.
- record: apiserver_request:burnrate6h
  expr: |-
    (
      (
        # too slow
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
        -
        sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[6h]))
      )
      +
      sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
    )
    /
    sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
  labels:
    verb: write
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml
# Recording rules for the 0.99 quantile of apiserver_request_sli_duration_seconds,
# grouped by cluster and resource, recorded separately for read and write verbs.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-apiserver-histogram.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kube-apiserver-histogram.rules
      rules:
        # Read verbs (LIST|GET); the trailing "> 0" drops series whose quantile is not positive.
        - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
          expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
          labels:
            quantile: '0.99'
            verb: read
        # Write verbs (POST|PUT|PATCH|DELETE); same series name, distinguished by the verb label.
        - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
          expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
          labels:
            quantile: '0.99'
            verb: write
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml
# Multi-window burn-rate alerts: each rule fires only when both a long-window and a
# short-window apiserver_request:burnrate* series exceed the same threshold.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-apiserver-slos
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kube-apiserver-slos
      rules:
        # 1h and 5m windows, threshold 14.40 * 0.01000, held for 2m.
        - alert: KubeAPIErrorBudgetBurn
          expr: |-
            sum by (cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000)
          for: 2m
          labels:
            severity: critical
            long: 1h
            short: 5m
          annotations:
            summary: The API server is burning too much error budget.
            description: The API server is burning too much error budget on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        # 6h and 30m windows, threshold 6.00 * 0.01000, held for 15m.
        - alert: KubeAPIErrorBudgetBurn
          expr: |-
            sum by (cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000)
          for: 15m
          labels:
            severity: critical
            long: 6h
            short: 30m
          annotations:
            summary: The API server is burning too much error budget.
            description: The API server is burning too much error budget on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        # 1d and 2h windows, threshold 3.00 * 0.01000, held for 1h.
        - alert: KubeAPIErrorBudgetBurn
          expr: |-
            sum by (cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000)
          for: 1h
          labels:
            severity: warning
            long: 1d
            short: 2h
          annotations:
            summary: The API server is burning too much error budget.
            description: The API server is burning too much error budget on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        # 3d and 6h windows, threshold 1.00 * 0.01000, held for 3h.
        - alert: KubeAPIErrorBudgetBurn
          expr: |-
            sum by (cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000)
          for: 3h
          labels:
            severity: warning
            long: 3d
            short: 6h
          annotations:
            summary: The API server is burning too much error budget.
            description: The API server is burning too much error budget on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml
# Recording rules that count scrape targets by their `up` value,
# aggregating away the instance, pod and node labels.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-prometheus-general.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kube-prometheus-general.rules
      rules:
        # Number of targets currently reporting up == 1.
        - record: count:up1
          expr: count without(instance, pod, node) (up == 1)
        # Number of targets currently reporting up == 0.
        - record: count:up0
          expr: count without(instance, pod, node) (up == 0)
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml
# Recording rules for node CPU and network rates; every CPU rule excludes the
# idle, iowait and steal modes.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-prometheus-node-recording.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kube-prometheus-node-recording.rules
      rules:
        # Per-instance CPU seconds consumed per second (3m rate).
        - record: instance:node_cpu:rate:sum
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
        # Per-instance received bytes per second (3m rate).
        - record: instance:node_network_receive_bytes:rate:sum
          expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
        # Per-instance transmitted bytes per second (3m rate).
        - record: instance:node_network_transmit_bytes:rate:sum
          expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
        # 5m CPU rate per instance divided by the count of distinct (instance, cpu) pairs on that instance.
        - record: instance:node_cpu:ratio
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON (instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
        # 5m CPU rate summed over all series.
        - record: cluster:node_cpu:sum_rate5m
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        # cluster:node_cpu:sum_rate5m divided by the total count of (instance, cpu) pairs.
        - record: cluster:node_cpu:ratio
          expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml
# Alerts on kube-state-metrics itself: list/watch error ratios and shard configuration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kube-state-metrics
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kube-state-metrics
      rules:
        # Share of list calls with result="error" (5m rate, per cluster) above 0.01.
        - alert: KubeStateMetricsListErrors
          expr: |-
            (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
              /
            sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
            > 0.01
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: kube-state-metrics is experiencing errors in list operations.
            description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
        # Share of watch calls with result="error" (5m rate, per cluster) above 0.01.
        - alert: KubeStateMetricsWatchErrors
          expr: |-
            (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
              /
            sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
            > 0.01
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: kube-state-metrics is experiencing errors in watch operations.
            description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
        # Non-zero variance of total_shards within a cluster means the reported values differ.
        - alert: KubeStateMetricsShardingMismatch
          expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: kube-state-metrics sharding is misconfigured.
            description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
        # Compares 2^total_shards - 1 with the sum of 2^ordinal over the distinct shard ordinals seen;
        # any difference fires the alert.
        - alert: KubeStateMetricsShardsMissing
          expr: |-
            2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
              -
            sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
            != 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: kube-state-metrics shards are missing.
            description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
|
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml
# Recording rules for kubelet PLEG relist duration quantiles (0.99, 0.9, 0.5).
# Each rule joins the per-instance bucket rates with kubelet_node_name to attach the node label.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubelet.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubelet.rules
      rules:
        # 0.99 quantile.
        - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
          expr: |-
            histogram_quantile(
              0.99,
              sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
              * on (cluster, instance) group_left (node)
              max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
            )
          labels:
            quantile: '0.99'
        # 0.9 quantile.
        - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
          expr: |-
            histogram_quantile(
              0.9,
              sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
              * on (cluster, instance) group_left (node)
              max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
            )
          labels:
            quantile: '0.9'
        # 0.5 quantile.
        - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
          expr: |-
            histogram_quantile(
              0.5,
              sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
              * on (cluster, instance) group_left (node)
              max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
            )
          labels:
            quantile: '0.5'
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: kube-prometheus-stack-kubernetes-apps
|
|
namespace: kube-prometheus-stack
|
|
labels:
|
|
app: kube-prometheus-stack
|
|
|
|
app.kubernetes.io/managed-by: Helm
|
|
app.kubernetes.io/instance: kube-prometheus-stack
|
|
app.kubernetes.io/version: "79.7.1"
|
|
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
chart: kube-prometheus-stack-79.7.1
|
|
release: "kube-prometheus-stack"
|
|
heritage: "Helm"
|
|
spec:
|
|
groups:
|
|
- name: kubernetes-apps
|
|
rules:
|
|
- alert: KubePodCrashLooping
|
|
annotations:
|
|
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{ $labels.cluster }}.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
|
summary: Pod is crash looping.
|
|
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubePodNotReady
|
|
annotations:
|
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
|
summary: Pod has been in a non-ready state for more than 15 minutes.
|
|
expr: |-
|
|
sum by (namespace, pod, cluster) (
|
|
max by (namespace, pod, cluster) (
|
|
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
|
|
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
|
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
|
)
|
|
) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDeploymentGenerationMismatch
|
|
annotations:
|
|
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
|
summary: Deployment generation mismatch due to possible roll-back
|
|
expr: |-
|
|
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDeploymentReplicasMismatch
|
|
annotations:
|
|
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
|
summary: Deployment has not matched the expected number of replicas.
|
|
expr: |-
|
|
(
|
|
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
>
|
|
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
|
) and (
|
|
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDeploymentRolloutStuck
|
|
annotations:
|
|
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
|
summary: Deployment rollout is not progressing.
|
|
expr: |-
|
|
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
|
!= 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeStatefulSetReplicasMismatch
|
|
annotations:
|
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
|
summary: StatefulSet has not matched the expected number of replicas.
|
|
expr: |-
|
|
(
|
|
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
) and (
|
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeStatefulSetGenerationMismatch
|
|
annotations:
|
|
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
|
summary: StatefulSet generation mismatch due to possible roll-back
|
|
expr: |-
|
|
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
annotations:
|
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
|
summary: StatefulSet update has not been rolled out.
|
|
expr: |-
|
|
(
|
|
max by (namespace, statefulset, job, cluster) (
|
|
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
|
unless
|
|
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
|
)
|
|
* on (namespace, statefulset, job, cluster)
|
|
(
|
|
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
|
)
|
|
) and on (namespace, statefulset, job, cluster) (
|
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDaemonSetRolloutStuck
|
|
annotations:
|
|
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
|
summary: DaemonSet rollout is stuck.
|
|
expr: |-
|
|
(
|
|
(
|
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
) or (
|
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
0
|
|
) or (
|
|
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
) or (
|
|
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
)
|
|
) and (
|
|
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeContainerWaiting
|
|
annotations:
|
|
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}") on cluster {{ $labels.cluster }}.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
|
summary: Pod container waiting longer than 1 hour
|
|
expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"} > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDaemonSetNotScheduled
|
|
annotations:
|
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled on cluster {{ $labels.cluster }}.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
|
summary: DaemonSet pods are not scheduled.
|
|
expr: |-
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
-
|
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDaemonSetMisScheduled
|
|
annotations:
|
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run on cluster {{ $labels.cluster }}.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
|
summary: DaemonSet pods are misscheduled.
|
|
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobNotCompleted
|
|
annotations:
|
|
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
|
summary: Job did not complete in time
|
|
expr: |-
|
|
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
|
and
|
|
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobFailed
|
|
annotations:
|
|
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
|
summary: Job failed to complete.
|
|
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeHpaReplicasMismatch
|
|
annotations:
|
|
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
|
summary: HPA has not matched desired number of replicas.
|
|
expr: |-
|
|
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
!=
|
|
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
and
|
|
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
>
|
|
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
and
|
|
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
<
|
|
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
and
|
|
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
# KubeHpaMaxedOut: warning when current replicas equal the HPA's max replicas
# for 15m.
- alert: KubeHpaMaxedOut
  annotations:
    description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
    summary: HPA is running at max replicas
  expr: |-
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
      ==
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
  for: 15m
  labels:
    severity: warning
# KubePdbNotEnoughHealthyPods: warning when desired_healthy minus
# current_healthy of a PodDisruptionBudget stays above 0 for 15m; the alert
# value is that shortfall.
- alert: KubePdbNotEnoughHealthyPods
  annotations:
    description: PDB {{ $labels.cluster }}/{{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods
    summary: PDB does not have enough healthy pods.
  expr: |-
    (
      kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~".*"}
      -
      kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~".*"}
    )
    > 0
  for: 15m
  labels:
    severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml
# PrometheusRule carrying the "kubernetes-resources" alert group: cluster-level
# CPU/memory overcommit, ResourceQuota usage and container CPU throttling.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-resources
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-resources
      rules:
        # Two branches joined by "or". First: summed pod CPU requests minus
        # total allocatable CPU is > 0 and fewer than 3 control-plane nodes are
        # counted. Second: requests minus (total allocatable minus the largest
        # single node's allocatable) is > 0. Must hold for 10m.
        - alert: KubeCPUOvercommit
          annotations:
            description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node failure.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
            summary: Cluster has overcommitted CPU resource requests.
          expr: |-
            # Non-HA clusters.
            (
              (
                sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
                -
                sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) > 0
              )
              and
              count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
            )
            or
            # HA clusters.
            (
              sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
              -
              (
                # Skip clusters with only one allocatable node.
                (
                  sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
                  -
                  max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
                ) > 0
              ) > 0
            )
          for: 10m
          labels:
            severity: warning
        # Same shape as KubeCPUOvercommit, evaluated on the memory request
        # recording rule and resource="memory" allocatable. Must hold for 10m.
        - alert: KubeMemoryOvercommit
          annotations:
            description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
            summary: Cluster has overcommitted memory resource requests.
          expr: |-
            # Non-HA clusters.
            (
              (
                sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
                -
                sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"}) > 0
              )
              and
              count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
            )
            or
            # HA clusters.
            (
              sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
              -
              (
                # Skip clusters with only one allocatable node.
                (
                  sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
                  -
                  max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
                ) > 0
              ) > 0
            )
          for: 10m
          labels:
            severity: warning
        # Ratio of summed hard CPU quotas (cpu / requests.cpu) to summed
        # allocatable CPU per cluster; warns above 1.5 for 5m.
        - alert: KubeCPUQuotaOvercommit
          annotations:
            description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
            summary: Cluster has overcommitted CPU resource requests.
          expr: |-
            sum by (cluster) (
              min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})
            )
            /
            sum by (cluster) (
              kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}
            ) > 1.5
          for: 5m
          labels:
            severity: warning
        # Ratio of summed hard memory quotas (memory / requests.memory) to
        # summed allocatable memory per cluster; warns above 1.5 for 5m.
        - alert: KubeMemoryQuotaOvercommit
          annotations:
            description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
            summary: Cluster has overcommitted memory resource requests.
          expr: |-
            sum by (cluster) (
              min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})
            )
            /
            sum by (cluster) (
              kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}
            ) > 1.5
          for: 5m
          labels:
            severity: warning
        # used / hard quota (hard > 0 only) strictly between 0.9 and 1 for 15m.
        - alert: KubeQuotaAlmostFull
          annotations:
            description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
            summary: Namespace quota is going to be full.
          expr: |-
            max without (instance, job, type) (
              kube_resourcequota{job="kube-state-metrics", type="used"}
            )
            / on (cluster, namespace, resource, resourcequota) group_left()
            (
              max without (instance, job, type) (
                kube_resourcequota{job="kube-state-metrics", type="hard"}
              ) > 0
            )
            > 0.9 < 1
          for: 15m
          labels:
            severity: info
        # used / hard quota (hard > 0 only) exactly equal to 1 for 15m.
        - alert: KubeQuotaFullyUsed
          annotations:
            description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
            summary: Namespace quota is fully used.
          expr: |-
            max without (instance, job, type) (
              kube_resourcequota{job="kube-state-metrics", type="used"}
            )
            / on (cluster, namespace, resource, resourcequota) group_left()
            (
              max without (instance, job, type) (
                kube_resourcequota{job="kube-state-metrics", type="hard"}
              ) > 0
            )
            == 1
          for: 15m
          labels:
            severity: info
        # used / hard quota (hard > 0 only) above 1 for 15m.
        - alert: KubeQuotaExceeded
          annotations:
            description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
            summary: Namespace quota has exceeded the limits.
          expr: |-
            max without (instance, job, type) (
              kube_resourcequota{job="kube-state-metrics", type="used"}
            )
            / on (cluster, namespace, resource, resourcequota) group_left()
            (
              max without (instance, job, type) (
                kube_resourcequota{job="kube-state-metrics", type="hard"}
              ) > 0
            )
            > 1
          for: 15m
          labels:
            severity: warning
        # 5m increase of throttled CFS periods divided by 5m increase of all
        # CFS periods per container; info when the ratio exceeds 25/100 for 15m.
        - alert: CPUThrottlingHigh
          annotations:
            description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }} on cluster {{ $labels.cluster }}.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
            summary: Processes experience elevated CPU throttling.
          expr: |-
            sum without (id, metrics_path, name, image, endpoint, job, node) (
              topk by (cluster, namespace, pod, container, instance) (1,
                increase(
                  container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }
                [5m])
              )
            )
            / on (cluster, namespace, pod, container, instance) group_left
            sum without (id, metrics_path, name, image, endpoint, job, node) (
              topk by (cluster, namespace, pod, container, instance) (1,
                increase(
                  container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }
                [5m])
              )
            )
            > ( 25 / 100 )
          for: 15m
          labels:
            severity: info
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
# PrometheusRule carrying the "kubernetes-storage" alert group: free space and
# free inodes on kubelet-reported volumes, plus PersistentVolume phase errors.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-storage
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-storage
      rules:
        # Critical: available/capacity below 0.03 with used bytes > 0 for 1m.
        # ReadOnlyMany claims and claims labelled excluded_from_alerts="true"
        # are removed via the two "unless" clauses.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
            summary: PersistentVolume is filling up.
          expr: |-
            (
              kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
        # Warning variant: available/capacity below 0.15, used bytes > 0, and a
        # 6h linear prediction of available bytes that is negative four days
        # (4 * 24 * 3600 s) ahead; same exclusions; must hold for 1h.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
            summary: PersistentVolume is filling up.
          expr: |-
            (
              kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            ) < 0.15
            and
            kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
            and
            predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1h
          labels:
            severity: warning
        # Critical: inodes_free/inodes below 0.03 with inodes_used > 0 for 1m;
        # same two exclusions as the byte-based alerts.
        - alert: KubePersistentVolumeInodesFillingUp
          annotations:
            description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
            summary: PersistentVolumeInodes are filling up.
          expr: |-
            (
              kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
                /
              kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
        # Warning variant: inodes_free/inodes below 0.15, inodes_used > 0, and
        # a 6h linear prediction of free inodes that is negative four days
        # ahead; same exclusions; must hold for 1h.
        - alert: KubePersistentVolumeInodesFillingUp
          annotations:
            description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
            summary: PersistentVolumeInodes are filling up.
          expr: |-
            (
              kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
                /
              kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            ) < 0.15
            and
            kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
            and
            predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on (cluster, namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1h
          labels:
            severity: warning
        # Critical when a PersistentVolume reports phase Failed or Pending
        # (series value > 0) for 5m.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
            summary: PersistentVolume is having issues with provisioning.
          expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
# PrometheusRule carrying the "kubernetes-system-apiserver" alert group:
# client-certificate expiry, aggregated API health, apiserver target presence
# and terminated-request ratio. All selectors use job="apiserver".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-system-apiserver
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-system-apiserver
      rules:
        # Warning: 0.01 quantile of the client-certificate expiration
        # histogram below 604800 s, and the histogram count > 0, for 5m.
        - alert: KubeClientCertificateExpiration
          annotations:
            description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
            summary: Client certificate is about to expire.
          expr: |-
            histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
            and
            on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
          for: 5m
          labels:
            severity: warning
        # Critical variant of the rule above with an 86400 s threshold.
        - alert: KubeClientCertificateExpiration
          annotations:
            description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
            summary: Client certificate is about to expire.
          expr: |-
            histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
            and
            on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
          for: 5m
          labels:
            severity: warning
        # Warning when the 1m increase of aggregator_unavailable_apiservice_total,
        # summed by cluster/instance/name/reason, stays above 0 for 10m.
        - alert: KubeAggregatedAPIErrors
          annotations:
            description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
            summary: Kubernetes aggregated API has reported errors.
          expr: sum by (cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
          for: 10m
          labels:
            severity: warning
        # Warning when (1 - 10m average of aggregator_unavailable_apiservice)
        # * 100 is below 85 for 5m.
        - alert: KubeAggregatedAPIDown
          annotations:
            description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
            summary: Kubernetes aggregated API is down.
          expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
          for: 5m
          labels:
            severity: warning
        # Critical when no up{job="apiserver"} series exists for 15m.
        - alert: KubeAPIDown
          annotations:
            description: KubeAPI has disappeared from Prometheus target discovery.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
            summary: Target disappeared from Prometheus target discovery.
          expr: absent(up{job="apiserver"})
          for: 15m
          labels:
            severity: critical
        # Warning when terminations / (requests + terminations), using 10m
        # rates per cluster, exceeds 0.20 for 5m.
        - alert: KubeAPITerminatedRequests
          annotations:
            description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
            summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
          expr: sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by (cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
          for: 5m
          labels:
            severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
# PrometheusRule carrying the "kubernetes-system-kubelet" alert group: node
# readiness/pressure/reachability, kubelet pod capacity and latency, kubelet
# certificate expiry and renewal errors, and kubelet target presence.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-system-kubelet
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-system-kubelet
      rules:
        # Warning when the Ready/status="true" condition is 0 on a node whose
        # kube_node_spec_unschedulable is 0, for 15m.
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes on cluster {{ $labels.cluster }}.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
            summary: Node is not ready.
          expr: |-
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
            and on (cluster, node)
            kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
          for: 15m
          labels:
            severity: warning
        # Info when MemoryPressure, DiskPressure or PIDPressure is true (== 1)
        # on a node whose kube_node_spec_unschedulable is 0, for 10m.
        - alert: KubeNodePressure
          annotations:
            description: '{{ $labels.node }} on cluster {{ $labels.cluster }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure
            # Wording fixed: the rendered text read "has as active Condition".
            summary: Node has an active Condition.
          expr: |-
            kube_node_status_condition{job="kube-state-metrics",condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1
            and on (cluster, node)
            kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
          for: 10m
          labels:
            severity: info
        # Warning when a node carries the node.kubernetes.io/unreachable
        # NoSchedule taint for 15m, unless it also carries one of the listed
        # autoscaler / termination-handler taints.
        - alert: KubeNodeUnreachable
          annotations:
            description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled on cluster {{ $labels.cluster }}.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
            summary: Node is unreachable.
          expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
          for: 15m
          labels:
            severity: warning
        # Info when running pods (only instances with more than 1) divided by
        # the node's pod capacity (capacity != 1) exceeds 0.95 for 15m. The
        # join with kubelet_node_name attaches the node label to the
        # per-instance kubelet series.
        - alert: KubeletTooManyPods
          annotations:
            description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
            summary: Kubelet is running at capacity.
          expr: |-
            (
              max by (cluster, instance) (
                kubelet_running_pods{job="kubelet", metrics_path="/metrics"} > 1
              )
              * on (cluster, instance) group_left(node)
              max by (cluster, instance, node) (
                kubelet_node_name{job="kubelet", metrics_path="/metrics"}
              )
            )
            / on (cluster, node) group_left()
            max by (cluster, node) (
              kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1
            ) > 0.95
          for: 15m
          labels:
            severity: info
        # Warning when the Ready condition changed more than 2 times within
        # 15m on a node whose kube_node_spec_unschedulable is 0, for 15m.
        - alert: KubeNodeReadinessFlapping
          annotations:
            description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
            summary: Node readiness status is flapping.
          expr: |-
            sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
            and on (cluster, node)
            kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
          for: 15m
          labels:
            severity: warning
        # Info as soon as (for: 0s) the 15m eviction rate, joined with
        # kubelet_node_name to obtain the node label, is above 0.
        - alert: KubeNodeEviction
          annotations:
            description: Node {{ $labels.node }} on {{ $labels.cluster }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeeviction
            summary: Node is evicting pods.
          expr: |-
            sum(rate(kubelet_evictions{job="kubelet", metrics_path="/metrics"}[15m])) by (cluster, eviction_signal, instance)
            * on (cluster, instance) group_left(node)
            max by (cluster, instance, node) (
              kubelet_node_name{job="kubelet", metrics_path="/metrics"}
            )
            > 0
          for: 0s
          labels:
            severity: info
        # Warning when the recorded 0.99-quantile PLEG relist duration is at
        # least 10 for 5m.
        - alert: KubeletPlegDurationHigh
          annotations:
            description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
            summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
          expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
          for: 5m
          labels:
            severity: warning
        # Warning when the 0.99 quantile of the pod-worker duration histogram
        # (5m rate), joined with kubelet_node_name, exceeds 60 for 15m.
        - alert: KubeletPodStartUpLatencyHigh
          annotations:
            description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
            summary: Kubelet Pod startup latency is too high.
          expr: |-
            histogram_quantile(0.99,
              sum by (cluster, instance, le) (
                topk by (cluster, instance, le, operation_type) (1,
                  rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])
                )
              )
            )
            * on (cluster, instance) group_left(node)
            topk by (cluster, instance, node) (1,
              kubelet_node_name{job="kubelet", metrics_path="/metrics"}
            )
            > 60
          for: 15m
          labels:
            severity: warning
        # Warning when the kubelet client-certificate TTL is below 604800 s.
        # No "for" clause is set on this rule.
        - alert: KubeletClientCertificateExpiration
          annotations:
            description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
            summary: Kubelet client certificate is about to expire.
          expr: kubelet_certificate_manager_client_ttl_seconds < 604800
          labels:
            severity: warning
        # Critical variant: client-certificate TTL below 86400 s.
        - alert: KubeletClientCertificateExpiration
          annotations:
            description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
            summary: Kubelet client certificate is about to expire.
          expr: kubelet_certificate_manager_client_ttl_seconds < 86400
          labels:
            severity: critical
        # Warning when the kubelet server-certificate TTL is below 604800 s.
        # No "for" clause is set on this rule.
        - alert: KubeletServerCertificateExpiration
          annotations:
            description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
            summary: Kubelet server certificate is about to expire.
          expr: kubelet_certificate_manager_server_ttl_seconds < 604800
          labels:
            severity: warning
        # Critical variant: server-certificate TTL below 86400 s.
        - alert: KubeletServerCertificateExpiration
          annotations:
            description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
            summary: Kubelet server certificate is about to expire.
          expr: kubelet_certificate_manager_server_ttl_seconds < 86400
          labels:
            severity: critical
        # Warning when the 5m increase of client-certificate renewal errors
        # stays above 0 for 15m.
        - alert: KubeletClientCertificateRenewalErrors
          annotations:
            description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
            summary: Kubelet has failed to renew its client certificate.
          expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
          for: 15m
          labels:
            severity: warning
        # Warning when the 5m increase of server-certificate renewal errors
        # stays above 0 for 15m.
        - alert: KubeletServerCertificateRenewalErrors
          annotations:
            description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
            summary: Kubelet has failed to renew its server certificate.
          expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
          for: 15m
          labels:
            severity: warning
        # Critical when no up{job="kubelet", metrics_path="/metrics"} series
        # exists for 15m.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
            summary: Target disappeared from Prometheus target discovery.
          expr: absent(up{job="kubelet", metrics_path="/metrics"})
          for: 15m
          labels:
            severity: critical
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
# PrometheusRule carrying the "kubernetes-system" alert group: component
# version skew and API client error ratio.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-kubernetes-system
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-system
      rules:
        # Warning when, per cluster, more than one distinct git_version
        # (rewritten by label_replace to the part captured by the regex) is
        # seen for 15m; jobs kube-dns and coredns are excluded.
        - alert: KubeVersionMismatch
          annotations:
            description: There are {{ $value }} different semantic versions of Kubernetes components running on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
            summary: Different semantic versions of Kubernetes components running.
          expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
          for: 15m
          labels:
            severity: warning
        # Warning when the share of 5xx responses among REST client requests
        # (5m rates, job="apiserver") exceeds 0.01 for 15m.
        - alert: KubeClientErrors
          annotations:
            description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors on cluster {{ $labels.cluster }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
            summary: Kubernetes API server client is experiencing errors.
          expr: |-
            (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
              /
            sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
            > 0.01
          for: 15m
          labels:
            severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml
# PrometheusRule carrying the "node-exporter.rules" group. It contains only
# recording rules, each precomputing a per-instance (or per-device) series from
# job="node-exporter" metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-node-exporter.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
    - name: node-exporter.rules
      rules:
        # Count of mode="idle" CPU series per instance.
        - expr: |-
            count without (cpu, mode) (
              node_cpu_seconds_total{job="node-exporter",mode="idle"}
            )
          record: instance:node_num_cpu:sum
        # 1 minus the per-CPU average of the summed idle/iowait/steal 5m rates.
        - expr: |-
            1 - avg without (cpu) (
              sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
            )
          record: instance:node_cpu_utilisation:rate5m
        # node_load1 divided by the recorded CPU count.
        - expr: |-
            (
              node_load1{job="node-exporter"}
            /
              instance:node_num_cpu:sum{job="node-exporter"}
            )
          record: instance:node_load1_per_cpu:ratio
        # 1 minus available/total memory; when MemAvailable is missing, the sum
        # Buffers + Cached + MemFree + Slab is used in its place.
        - expr: |-
            1 - (
              (
                node_memory_MemAvailable_bytes{job="node-exporter"}
                or
                (
                  node_memory_Buffers_bytes{job="node-exporter"}
                  +
                  node_memory_Cached_bytes{job="node-exporter"}
                  +
                  node_memory_MemFree_bytes{job="node-exporter"}
                  +
                  node_memory_Slab_bytes{job="node-exporter"}
                )
              )
            /
              node_memory_MemTotal_bytes{job="node-exporter"}
            )
          record: instance:node_memory_utilisation:ratio
        # 5m rate of major page faults.
        - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
          record: instance:node_vmstat_pgmajfault:rate5m
        # 5m rate of disk I/O time for devices matching the device regex.
        - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
          record: instance_device:node_disk_io_time_seconds:rate5m
        # 5m rate of weighted disk I/O time for the same device set.
        - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
          record: instance_device:node_disk_io_time_weighted_seconds:rate5m
        # 5m receive byte rate summed over all devices except "lo".
        - expr: |-
            sum without (device) (
              rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_receive_bytes_excluding_lo:rate5m
        # 5m transmit byte rate summed over all devices except "lo".
        - expr: |-
            sum without (device) (
              rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_transmit_bytes_excluding_lo:rate5m
        # 5m receive drop rate summed over all devices except "lo".
        - expr: |-
            sum without (device) (
              rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_receive_drop_excluding_lo:rate5m
        # 5m transmit drop rate summed over all devices except "lo".
        - expr: |-
            sum without (device) (
              rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_transmit_drop_excluding_lo:rate5m
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml
# Alerting rules evaluated against node-exporter metrics: filesystem space and
# inodes, network errors, conntrack, textfile collector, clock, RAID, file
# descriptors, CPU, load, memory, disk I/O, systemd units and bonding.
# Several alerts appear twice under the same name with different thresholds
# and severities (warning vs. critical).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-node-exporter
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: node-exporter
    rules:
    # --- Filesystem space -------------------------------------------------
    # "FillingUp" alerts combine a low-space threshold with a predict_linear
    # extrapolation over 6h of samples; read-only filesystems are excluded.
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 4 hours.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 5% space left.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 30m
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 3% space left.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 30m
      labels:
        severity: critical
    # --- Filesystem inodes ------------------------------------------------
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 5% inodes left.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 3% inodes left.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: 1h
      labels:
        severity: critical
    # --- Network ----------------------------------------------------------
    # $value here is the errors/packets ratio from expr (threshold 0.01), so
    # it is rendered as a percentage rather than as a rounded count.
    - alert: NodeNetworkReceiveErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ $value | humanizePercentage }} receive errors in the last two minutes.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
        summary: Network interface is reporting many receive errors.
      expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    - alert: NodeNetworkTransmitErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ $value | humanizePercentage }} transmit errors in the last two minutes.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
        summary: Network interface is reporting many transmit errors.
      expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
      for: 1h
      labels:
        severity: warning
    # No `for:` on the next two rules: they fire as soon as expr matches.
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
        description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
        summary: Number of conntrack are getting close to the limit.
      expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: warning
    - alert: NodeTextFileCollectorScrapeError
      annotations:
        description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
        summary: Node Exporter text file collector failed to scrape.
      expr: node_textfile_scrape_error{job="node-exporter"} == 1
      labels:
        severity: warning
    # --- Clock ------------------------------------------------------------
    # Fires when the offset is beyond +/-0.05s and its 5m derivative shows it
    # is not moving back towards zero.
    - alert: NodeClockSkewDetected
      annotations:
        description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
        summary: Clock skew detected.
      expr: |-
        (
          node_timex_offset_seconds{job="node-exporter"} > 0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
        )
        or
        (
          node_timex_offset_seconds{job="node-exporter"} < -0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
        )
      for: 10m
      labels:
        severity: warning
    - alert: NodeClockNotSynchronising
      annotations:
        description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
        summary: Clock not synchronising.
      expr: |-
        min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
        and
        node_timex_maxerror_seconds{job="node-exporter"} >= 16
      for: 10m
      labels:
        severity: warning
    # --- RAID -------------------------------------------------------------
    - alert: NodeRAIDDegraded
      annotations:
        description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
        summary: RAID Array is degraded.
      expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
      for: 15m
      labels:
        severity: critical
    - alert: NodeRAIDDiskFailure
      annotations:
        description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
        summary: Failed device in RAID array.
      expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
      labels:
        severity: warning
    # --- File descriptors -------------------------------------------------
    - alert: NodeFileDescriptorLimit
      annotations:
        description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust file descriptors limit soon.
      expr: |-
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
        )
      for: 15m
      labels:
        severity: warning
    - alert: NodeFileDescriptorLimit
      annotations:
        description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust file descriptors limit soon.
      expr: |-
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
        )
      for: 15m
      labels:
        severity: critical
    # --- CPU, load, memory, disk I/O --------------------------------------
    # Descriptions below use `|` literals so each keeps its trailing newline.
    - alert: NodeCPUHighUsage
      annotations:
        description: |
          CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
        summary: High CPU usage.
      expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90
      for: 15m
      labels:
        severity: info
    - alert: NodeSystemSaturation
      annotations:
        description: |
          System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
          This might indicate this instance resources saturation and can cause it becoming unresponsive.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
        summary: System saturated, load per core is very high.
      expr: |-
        node_load1{job="node-exporter"}
        / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
      for: 15m
      labels:
        severity: warning
    - alert: NodeMemoryMajorPagesFaults
      annotations:
        description: |
          Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
          Please check that there is enough memory available at this instance.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
        summary: Memory major page faults are occurring at very high rate.
      expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
      for: 15m
      labels:
        severity: warning
    - alert: NodeMemoryHighUtilization
      annotations:
        description: |
          Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
        summary: Host is running out of memory.
      expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
      for: 15m
      labels:
        severity: warning
    - alert: NodeDiskIOSaturation
      annotations:
        description: |
          Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
          This symptom might indicate disk saturation.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
        summary: Disk IO queue is high.
      expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
      for: 30m
      labels:
        severity: warning
    # --- systemd and bonding ----------------------------------------------
    - alert: NodeSystemdServiceFailed
      annotations:
        description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
        summary: Systemd service has entered failed state.
      expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
      for: 5m
      labels:
        severity: warning
    - alert: NodeSystemdServiceCrashlooping
      annotations:
        description: Systemd service {{ $labels.name }} has been restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicecrashlooping
        summary: Systemd service keeps restarting, possibly crash looping.
      expr: increase(node_systemd_service_restart_total{job="node-exporter"}[5m]) > 2
      for: 15m
      labels:
        severity: warning
    - alert: NodeBondingDegraded
      annotations:
        description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
        summary: Bonding interface is degraded.
      expr: (node_bonding_slaves{job="node-exporter"} - node_bonding_active{job="node-exporter"}) != 0
      for: 5m
      labels:
        severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml
# Single alerting rule on node_network_up state changes; interfaces whose
# device name matches "veth.+" are excluded by the selector.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-node-network
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: node-network
    rules:
    # More than 2 up/down changes within a 2m window, sustained for 2m.
    - alert: NodeNetworkInterfaceFlapping
      annotations:
        description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
        summary: Network interface is often changing its status
      expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
      for: 2m
      labels:
        severity: warning
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
# Recording rules that map pods to nodes via kube_pod_info and aggregate
# node-exporter CPU and memory metrics per node and per cluster.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-node.rules
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: node.rules
    rules:
    # One series per (cluster, namespace, pod) carrying the pod's node label.
    - expr: |-
        topk by (cluster, namespace, pod) (1,
          max by (cluster, node, namespace, pod) (
            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
        ))
      record: 'node_namespace_pod:kube_pod_info:'
    # CPU count per node: idle-mode CPU series joined to the pod-to-node map.
    - expr: |-
        count by (cluster, node) (
          node_cpu_seconds_total{mode="idle",job="node-exporter"}
          * on (cluster, namespace, pod) group_left(node)
          topk by (cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
        )
      record: node:node_num_cpu:sum
    # Cluster-wide available memory; falls back to Buffers + Cached + MemFree
    # + Slab when MemAvailable is not present (the `or` branch).
    # The record name starts with ':' and is quoted so it is unambiguously a
    # string.
    - expr: |-
        sum(
          node_memory_MemAvailable_bytes{job="node-exporter"} or
          (
            node_memory_Buffers_bytes{job="node-exporter"} +
            node_memory_Cached_bytes{job="node-exporter"} +
            node_memory_MemFree_bytes{job="node-exporter"} +
            node_memory_Slab_bytes{job="node-exporter"}
          )
        ) by (cluster)
      record: ':node_memory_MemAvailable_bytes:sum'
    # Per-node CPU utilisation from every mode except idle, iowait and steal.
    - expr: |-
        avg by (cluster, node) (
          sum without (mode) (
            rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
          )
        )
      record: node:node_cpu_utilization:ratio_rate5m
    # Cluster average of the per-node utilisation recorded above.
    - expr: |-
        avg by (cluster) (
          node:node_cpu_utilization:ratio_rate5m
        )
      record: cluster:node_cpu:ratio_rate5m
---
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml
# Alerting rules on the operator's own metrics. Every selector is pinned to
# job="kube-prometheus-stack-operator" in namespace "kube-prometheus-stack".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-prometheus-stack-prometheus-operator
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  groups:
  - name: prometheus-operator
    rules:
    # List/watch failure ratio above 40% per (cluster, controller, namespace).
    - alert: PrometheusOperatorListErrors
      annotations:
        description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
        summary: Errors while performing list operations in controller.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[10m]))) > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorWatchErrors
      annotations:
        description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
        summary: Errors while performing watch operations in controller.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]))) > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorSyncFailed
      annotations:
        description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
        summary: Last controller reconciliation failed
      expr: min_over_time(prometheus_operator_syncs{status="failed",job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    # Reconcile / status-update error ratio above 10%.
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
        summary: Errors while reconciling objects.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]))) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorStatusUpdateErrors
      annotations:
        description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors
        summary: Errors while updating objects status.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]))) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
        summary: Errors while reconciling Prometheus.
      expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    # max_over_time == 0 means the ready gauge never read non-zero in the 5m
    # window.
    - alert: PrometheusOperatorNotReady
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
        summary: Prometheus operator not ready
      expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]) == 0)
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusOperatorRejectedResources
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
        summary: Resources rejected by Prometheus operator
      expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="kube-prometheus-stack-operator",namespace="kube-prometheus-stack"}[5m]) > 0
      for: 5m
      labels:
        severity: warning
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: kube-prometheus-stack-prometheus
|
|
namespace: kube-prometheus-stack
|
|
labels:
|
|
app: kube-prometheus-stack
|
|
|
|
app.kubernetes.io/managed-by: Helm
|
|
app.kubernetes.io/instance: kube-prometheus-stack
|
|
app.kubernetes.io/version: "79.7.1"
|
|
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
chart: kube-prometheus-stack-79.7.1
|
|
release: "kube-prometheus-stack"
|
|
heritage: "Helm"
|
|
spec:
|
|
groups:
|
|
- name: prometheus
|
|
rules:
|
|
# Critical when the last-reload-successful gauge never read non-zero within
# the 5m window, sustained for 10m. The two comment lines inside expr belong
# to the PromQL text and are kept as-is.
- alert: PrometheusBadConfig
  annotations:
    description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
    summary: Failed Prometheus configuration reload.
  expr: |-
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) == 0
  for: 10m
  labels:
    severity: critical
# Warns when the SD refresh failure counter increased over a 10m window and
# the condition holds for 20m.
- alert: PrometheusSDRefreshFailure
  annotations:
    description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure
    summary: Failed Prometheus SD refresh.
  expr: increase(prometheus_sd_refresh_failures_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[10m]) > 0
  for: 20m
  labels:
    severity: warning
# Warns when the Kubernetes SD failure counter increased over a 5m window and
# the condition holds for 15m.
- alert: PrometheusKubernetesListWatchFailures
  annotations:
    description: Kubernetes service discovery of Prometheus {{$labels.namespace}}/{{$labels.pod}} is experiencing {{ printf "%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuskuberneteslistwatchfailures
    summary: Requests in Kubernetes SD are failing.
  expr: increase(prometheus_sd_kubernetes_failures_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
  for: 15m
  labels:
    severity: warning
# Warns when the notification queue length, extrapolated 30 minutes ahead
# (60 * 30 seconds) from 5m of samples, exceeds the minimum queue capacity
# seen in the same window. The comment lines inside expr belong to the PromQL
# text and are kept as-is.
- alert: PrometheusNotificationQueueRunningFull
  annotations:
    description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
    summary: Prometheus alert notification queue predicted to run full in less than 30m.
  expr: |-
    # Without min_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    (
      predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m], 60 * 30)
    >
      min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
    )
  for: 15m
  labels:
    severity: warning
# Warns when notification errors exceed 1% of notifications sent. The ratio is
# multiplied by 100 in expr, so $value is already a percentage.
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
  annotations:
    description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} were affected by errors.'
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
    summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.
  expr: |-
    (
      rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
    /
      rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
    )
    * 100
    > 1
  for: 15m
  labels:
    severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
|
|
summary: Prometheus is not connected to any Alertmanagers.
|
|
expr: |-
|
|
# Without max_over_time, failed scrapes could create false negatives, see
|
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) < 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBReloadsFailing
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
|
|
summary: Prometheus has issues reloading blocks from disk.
|
|
expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[3h]) > 0
|
|
for: 4h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBCompactionsFailing
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
|
|
summary: Prometheus has issues compacting blocks.
|
|
expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[3h]) > 0
|
|
for: 4h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusNotIngestingSamples
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
|
|
summary: Prometheus is not ingesting samples.
|
|
expr: |-
|
|
(
|
|
sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])) <= 0
|
|
and
|
|
(
|
|
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}) > 0
|
|
or
|
|
sum without(rule_group) (prometheus_rule_group_rules{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}) > 0
|
|
)
|
|
)
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusDuplicateTimestamps
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
|
|
summary: Prometheus is dropping samples with duplicate timestamps.
|
|
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusOutOfOrderTimestamps
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
|
|
summary: Prometheus drops samples with out-of-order timestamps.
|
|
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusRemoteStorageFailures
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
|
|
summary: Prometheus fails to send samples to remote storage.
|
|
expr: |-
|
|
(
|
|
(rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]))
|
|
/
|
|
(
|
|
(rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]))
|
|
+
|
|
(rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) or rate(prometheus_remote_storage_samples_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]))
|
|
)
|
|
)
|
|
* 100
|
|
> 1
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusRemoteWriteBehind
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
|
|
summary: Prometheus remote write is behind.
|
|
expr: |-
|
|
# Without max_over_time, failed scrapes could create false negatives, see
|
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
(
|
|
max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
|
|
-
|
|
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
|
|
)
|
|
> 120
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusRemoteWriteDesiredShards
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}` $labels.instance | query | first | value }}.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
|
|
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
|
expr: |-
|
|
# Without max_over_time, failed scrapes could create false negatives, see
|
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
(
|
|
max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
|
|
>
|
|
max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m])
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusRuleFailures
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
|
|
summary: Prometheus is failing rule evaluations.
|
|
expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusMissingRuleEvaluations
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
|
|
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
|
expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTargetLimitHit
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
|
|
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
|
|
expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusLabelLimitHit
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
|
|
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
|
|
expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusScrapeBodySizeLimitHit
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
|
|
summary: Prometheus has dropped some targets that exceeded body size limit.
|
|
expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusScrapeSampleLimitHit
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
|
|
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
|
|
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTargetSyncFailure
|
|
annotations:
|
|
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
|
|
summary: Prometheus has failed to sync targets.
|
|
expr: increase(prometheus_target_sync_failed_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[30m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusHighQueryLoad
|
|
annotations:
|
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
|
|
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
|
|
expr: avg_over_time(prometheus_engine_queries{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack"}[5m]) > 0.8
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
|
annotations:
|
|
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
|
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
|
|
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
|
expr: |-
|
|
min without (alertmanager) (
|
|
rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack",alertmanager!~``}[5m])
|
|
/
|
|
rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="kube-prometheus-stack",alertmanager!~``}[5m])
|
|
)
|
|
* 100
|
|
> 3
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
---
|
|
# Source: kube-prometheus-stack/templates/redis-replication.yaml
# RedisReplication custom resource (redis.redis.opstreelabs.in/v1beta2):
# a 3-member Redis replication set with a 1Gi ceph-block volume per member
# and the redis-exporter sidecar enabled.
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisReplication
metadata:
  name: redis-replication-kube-prometheus-stack
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: redis-replication-kube-prometheus-stack
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  clusterSize: 3
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000
  kubernetesConfig:
    image: quay.io/opstree/redis:v8.0.3
    imagePullPolicy: IfNotPresent
    # Only requests are set; no limits are declared for the Redis container.
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
  storage:
    volumeClaimTemplate:
      spec:
        storageClassName: ceph-block
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi
  redisExporter:
    enabled: true
    image: quay.io/opstree/redis-exporter:v1.48.0
|
|
---
|
|
# Source: kube-prometheus-stack/templates/scrape-config.yaml
# ScrapeConfig with static targets: scrapes /metrics over plain HTTP from two
# hosts on port 9100 and labels the series job="external-nodes".
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: external-nodes-http
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: external-nodes-http
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  staticConfigs:
    - labels:
        job: external-nodes
      targets:
        - ps08rp.alexlebens.net:9100
        - ps09rp.alexlebens.net:9100
  metricsPath: /metrics
  scheme: HTTP
|
|
---
|
|
# Source: kube-prometheus-stack/templates/scrape-config.yaml
# ScrapeConfig with one static target scraped at /metrics over HTTPS; series
# share the job="external-nodes" label with the external-nodes-http config.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: external-nodes-https
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: external-nodes-https
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  staticConfigs:
    - labels:
        job: external-nodes
      targets:
        # NOTE(review): no explicit port here, unlike the :9100 targets of
        # external-nodes-http; presumably the HTTPS default port is intended.
        # Confirm.
        - node-exporter-ps10rp.boreal-beaufort.ts.net
  metricsPath: /metrics
  scheme: HTTPS
|
|
---
|
|
# Source: kube-prometheus-stack/templates/scrape-config.yaml
# ScrapeConfig with one static target on port 9926, scraped at /metrics over
# plain HTTP and labelled job="airgradient".
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: airgradient-http
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: airgradient-http
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  staticConfigs:
    - labels:
        job: airgradient
      targets:
        - it01ag.alexlebens.net:9926
  metricsPath: /metrics
  scheme: HTTP
|
|
---
|
|
# Source: kube-prometheus-stack/templates/scrape-config.yaml
# ScrapeConfig with one static target on port 3903, scraped every minute at
# /metrics over HTTPS and labelled job="garage". Requests carry a Bearer
# token read from key "token" of the Secret "garage-metric-secret".
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: garage-https
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: garage-https
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
spec:
  staticConfigs:
    - labels:
        job: garage
      targets:
        - garage-ps10rp.boreal-beaufort.ts.net:3903
  metricsPath: /metrics
  scrapeInterval: 1m
  scheme: HTTPS
  authorization:
    type: Bearer
    credentials:
      key: token
      name: garage-metric-secret
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/servicemonitor.yaml
# ServiceMonitor for kube-state-metrics: selects Services by the name and
# instance labels below and scrapes their "http" port with honorLabels on.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-kube-state-metrics
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: kube-state-metrics-6.4.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: kube-state-metrics
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "2.17.0"
    release: kube-prometheus-stack
spec:
  # The job label value is taken from the Service's app.kubernetes.io/name label.
  jobLabel: app.kubernetes.io/name
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
      app.kubernetes.io/instance: kube-prometheus-stack
  endpoints:
    - port: http
      honorLabels: true
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/servicemonitor.yaml
# ServiceMonitor for node-exporter: selects Services by the name and instance
# labels below and scrapes their "http-metrics" port over plain HTTP.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-prometheus-node-exporter
  namespace: kube-prometheus-stack
  labels:
    helm.sh/chart: prometheus-node-exporter-4.49.1
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "1.10.2"
    release: kube-prometheus-stack
spec:
  # The job label value is read from a Service label literally named "jobLabel".
  jobLabel: jobLabel
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-node-exporter
      app.kubernetes.io/instance: kube-prometheus-stack
  attachMetadata:
    node: false
  endpoints:
    - port: http-metrics
      scheme: http
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/alertmanager/servicemonitor.yaml
# ServiceMonitor for Alertmanager: selects the self-monitor Service in the
# kube-prometheus-stack namespace and scrapes /metrics on both the
# "http-web" and "reloader-web" ports.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-alertmanager
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  selector:
    matchLabels:
      app: kube-prometheus-stack-alertmanager
      release: "kube-prometheus-stack"
      self-monitor: "true"
  namespaceSelector:
    matchNames:
      - "kube-prometheus-stack"
  endpoints:
    - port: http-web
      enableHttp2: true
      path: "/metrics"
    - port: reloader-web
      path: "/metrics"
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/core-dns/servicemonitor.yaml
# ServiceMonitor for CoreDNS: selects the labelled Service in kube-system and
# scrapes its "http-metrics" port using the mounted service-account token.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-coredns
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-coredns
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  jobLabel: jobLabel
  selector:
    matchLabels:
      app: kube-prometheus-stack-coredns
      release: "kube-prometheus-stack"
  namespaceSelector:
    matchNames:
      - "kube-system"
  endpoints:
    - port: http-metrics
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml
# ServiceMonitor for the Kubernetes API server: selects the Service labelled
# component=apiserver, provider=kubernetes in the default namespace and
# scrapes its "https" port with the mounted service-account token.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-apiserver
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-apiserver
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  endpoints:
    - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      port: https
      scheme: https
      metricRelabelings:
        # Drops the listed "le" buckets of the four *_duration_seconds_bucket
        # histograms; __name__ and le are joined with ";" before matching.
        - action: drop
          regex: (etcd_request|apiserver_request_slo|apiserver_request_sli|apiserver_request)_duration_seconds_bucket;(0\.15|0\.2|0\.3|0\.35|0\.4|0\.45|0\.6|0\.7|0\.8|0\.9|1\.25|1\.5|1\.75|2|3|3\.5|4|4\.5|6|7|8|9|15|20|40|45|50)(\.0)?
          sourceLabels:
            - __name__
            - le
      tlsConfig:
        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        serverName: kubernetes
        # NOTE(review): insecureSkipVerify: true disables server certificate
        # verification, so caFile and serverName above are not enforced.
        # Confirm this is intentional.
        insecureSkipVerify: true
  jobLabel: component
  namespaceSelector:
    matchNames:
      - default
  selector:
    matchLabels:
      component: apiserver
      provider: kubernetes
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/kube-etcd/servicemonitor.yaml
# ServiceMonitor for etcd: selects the labelled Service in kube-system and
# scrapes its "http-metrics" port using the mounted service-account token.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-kube-etcd
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-kube-etcd
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  jobLabel: jobLabel
  selector:
    matchLabels:
      app: kube-prometheus-stack-kube-etcd
      release: "kube-prometheus-stack"
  namespaceSelector:
    matchNames:
      - "kube-system"
  endpoints:
    - port: http-metrics
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      metricRelabelings:
        # Removes the "pod" label from every scraped series.
        - action: labeldrop
          regex: pod
      relabelings:
        # Copies the discovered pod's node name into a "nodename" target label.
        - action: replace
          regex: ^(.*)$
          replacement: $1
          separator: ;
          sourceLabels:
            - __meta_kubernetes_pod_node_name
          targetLabel: nodename
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml
# ServiceMonitor for the kubelet: selects the kubelet Service in kube-system
# and scrapes its "https-metrics" port three times, once per path: the
# default metrics path, /metrics/cadvisor (every 10s) and /metrics/probes.
# Each endpoint copies __metrics_path__ into a "metrics_path" label.
#
# NOTE(review): every endpoint sets insecureSkipVerify: true, which disables
# server certificate verification, so the caFile beside it is not enforced.
# Confirm this is intentional.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-kubelet
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-kubelet
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  attachMetadata:
    node: false
  jobLabel: k8s-app
  namespaceSelector:
    matchNames:
      - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
      k8s-app: kubelet
  endpoints:
    # Endpoint 1: kubelet metrics on the default path.
    - port: https-metrics
      scheme: https
      tlsConfig:
        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecureSkipVerify: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      honorLabels: true
      honorTimestamps: true
      metricRelabelings:
        # NOTE(review): the dots in "0.25" and "2.5" are unescaped, so each
        # matches any single character rather than only a literal ".".
        - action: drop
          regex: (csi_operations|storage_operation_duration)_seconds_bucket;(0.25|2.5|15|25|120|600)(\.0)?
          sourceLabels:
            - __name__
            - le
      relabelings:
        - action: replace
          sourceLabels:
            - __metrics_path__
          targetLabel: metrics_path
    # Endpoint 2: cAdvisor container metrics, scraped every 10s.
    - port: https-metrics
      scheme: https
      path: /metrics/cadvisor
      interval: 10s
      honorLabels: true
      honorTimestamps: true
      trackTimestampsStaleness: true
      tlsConfig:
        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecureSkipVerify: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      metricRelabelings:
        - action: drop
          regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
          sourceLabels:
            - __name__
        - action: drop
          regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
          sourceLabels:
            - __name__
        - action: drop
          regex: container_memory_(mapped_file|swap)
          sourceLabels:
            - __name__
        - action: drop
          regex: container_(file_descriptors|tasks_state|threads_max)
          sourceLabels:
            - __name__
        - action: drop
          regex: container_memory_failures_total;hierarchy
          sourceLabels:
            - __name__
            - scope
        - action: drop
          regex: container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*
          sourceLabels:
            - __name__
            - interface
        - action: drop
          regex: container_spec.*
          sourceLabels:
            - __name__
        # Matches series with a non-empty "id" and an empty "pod" label
        # (the two values are joined with ";").
        - action: drop
          regex: .+;
          sourceLabels:
            - id
            - pod
      relabelings:
        - action: replace
          sourceLabels:
            - __metrics_path__
          targetLabel: metrics_path
    # Endpoint 3: probe metrics.
    - port: https-metrics
      scheme: https
      path: /metrics/probes
      honorLabels: true
      honorTimestamps: true
      tlsConfig:
        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecureSkipVerify: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabelings:
        - action: replace
          sourceLabels:
            - __metrics_path__
          targetLabel: metrics_path
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/servicemonitor.yaml
# ServiceMonitor for the Prometheus operator: selects the operator Service in
# the kube-prometheus-stack namespace and scrapes its "https" port. The
# server certificate is checked against the CA stored under key "ca" of the
# Secret "kube-prometheus-stack-admission".
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-operator
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app: kube-prometheus-stack-operator
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator
spec:
  endpoints:
    - port: https
      scheme: https
      tlsConfig:
        serverName: kube-prometheus-stack-operator
        ca:
          secret:
            name: kube-prometheus-stack-admission
            key: ca
            optional: false
      honorLabels: true
  selector:
    matchLabels:
      app: kube-prometheus-stack-operator
      release: "kube-prometheus-stack"
  namespaceSelector:
    matchNames:
      - "kube-prometheus-stack"
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus/servicemonitor.yaml
# ServiceMonitor for Prometheus itself: selects the self-monitor Service in
# the kube-prometheus-stack namespace and scrapes /metrics on both the
# "http-web" and "reloader-web" ports.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-prometheus-stack-prometheus
  namespace: kube-prometheus-stack
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
spec:
  selector:
    matchLabels:
      app: kube-prometheus-stack-prometheus
      release: "kube-prometheus-stack"
      self-monitor: "true"
  namespaceSelector:
    matchNames:
      - "kube-prometheus-stack"
  endpoints:
    - port: http-web
      path: "/metrics"
    - port: reloader-web
      path: "/metrics"
|
|
---
|
|
# Source: kube-prometheus-stack/templates/service-monitor.yaml
# ServiceMonitor for the Redis exporter: selects Services labelled
# redis_setup_type=replication and scrapes their "redis-exporter" port every
# 30s with a 10s timeout. No namespaceSelector is set.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: redis-replication-kube-prometheus-stack
  namespace: kube-prometheus-stack
  labels:
    app.kubernetes.io/name: redis-replication-kube-prometheus-stack
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
    redis-operator: "true"
    env: production
spec:
  selector:
    matchLabels:
      redis_setup_type: replication
  endpoints:
    - port: redis-exporter
      interval: 30s
      scrapeTimeout: 10s
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml
# ValidatingWebhookConfiguration that sends CREATE/UPDATE requests for
# PrometheusRule and AlertmanagerConfig objects to the operator Service for
# validation. Both webhooks use failurePolicy: Ignore, so a webhook error
# does not block the request.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
  name: kube-prometheus-stack-admission
  annotations:
    argocd.argoproj.io/hook: PreSync
  labels:
    app: kube-prometheus-stack-admission
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator-webhook
webhooks:
  # Validates PrometheusRule objects at any API version.
  - name: prometheusrulevalidate.monitoring.coreos.com
    failurePolicy: Ignore
    rules:
      - apiGroups:
          - monitoring.coreos.com
        apiVersions:
          - "*"
        resources:
          - prometheusrules
        operations:
          - CREATE
          - UPDATE
    clientConfig:
      service:
        namespace: kube-prometheus-stack
        name: kube-prometheus-stack-operator
        path: /admission-prometheusrules/validate
    timeoutSeconds: 10
    admissionReviewVersions: ["v1", "v1beta1"]
    sideEffects: None
  # Validates v1alpha1 AlertmanagerConfig objects.
  - name: alertmanagerconfigsvalidate.monitoring.coreos.com
    failurePolicy: Ignore
    rules:
      - apiGroups:
          - monitoring.coreos.com
        apiVersions:
          - v1alpha1
        resources:
          - alertmanagerconfigs
        operations:
          - CREATE
          - UPDATE
    clientConfig:
      service:
        namespace: kube-prometheus-stack
        name: kube-prometheus-stack-operator
        path: /admission-alertmanagerconfigs/validate
    timeoutSeconds: 10
    admissionReviewVersions: ["v1", "v1beta1"]
    sideEffects: None
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/serviceaccount.yaml
# ServiceAccount created as a Helm hook (pre/post install and upgrade). Per
# its delete policy it is removed before the hook is re-created and again
# once the hook succeeds.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-prometheus-stack-admission
  namespace: kube-prometheus-stack
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    app: kube-prometheus-stack-admission
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator-webhook
automountServiceAccountToken: true
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/clusterrole.yaml
# ClusterRole created as a Helm hook: grants get and update on validating
# and mutating webhook configurations, and nothing else.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-prometheus-stack-admission
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    app: kube-prometheus-stack-admission
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/version: "79.7.1"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-79.7.1
    release: "kube-prometheus-stack"
    heritage: "Helm"
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/component: prometheus-operator-webhook
rules:
  - apiGroups:
      - admissionregistration.k8s.io
    resources:
      - validatingwebhookconfigurations
      - mutatingwebhookconfigurations
    verbs:
      - get
      - update
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/clusterrolebinding.yaml
# Binds the kube-prometheus-stack-admission ClusterRole to the
# kube-prometheus-stack-admission ServiceAccount in the
# kube-prometheus-stack namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-prometheus-stack-admission
  annotations:
    # Helm hook resource for every install/upgrade phase; removed before a
    # new hook run creates it again and once the hook has succeeded.
    helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
  labels:
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/component: prometheus-operator-webhook
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: '79.7.1'
    app.kubernetes.io/managed-by: Helm
    app: kube-prometheus-stack-admission
    chart: kube-prometheus-stack-79.7.1
    release: kube-prometheus-stack
    heritage: Helm
subjects:
  - kind: ServiceAccount
    name: kube-prometheus-stack-admission
    namespace: kube-prometheus-stack
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-prometheus-stack-admission
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/role.yaml
# Namespaced permission to get and create Secrets in the
# kube-prometheus-stack namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kube-prometheus-stack-admission
  namespace: kube-prometheus-stack
  annotations:
    # Helm hook resource for every install/upgrade phase; removed before a
    # new hook run creates it again and once the hook has succeeded.
    helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
  labels:
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/component: prometheus-operator-webhook
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: '79.7.1'
    app.kubernetes.io/managed-by: Helm
    app: kube-prometheus-stack-admission
    chart: kube-prometheus-stack-79.7.1
    release: kube-prometheus-stack
    heritage: Helm
rules:
  # The empty string selects the core API group.
  - apiGroups: [""]
    resources: [secrets]
    verbs: [get, create]
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/rolebinding.yaml
# Binds the namespaced kube-prometheus-stack-admission Role to the
# kube-prometheus-stack-admission ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kube-prometheus-stack-admission
  namespace: kube-prometheus-stack
  annotations:
    # Helm hook resource for every install/upgrade phase; removed before a
    # new hook run creates it again and once the hook has succeeded.
    helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
  labels:
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/component: prometheus-operator-webhook
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: '79.7.1'
    app.kubernetes.io/managed-by: Helm
    app: kube-prometheus-stack-admission
    chart: kube-prometheus-stack-79.7.1
    release: kube-prometheus-stack
    heritage: Helm
subjects:
  - kind: ServiceAccount
    name: kube-prometheus-stack-admission
    namespace: kube-prometheus-stack
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: kube-prometheus-stack-admission
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml
# Hook Job that runs kube-webhook-certgen with the `create` subcommand,
# targeting the secret kube-prometheus-stack-admission in the
# kube-prometheus-stack namespace.
apiVersion: batch/v1
kind: Job
metadata:
  name: kube-prometheus-stack-admission-create
  namespace: kube-prometheus-stack
  annotations:
    # Helm: run before install/upgrade; delete before re-creation and on
    # success.
    helm.sh/hook: pre-install,pre-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
    # Argo CD: run in the PreSync phase; delete once the hook succeeded.
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
  labels:
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/component: prometheus-operator-webhook
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: '79.7.1'
    app.kubernetes.io/managed-by: Helm
    app: kube-prometheus-stack-admission-create
    chart: kube-prometheus-stack-79.7.1
    release: kube-prometheus-stack
    heritage: Helm
spec:
  # Finished Job becomes eligible for automatic cleanup after 60 seconds.
  ttlSecondsAfterFinished: 60
  template:
    metadata:
      name: kube-prometheus-stack-admission-create
      labels:
        app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/component: prometheus-operator-webhook
        app.kubernetes.io/part-of: kube-prometheus-stack
        app.kubernetes.io/version: '79.7.1'
        app.kubernetes.io/managed-by: Helm
        app: kube-prometheus-stack-admission-create
        chart: kube-prometheus-stack-79.7.1
        release: kube-prometheus-stack
        heritage: Helm
    spec:
      serviceAccountName: kube-prometheus-stack-admission
      restartPolicy: OnFailure
      containers:
        - name: create
          image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.6.4
          imagePullPolicy: IfNotPresent
          args:
            - create
            # Short and namespace-qualified names of the operator service.
            - "--host=kube-prometheus-stack-operator,kube-prometheus-stack-operator.kube-prometheus-stack.svc"
            - "--namespace=kube-prometheus-stack"
            - "--secret-name=kube-prometheus-stack-admission"
          # No resource requests or limits are set.
          resources: {}
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: [ALL]
      # Pod-level security settings: fixed non-root UID/GID 2000 with the
      # runtime's default seccomp profile.
      securityContext:
        runAsNonRoot: true
        runAsUser: 2000
        runAsGroup: 2000
        seccompProfile:
          type: RuntimeDefault
|
|
---
|
|
# Source: kube-prometheus-stack/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml
# Hook Job that runs kube-webhook-certgen with the `patch` subcommand
# against the webhook named kube-prometheus-stack-admission, using the
# secret kube-prometheus-stack-admission.
apiVersion: batch/v1
kind: Job
metadata:
  name: kube-prometheus-stack-admission-patch
  namespace: kube-prometheus-stack
  annotations:
    # Helm: run after install/upgrade; delete before re-creation and on
    # success.
    helm.sh/hook: post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
    # NOTE(review): the Helm hook above is post-install/post-upgrade while
    # the Argo CD hook below is PreSync -- confirm this phase is intended.
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
  labels:
    app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/component: prometheus-operator-webhook
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: '79.7.1'
    app.kubernetes.io/managed-by: Helm
    app: kube-prometheus-stack-admission-patch
    chart: kube-prometheus-stack-79.7.1
    release: kube-prometheus-stack
    heritage: Helm
spec:
  # Finished Job becomes eligible for automatic cleanup after 60 seconds.
  ttlSecondsAfterFinished: 60
  template:
    metadata:
      name: kube-prometheus-stack-admission-patch
      labels:
        app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator
        app.kubernetes.io/instance: kube-prometheus-stack
        app.kubernetes.io/component: prometheus-operator-webhook
        app.kubernetes.io/part-of: kube-prometheus-stack
        app.kubernetes.io/version: '79.7.1'
        app.kubernetes.io/managed-by: Helm
        app: kube-prometheus-stack-admission-patch
        chart: kube-prometheus-stack-79.7.1
        release: kube-prometheus-stack
        heritage: Helm
    spec:
      serviceAccountName: kube-prometheus-stack-admission
      restartPolicy: OnFailure
      containers:
        - name: patch
          image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.6.4
          imagePullPolicy: IfNotPresent
          args:
            - patch
            - "--webhook-name=kube-prometheus-stack-admission"
            - "--namespace=kube-prometheus-stack"
            - "--secret-name=kube-prometheus-stack-admission"
            # NOTE(review): rendered with an empty value; presumably this
            # leaves the webhooks' failurePolicy untouched -- confirm
            # against the kube-webhook-certgen flag documentation.
            - "--patch-failure-policy="
          # No resource requests or limits are set.
          resources: {}
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: [ALL]
      # Pod-level security settings: fixed non-root UID/GID 2000 with the
      # runtime's default seccomp profile.
      securityContext:
        runAsNonRoot: true
        runAsUser: 2000
        runAsGroup: 2000
        seccompProfile:
          type: RuntimeDefault
|