---
# Prometheus alerting rules.
# Each group bundles related alerts; every rule pairs a PromQL expression with
# a severity label and, where present, human-readable annotations.
groups:
  # Scrape-target availability.
  - name: service
    rules:
      # Fires once `up == 0` has held continuously for 10 minutes.
      - alert: service_down
        expr: up == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.job }} down"
          description: "{{ $labels.job }} has been down for more than 10 minutes."

  # Host-level resource pressure.
  - name: infrastructure
    rules:
      # Fires once the 1-minute load average has stayed above 8 for 10 minutes.
      - alert: high_load
        expr: node_load1 > 8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.job }} under high load"
          description: "{{ $labels.job }} is under high load."

  # Filesystem capacity.
  - name: disk_space
    rules:
      # Fits a linear trend over the last 4h of free bytes and extrapolates it
      # 4 * 3600 seconds (4 hours) ahead; fires when the predicted value stays
      # below zero for 5 minutes.
      # Only series with job="host-ps03fd" are matched by this selector.
      - alert: disk_will_fill
        expr: predict_linear(node_filesystem_free_bytes{job="host-ps03fd"}[4h], 4 * 3600) < 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is predicted to run out of disk space"
          description: "{{ $labels.instance }} is predicted to run out of free disk space within 4 hours."

      # Fires as soon as the expression matches: there is no `for:` clause.
      # NOTE(review): the recording rule behind this metric is not defined in
      # this file; its name and the `>= 90` threshold indicate that the value
      # is the percentage of space USED - confirm against its definition.
      - alert: disk_10_percent_free
        expr: node_exporter:node_filesystem_free_bytes:fs_used_percents >= 90
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is low on disk space"
          # $value is the used percentage, so it must not be reported as "free".
          description: "{{ $labels.instance }} has {{ $value }}% of its disk space used (10% or less free)."