diff --git a/clusters/cl01tl/helm/argocd/templates/prometheus-rule.yaml b/clusters/cl01tl/helm/argocd/templates/prometheus-rule.yaml index 4a58bf1a2..93aee2099 100644 --- a/clusters/cl01tl/helm/argocd/templates/prometheus-rule.yaml +++ b/clusters/cl01tl/helm/argocd/templates/prometheus-rule.yaml @@ -43,82 +43,82 @@ spec: summary: HAProxy high HTTP 5xx error rate server (instance {{ `{{ $labels.instance }}` }}) description: "Too many HTTP requests with status 5xx (> 5%) on server {{ `{{ $labels.server }}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - alert: HAProxyServerResponseErrors - expr: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy server response errors (instance {{ `{{ $labels.instance }}` }}) - description: "Too many response errors to {{ `{{ $labels.server }}` }} server (> 5%).\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyBackendConnectionErrors - expr: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy backend connection errors (instance {{ `{{ $labels.instance }}` }}) - description: "Too many connection errors to {{ `{{ $labels.proxy }}` }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyServerConnectionErrors - expr: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 - for: 0m - labels: - severity: critical - annotations: - summary: HAProxy server connection errors (instance {{ `{{ $labels.instance }}` }}) - description: "Too many connection errors to {{ `{{ $labels.proxy }}` }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyBackendMaxActiveSession>80% - expr: (haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0 - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy backend max active session > 80% (instance {{ `{{ $labels.instance }}` }}) - description: "Session limit from backend {{ `{{ $labels.proxy }}` }} reached 80% of limit - {{ `{{ $value | printf \"%.2f\"}}` }}%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyPendingRequests - expr: sum by (proxy) (haproxy_backend_current_queue) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy pending requests (instance {{ `{{ $labels.instance }}` }}) - description: "Some HAProxy requests are pending on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyHTTPSlowingDown - expr: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: HAProxy HTTP slowing down (instance {{ `{{ $labels.instance }}` }}) - description: "HAProxy backend max total time is above 1s on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}s\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyRetryHigh - expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy retry high (instance {{ `{{ $labels.instance }}` }}) - description: "High rate of retry on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAproxyHasNoAliveBackends - expr: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0 - for: 0m - labels: - severity: critical - annotations: - summary: HAproxy has no alive backends (instance {{ `{{ $labels.instance }}` }}) - description: "HAProxy has no alive active or backup backends for {{ `{{ $labels.proxy }}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyFrontendSecurityBlockedRequests - expr: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10 - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy frontend security blocked requests (instance {{ `{{ $labels.instance }}` }}) - description: "HAProxy is blocking requests for security reason\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" - - alert: HAProxyServerHealthcheckFailure - expr: increase(haproxy_server_check_failures_total[1m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: HAProxy server healthcheck failure (instance {{ `{{ $labels.instance }}` }}) - description: "Some server healthcheck are failing on {{ `{{ $labels.server }}` }} ({{ `{{ $value }}` }} in the last 1m)\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + expr: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy server response errors (instance {{ `{{ $labels.instance }}` }}) + description: "Too many response errors to {{ `{{ $labels.server }}` }} server (> 5%).\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyBackendConnectionErrors + expr: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy backend connection errors (instance {{ `{{ $labels.instance }}` }}) + description: "Too many connection errors to {{ `{{ $labels.proxy }}` }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyServerConnectionErrors + expr: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 + for: 0m + labels: + severity: critical + annotations: + summary: HAProxy server connection errors (instance {{ `{{ $labels.instance }}` }}) + description: "Too many connection errors to {{ `{{ $labels.proxy }}` }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyBackendMaxActiveSession>80% + expr: (haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0 + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy backend max active session > 80% (instance {{ `{{ $labels.instance }}` }}) + description: "Session limit from backend {{ `{{ $labels.proxy }}` }} reached 80% of limit - {{ `{{ $value | printf \"%.2f\"}}` }}%\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyPendingRequests + expr: sum by (proxy) (haproxy_backend_current_queue) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy pending requests (instance {{ `{{ $labels.instance }}` }}) + description: "Some HAProxy requests are pending on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyHTTPSlowingDown + expr: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: HAProxy HTTP slowing down (instance {{ `{{ $labels.instance }}` }}) + description: "HAProxy backend max total time is above 1s on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}s\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyRetryHigh + expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy retry high (instance {{ `{{ $labels.instance }}` }}) + description: "High rate of retry on {{ `{{ $labels.proxy }}` }} - {{ `{{ $value | printf \"%.2f\"}}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAproxyHasNoAliveBackends + expr: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0 + for: 0m + labels: + severity: critical + annotations: + summary: HAproxy has no alive backends (instance {{ `{{ $labels.instance }}` }}) + description: "HAProxy has no alive active or backup backends for {{ `{{ $labels.proxy }}` }}\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyFrontendSecurityBlockedRequests + expr: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy frontend security blocked requests (instance {{ `{{ $labels.instance }}` }}) + description: "HAProxy is blocking requests for security reason\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}" + - alert: HAProxyServerHealthcheckFailure + expr: increase(haproxy_server_check_failures_total[1m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: HAProxy server healthcheck failure (instance {{ `{{ $labels.instance }}` }}) + description: "Some server healthcheck are failing on {{ `{{ $labels.server }}` }} ({{ `{{ $value }}` }} in the last 1m)\n VALUE = {{ `{{ $value }}` }}\n LABELS = {{ `{{ $labels }}` }}"