{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "enable": true, "expr": "changes(process_start_time_seconds{ instance=~\"$instance\"}[2m]) > 0", "hide": false, "iconColor": "#bf1b00", "name": "Restarts", "showIn": 0, "step": "1m", "tagKeys": "instance", "titleFormat": "Restart" } ] }, "editable": false, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 59, "links": [], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 36, "panels": [], "title": "General info", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "fixedColor": "#f4d598", "mode": "fixed" }, "decimals": 0, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 0, "y": 1 }, "id": 4, "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(alertmanager_build_info{instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "title": "Number of instances", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Table containing list of Alertmanager instances showing it's version, up time, last 
reload time and if it was successful.", "fieldConfig": { "defaults": { "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "decimals": 2, "displayName": "", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "Time" }, "properties": [ { "id": "displayName", "value": "Time" }, { "id": "custom.hidden", "value": true }, { "id": "custom.align" } ] }, { "matcher": { "id": "byName", "options": "instance" }, "properties": [ { "id": "displayName", "value": "Instance" }, { "id": "unit", "value": "short" }, { "id": "decimals", "value": 2 }, { "id": "custom.align" } ] }, { "matcher": { "id": "byName", "options": "version" }, "properties": [ { "id": "displayName", "value": "Version" }, { "id": "unit", "value": "short" }, { "id": "decimals", "value": 2 }, { "id": "custom.align" } ] }, { "matcher": { "id": "byName", "options": "Value #A" }, "properties": [ { "id": "displayName", "value": "Up time" }, { "id": "unit", "value": "s" }, { "id": "custom.align" } ] }, { "matcher": { "id": "byName", "options": "Value #B" }, "properties": [ { "id": "displayName", "value": "Last reload" }, { "id": "unit", "value": "s" }, { "id": "custom.align" } ] }, { "matcher": { "id": "byName", "options": "Value #C" }, "properties": [ { "id": "displayName", "value": "Last reload sucessfull" }, { "id": "unit", "value": "short" }, { "id": "decimals", "value": 2 }, { "id": "custom.cellOptions", "value": { "type": "color-background" } }, { "id": "custom.align" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "rgba(245, 54, 54, 0.9)", "value": null }, { "color": "rgba(237, 129, 40, 0.89)", "value": 0 }, { "color": "rgba(50, 172, 45, 0.97)", "value": 1 } ] } } ] } ] }, "gridPos": { "h": 5, "w": 9, "x": 3, "y": 1 }, "id": 26, "options": { "cellHeight": "sm", "footer": { "countRows": false, 
"fields": "", "reducer": ["sum"], "show": false }, "showHeader": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "time() - (alertmanager_build_info{instance=~\"$instance\"} * on (instance, cluster) group_left process_start_time_seconds{instance=~\"$instance\"})", "format": "table", "instant": true, "intervalFactor": 1, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "time() - alertmanager_config_last_reload_success_timestamp_seconds{instance=~\"$instance\"}", "format": "table", "instant": true, "intervalFactor": 1, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_config_last_reload_successful{instance=~\"$instance\"}", "format": "table", "instant": true, "intervalFactor": 1, "refId": "C" } ], "title": "Instance versions and up time", "type": "table" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Number of peers in the Alertmanager cluster.", "fieldConfig": { "defaults": { "color": { "fixedColor": "#e5ac0e", "mode": "fixed" }, "decimals": 0, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 1 }, "id": 207, "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(alertmanager_cluster_members{instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 1, 
"refId": "A" } ], "title": "Cluster size", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Current number of active alerts.", "fieldConfig": { "defaults": { "color": { "fixedColor": "#bf1b00", "mode": "fixed" }, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 15, "y": 1 }, "id": 2, "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(alertmanager_alerts{state=\"active\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "title": "Number of active alerts", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Current number of suppressed alerts.", "fieldConfig": { "defaults": { "color": { "fixedColor": "#f9e2d2", "mode": "fixed" }, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 18, "y": 1 }, "id": 3, "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", 
"wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(alertmanager_alerts{state=\"suppressed\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "title": "Number of suppressed alerts", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Current number of active silences.", "fieldConfig": { "defaults": { "color": { "fixedColor": "#f9e2d2", "mode": "fixed" }, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 21, "y": 1 }, "id": 121, "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(alertmanager_silences{state=\"active\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "title": "Number of active silences", "type": "stat" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 113, "panels": [], "title": "Notifications", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Number of sent notifications to distinct integrations such as PagerDuty, Slack and so on. 
On negative axis are displayed failed notifications.", "fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/Failed.*/" }, "properties": [ { "id": "color", "value": { "fixedColor": "#99440a", "mode": "fixed" } }, { "id": "custom.transform", "value": "negative-Y" } ] }, { "matcher": { "id": "byValue", "options": { "op": "gte", "reducer": "allIsZero", "value": 0 } }, "properties": [ { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } } ] }, { "matcher": { "id": "byValue", "options": { "op": "gte", "reducer": "allIsNull", "value": 0 } }, "properties": [ { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } } ] } ] }, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 7 }, "id": 118, "options": { "legend": { "calcs": ["mean", "sum"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_notifications_total{instance=~\"$instance\"}[$__interval])) by (integration)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ integration}}", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_notifications_failed_total{instance=~\"$instance\"}[$__interval])) by (integration)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "Failed {{ integration }}", "refId": "A" } ], "title": "Notifications sent from $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Duration of notification sends in 0.99 and 0.9 quantiles per integration.", "fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/0.99.*/" }, "properties": [ { "id": "custom.lineWidth", "value": 1 
} ] }, { "matcher": { "id": "byRegexp", "options": "/0.5 .*/" }, "properties": [ { "id": "custom.lineWidth", "value": 2 } ] }, { "matcher": { "id": "byValue", "options": { "op": "gte", "reducer": "allIsZero", "value": 0 } }, "properties": [ { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } } ] }, { "matcher": { "id": "byValue", "options": { "op": "gte", "reducer": "allIsNull", "value": 0 } }, "properties": [ { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } } ] } ] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 }, "id": 115, "options": { "legend": { "calcs": ["mean"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(histogram_quantile(0.9,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", "format": "time_series", "intervalFactor": 1, "legendFormat": "0.9q {{ integration }}", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(histogram_quantile(0.99,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "0.99q {{ integration }}", "refId": "A" } ], "title": "Notification durations per integration on $instance", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, "id": 18, "panels": [], "title": "Alerts", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Number of alerts by state such as `active`, `suppressed` etc.", "fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "active" }, "properties": [ { "id": "color", "value": { 
"fixedColor": "#bf1b00", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "suppressed" }, "properties": [ { "id": "color", "value": { "fixedColor": "#2f575e", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 19 }, "id": 6, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(alertmanager_alerts{instance=~\"$instance\"}) by (state)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{state}}", "refId": "A" } ], "title": "Active alerts in $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Number of received alerts from Prometheus by status `firing` on positive axis and `resolved` on negative axis.", "fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "resolved" }, "properties": [ { "id": "color", "value": { "fixedColor": "#7eb26d", "mode": "fixed" } }, { "id": "custom.transform", "value": "negative-Y" } ] }, { "matcher": { "id": "byName", "options": "firing" }, "properties": [ { "id": "color", "value": { "fixedColor": "#99440a", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 24 }, "id": 8, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_alerts_received_total{instance=~\"$instance\"}[$__interval])) by (status)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "{{ status }}", "refId": "A" } ], "title": "Received alerts by status for $instance", "type": "timeseries" 
}, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 29 }, "id": 34, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Shows cluster score representing cluster health. From HashiCorp's official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 30 }, "id": 57, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Cluster health score", "refId": "A" } ], "title": "Cluster health score for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 122 }, "id": 38, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Number of cluster members", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_cluster_failed_peers{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Number of failed peers", "refId": "B" } ], "title": "Cluster members count on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus",
"uid": "prometheus" }, "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 126 }, "id": 75, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Cluster joined peers", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Cluster left peers", "refId": "B" } ], "title": "Cluster peers left/joined on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "On positive axis is number of attempts to reconnect the cluster. 
On negative axis if number of failed attempts.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 132 }, "id": 68, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Sucessful reconnections", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Failed reconnections", "refId": "B" } ], "title": "Cluster reconnections on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 138 }, "id": 48, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "sent {{msg_type}}", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "received {{msg_type}}", "refId": "B" } ], "title": "Cluster messages count on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "On positive axis is 
size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 144 }, "id": 53, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "sent {{msg_type}}", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "received {{msg_type}}", "refId": "B" } ], "title": "Cluster messages size on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 150 }, "id": 62, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Pruned messaged", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Queued messages", "refId": "B" } ], "title": "Cluster messages queue on $instance", "type": "timeseries" } ], "title": "Cluster members", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }, "id": 284, 
"panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 10 }, "id": 314, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{key}}", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "dropped {{key}}", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "failed {{key}}", "refId": "C" } ], "title": "Count of oversized gossip messages on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 }, "id": 307, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{key}}", "refId": "A" } ], "title": "Duration of oversized gossip messages on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 22 }, "id": 303, "options": {}, 
"repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "silences", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "nf_log", "refId": "B" } ], "title": "Number of propagated gossip messages on $instance", "type": "timeseries" } ], "title": "Gossip messages", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, "id": 84, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 11 }, "id": 94, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Nf log query count", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Nf log query errors", "refId": "B" } ], "title": "Nf log queries count for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 17 }, "id": 106, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": 
"histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Nf log query duration", "refId": "A" } ], "title": "Nf log query duration for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 23 }, "id": 97, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Nf log snapshot size", "refId": "A" } ], "title": "Nf log snapshot size for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 29 }, "id": 101, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Nf log snapshot size", "refId": "A" } ], "title": "Nf log snapshot duration for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 161 }, "id": 92, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "Cluster joined peers", "refId": "A" } ], "title": "Nf log Go GC time for $instance", "type": "timeseries" } ], "title": "Nflog", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, "id": 123, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 33 }, "id": 129, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_silences{instance=~\"$instance\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{state}}", "refId": "A" } ], "title": "Silences count by state on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 53 }, "id": 134, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Silecnces query count", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Silecnces query fails", "refId": "B" } ], "title": "Silences query count on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 59 }, "id": 138, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": 
"histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Silecnces query duration", "refId": "A" } ], "title": "Silences query duration on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 65 }, "id": 149, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Silecnces snapshot size", "refId": "A" } ], "title": "Silences snapshot size on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 71 }, "id": 143, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Silecnces snapshot duration", "refId": "A" } ], "title": "Silences snapshot duration on $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 77 }, "id": 131, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Silecnces GC 
duration", "refId": "A" } ], "title": "Silences GC duraton on $instance", "type": "timeseries" } ], "title": "Silences", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }, "id": 173, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 34 }, "id": 175, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", "format": "time_series", "groupBy": [ { "params": ["$__interval"], "type": "time" }, { "params": ["null"], "type": "fill" } ], "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", "orderByTime": "ASC", "policy": "default", "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": ["value"], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Limit {{pod}}", "refId": "B" } ], "title": "CPU usage/s for $instance", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 41 }, "id": 177, "options": {}, "repeat": "instance", "repeatDirection": "h", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{ instance }}", "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", "format": 
"time_series", "intervalFactor": 1, "legendFormat": "Limit {{ pod }}", "refId": "A" } ], "title": "Memory usage for $instance", "type": "timeseries" } ], "title": "Resources", "type": "row" } ], "preload": true, "refresh": "", "schemaVersion": 40, "tags": ["service", "monitoring"], "templating": { "list": [ { "current": { "text": "Prometheus", "value": "prometheus" }, "includeAll": false, "label": "Datasource", "name": "datasource", "options": [], "query": "prometheus", "refresh": 1, "regex": "", "type": "datasource" }, { "current": { "text": ["All"], "value": ["$__all"] }, "datasource": "Prometheus", "definition": "query_result(alertmanager_build_info)", "includeAll": true, "label": "Instance", "multi": true, "name": "instance", "options": [], "query": "query_result(alertmanager_build_info)", "refresh": 2, "regex": "/.*instance=\"([^\"]+)\".*/", "sort": 1, "type": "query" } ] }, "time": { "from": "now-24h", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "Alertmanager", "uid": "02db5f01b75bbf1b630d08226d49148f", "version": 1, "weekStart": "" }