Merge pull request #280 from yuxiangqian/metrics
add metric to common controller - basic metrics utilities
This commit is contained in:
4
go.mod
4
go.mod
@@ -10,11 +10,15 @@ require (
|
||||
github.com/imdario/mergo v0.3.9 // indirect
|
||||
github.com/kubernetes-csi/csi-lib-utils v0.7.0
|
||||
github.com/kubernetes-csi/csi-test v2.0.0+incompatible
|
||||
github.com/prometheus/client_golang v1.0.0
|
||||
github.com/prometheus/client_model v0.2.0
|
||||
github.com/prometheus/common v0.4.1
|
||||
google.golang.org/grpc v1.28.0
|
||||
k8s.io/api v0.18.0
|
||||
k8s.io/apimachinery v0.18.0
|
||||
k8s.io/client-go v0.18.0
|
||||
k8s.io/code-generator v0.18.0
|
||||
k8s.io/component-base v0.18.0
|
||||
k8s.io/klog v1.0.0
|
||||
k8s.io/kubernetes v1.18.0
|
||||
)
|
||||
|
177
pkg/metrics/metrics.go
Normal file
177
pkg/metrics/metrics.go
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
k8smetrics "k8s.io/component-base/metrics"
|
||||
"k8s.io/klog"
|
||||
)
|
||||
|
||||
const (
|
||||
opStatusUnknown = "Unknown"
|
||||
labelDriverName = "driver_name"
|
||||
labelOperationName = "operation_name"
|
||||
labelOperationStatus = "operation_status"
|
||||
subSystem = "snapshot_controller"
|
||||
metricName = "operation_total_seconds"
|
||||
metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end"
|
||||
)
|
||||
|
||||
// OperationStatus is the interface type for representing an operation's execution
|
||||
// status, with the nil value representing an "Unknown" status of the operation.
|
||||
type OperationStatus interface {
|
||||
String() string
|
||||
}
|
||||
|
||||
var metricBuckets = []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 300, 600}
|
||||
|
||||
type MetricsManager interface {
|
||||
// StartMetricsEndpoint starts the metrics endpoint at the specified addr/pattern for
|
||||
// metrics managed by this MetricsManager. It spawns a goroutine to listen to
|
||||
// and serve HTTP requests received on addr/pattern.
|
||||
// If the "pattern" is empty (i.e., ""), no endpoint will be started.
|
||||
// An error will be returned if there is any.
|
||||
StartMetricsEndpoint(pattern, addr string, logger promhttp.Logger, wg *sync.WaitGroup) (*http.Server, error)
|
||||
|
||||
// OperationStart takes in an operation and caches its start time.
|
||||
// if the operation already exists, it's an no-op.
|
||||
OperationStart(op Operation)
|
||||
|
||||
// DropOperation removes an operation from cache.
|
||||
// if the operation does not exist, it's an no-op.
|
||||
DropOperation(op Operation)
|
||||
|
||||
// RecordMetrics records a metric point. Note that it will be an no-op if an
|
||||
// operation has NOT been marked "Started" previously via invoking "OperationStart".
|
||||
// Invoking of RecordMetrics effectively removes the cached entry.
|
||||
// op - the operation which the metric is associated with.
|
||||
// status - the operation status, if not specified, i.e., status == nil, an
|
||||
// "Unknown" status of the passed-in operation is assumed.
|
||||
RecordMetrics(op Operation, status OperationStatus)
|
||||
}
|
||||
|
||||
// Operation is a structure which holds information to identify a snapshot
|
||||
// related operation
|
||||
type Operation struct {
|
||||
// the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
|
||||
Name string
|
||||
// the name of the driver which executes the operation
|
||||
Driver string
|
||||
// the resource UID to which the operation has been executed against
|
||||
ResourceID types.UID
|
||||
}
|
||||
|
||||
type operationMetricsManager struct {
|
||||
// cache is a concurrent-safe map which stores start timestamps for all
|
||||
// ongoing operations.
|
||||
// key is an Operation
|
||||
// value is the timestamp of the start time of the operation
|
||||
cache sync.Map
|
||||
|
||||
// registry is a wrapper around Prometheus Registry
|
||||
registry k8smetrics.KubeRegistry
|
||||
|
||||
// opLatencyMetrics is a Histogram metrics
|
||||
opLatencyMetrics *k8smetrics.HistogramVec
|
||||
}
|
||||
|
||||
func NewMetricsManager() MetricsManager {
|
||||
mgr := &operationMetricsManager{
|
||||
cache: sync.Map{},
|
||||
}
|
||||
mgr.init()
|
||||
return mgr
|
||||
}
|
||||
|
||||
func (opMgr *operationMetricsManager) OperationStart(op Operation) {
|
||||
opMgr.cache.LoadOrStore(op, time.Now())
|
||||
}
|
||||
|
||||
func (opMgr *operationMetricsManager) DropOperation(op Operation) {
|
||||
opMgr.cache.Delete(op)
|
||||
}
|
||||
|
||||
func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) {
|
||||
obj, exists := opMgr.cache.Load(op)
|
||||
if !exists {
|
||||
// the operation has not been cached, return directly
|
||||
return
|
||||
}
|
||||
ts, ok := obj.(time.Time)
|
||||
if !ok {
|
||||
// the cached item is not a time.Time, should NEVER happen, clean and return
|
||||
klog.Errorf("Invalid cache entry for key %v", op)
|
||||
opMgr.cache.Delete(op)
|
||||
return
|
||||
}
|
||||
strStatus := opStatusUnknown
|
||||
if status != nil {
|
||||
strStatus = status.String()
|
||||
}
|
||||
duration := time.Since(ts).Seconds()
|
||||
opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration)
|
||||
opMgr.cache.Delete(op)
|
||||
}
|
||||
|
||||
func (opMgr *operationMetricsManager) init() {
|
||||
opMgr.registry = k8smetrics.NewKubeRegistry()
|
||||
opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec(
|
||||
&k8smetrics.HistogramOpts{
|
||||
Subsystem: subSystem,
|
||||
Name: metricName,
|
||||
Help: metricHelpMsg,
|
||||
Buckets: metricBuckets,
|
||||
},
|
||||
[]string{labelDriverName, labelOperationName, labelOperationStatus},
|
||||
)
|
||||
opMgr.registry.MustRegister(opMgr.opLatencyMetrics)
|
||||
}
|
||||
|
||||
func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string, logger promhttp.Logger, wg *sync.WaitGroup) (*http.Server, error) {
|
||||
if addr == "" {
|
||||
return nil, fmt.Errorf("metrics endpoint will not be started as endpoint address is not specified")
|
||||
}
|
||||
// start listening
|
||||
l, err := net.Listen("tcp", addr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to listen on address[%s], error[%v]", addr, err)
|
||||
}
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle(pattern, k8smetrics.HandlerFor(
|
||||
opMgr.registry,
|
||||
k8smetrics.HandlerOpts{
|
||||
ErrorLog: logger,
|
||||
ErrorHandling: k8smetrics.ContinueOnError,
|
||||
}))
|
||||
srv := &http.Server{Addr: l.Addr().String(), Handler: mux}
|
||||
// start serving the endpoint
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := srv.Serve(l); err != http.ErrServerClosed {
|
||||
klog.Fatalf("failed to start endpoint at:%s/%s, error: %v", addr, pattern, err)
|
||||
}
|
||||
}()
|
||||
return srv, nil
|
||||
}
|
556
pkg/metrics/metrics_test.go
Normal file
556
pkg/metrics/metrics_test.go
Normal file
@@ -0,0 +1,556 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"reflect"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cmg "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
)
|
||||
|
||||
var (
|
||||
statusMap map[int]string = map[int]string{
|
||||
0: "Success",
|
||||
1: "Failure",
|
||||
2: "Unknown",
|
||||
}
|
||||
)
|
||||
|
||||
const (
|
||||
httpPattern = "/metrics"
|
||||
addr = "localhost:0"
|
||||
)
|
||||
|
||||
type fakeOpStatus struct {
|
||||
statusCode int
|
||||
}
|
||||
|
||||
func (s *fakeOpStatus) String() string {
|
||||
if str, ok := statusMap[s.statusCode]; ok {
|
||||
return str
|
||||
}
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func initMgr() (MetricsManager, *sync.WaitGroup, *http.Server) {
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
mgr := NewMetricsManager()
|
||||
srv, err := mgr.StartMetricsEndpoint(httpPattern, addr, nil, wg)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to start serving [%v]", err)
|
||||
}
|
||||
return mgr, wg, srv
|
||||
}
|
||||
|
||||
func shutdown(srv *http.Server, wg *sync.WaitGroup) {
|
||||
if err := srv.Shutdown(context.Background()); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
defer shutdown(srv, wg)
|
||||
if mgr == nil {
|
||||
t.Errorf("failed testing new")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDropNonExistingOperation(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
defer shutdown(srv, wg)
|
||||
op := Operation{
|
||||
Name: "drop-non-existing-operation-should-be-noop",
|
||||
Driver: "driver",
|
||||
ResourceID: types.UID("uid"),
|
||||
}
|
||||
mgr.DropOperation(op)
|
||||
}
|
||||
|
||||
func TestRecordMetricsForNonExistingOperation(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
srvAddr := "http://" + srv.Addr + httpPattern
|
||||
defer shutdown(srv, wg)
|
||||
op := Operation{
|
||||
Name: "no-metrics-should-be-recorded-as-operation-did-not-start",
|
||||
Driver: "driver",
|
||||
ResourceID: types.UID("uid"),
|
||||
}
|
||||
mgr.RecordMetrics(op, nil)
|
||||
rsp, err := http.Get(srvAddr)
|
||||
if err != nil || rsp.StatusCode != http.StatusOK {
|
||||
t.Errorf("failed to get response from server %v, %v", err, rsp)
|
||||
}
|
||||
r, err := ioutil.ReadAll(rsp.Body)
|
||||
if err != nil {
|
||||
t.Errorf("failed to read response body %v", err)
|
||||
}
|
||||
if strings.Contains(string(r), op.Name) {
|
||||
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDropOperation(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
srvAddr := "http://" + srv.Addr + httpPattern
|
||||
defer shutdown(srv, wg)
|
||||
op := Operation{
|
||||
Name: "should-have-been-dropped",
|
||||
Driver: "driver",
|
||||
ResourceID: types.UID("uid"),
|
||||
}
|
||||
mgr.OperationStart(op)
|
||||
mgr.DropOperation(op)
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
rsp, err := http.Get(srvAddr)
|
||||
if err != nil || rsp.StatusCode != http.StatusOK {
|
||||
t.Errorf("failed to get response from server %v, %v", err, rsp)
|
||||
}
|
||||
r, err := ioutil.ReadAll(rsp.Body)
|
||||
if err != nil {
|
||||
t.Errorf("failed to read response body %v", err)
|
||||
}
|
||||
if strings.Contains(string(r), op.Name) {
|
||||
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
|
||||
}
|
||||
// re-add with a different name
|
||||
op.Name = "should-have-been-added"
|
||||
mgr.OperationStart(op)
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
opStatus := &fakeOpStatus{
|
||||
statusCode: 0,
|
||||
}
|
||||
mgr.RecordMetrics(op, opStatus)
|
||||
expected :=
|
||||
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
||||
# TYPE snapshot_controller_operation_total_seconds histogram
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.25"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 0.3
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 1
|
||||
`
|
||||
|
||||
if err := verifyMetric(expected, srvAddr); err != nil {
|
||||
t.Errorf("failed testing[%v]", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnknownStatus(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
srvAddr := "http://" + srv.Addr + httpPattern
|
||||
defer shutdown(srv, wg)
|
||||
op := Operation{
|
||||
Name: "unknown-status-operation",
|
||||
Driver: "driver",
|
||||
ResourceID: types.UID("uid"),
|
||||
}
|
||||
mgr.OperationStart(op)
|
||||
// should create a Unknown data point with latency ~300ms
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
mgr.RecordMetrics(op, nil)
|
||||
expected :=
|
||||
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
||||
# TYPE snapshot_controller_operation_total_seconds histogram
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.25"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 0.3
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 1
|
||||
`
|
||||
|
||||
if err := verifyMetric(expected, srvAddr); err != nil {
|
||||
t.Errorf("failed testing[%v]", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecordMetrics(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
srvAddr := "http://" + srv.Addr + httpPattern
|
||||
defer shutdown(srv, wg)
|
||||
// add an operation
|
||||
op := Operation{
|
||||
Name: "op1",
|
||||
Driver: "driver1",
|
||||
ResourceID: types.UID("uid1"),
|
||||
}
|
||||
mgr.OperationStart(op)
|
||||
// should create a Success data point with latency ~ 1100ms
|
||||
time.Sleep(1100 * time.Millisecond)
|
||||
success := &fakeOpStatus{
|
||||
statusCode: 0,
|
||||
}
|
||||
mgr.RecordMetrics(op, success)
|
||||
|
||||
// add another operation metric
|
||||
op.Name = "op2"
|
||||
op.Driver = "driver2"
|
||||
op.ResourceID = types.UID("uid2")
|
||||
mgr.OperationStart(op)
|
||||
// should create a Failure data point with latency ~ 100ms
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
failure := &fakeOpStatus{
|
||||
statusCode: 1,
|
||||
}
|
||||
mgr.RecordMetrics(op, failure)
|
||||
|
||||
expected :=
|
||||
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
||||
# TYPE snapshot_controller_operation_total_seconds histogram
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.25"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.5"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success"} 1.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure"} 1
|
||||
`
|
||||
if err := verifyMetric(expected, srvAddr); err != nil {
|
||||
t.Errorf("failed testing [%v]", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConcurrency(t *testing.T) {
|
||||
mgr, wg, srv := initMgr()
|
||||
srvAddr := "http://" + srv.Addr + httpPattern
|
||||
defer shutdown(srv, wg)
|
||||
success := &fakeOpStatus{
|
||||
statusCode: 0,
|
||||
}
|
||||
failure := &fakeOpStatus{
|
||||
statusCode: 1,
|
||||
}
|
||||
ops := []struct {
|
||||
op Operation
|
||||
desiredLatencyMs time.Duration
|
||||
status OperationStatus
|
||||
drop bool
|
||||
}{
|
||||
{
|
||||
Operation{
|
||||
Name: "success1",
|
||||
Driver: "driver1",
|
||||
ResourceID: types.UID("uid1"),
|
||||
},
|
||||
100,
|
||||
success,
|
||||
false,
|
||||
},
|
||||
{
|
||||
Operation{
|
||||
Name: "success2",
|
||||
Driver: "driver2",
|
||||
ResourceID: types.UID("uid2"),
|
||||
},
|
||||
100,
|
||||
success,
|
||||
false,
|
||||
},
|
||||
{
|
||||
Operation{
|
||||
Name: "failure1",
|
||||
Driver: "driver3",
|
||||
ResourceID: types.UID("uid3"),
|
||||
},
|
||||
100,
|
||||
failure,
|
||||
false,
|
||||
},
|
||||
{
|
||||
Operation{
|
||||
Name: "failure2",
|
||||
Driver: "driver4",
|
||||
ResourceID: types.UID("uid4"),
|
||||
},
|
||||
100,
|
||||
failure,
|
||||
false,
|
||||
},
|
||||
{
|
||||
Operation{
|
||||
Name: "unknown",
|
||||
Driver: "driver5",
|
||||
ResourceID: types.UID("uid5"),
|
||||
},
|
||||
100,
|
||||
nil,
|
||||
false,
|
||||
},
|
||||
{
|
||||
Operation{
|
||||
Name: "drop-from-cache",
|
||||
Driver: "driver6",
|
||||
ResourceID: types.UID("uid6"),
|
||||
},
|
||||
100,
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
}
|
||||
|
||||
for i := range ops {
|
||||
mgr.OperationStart(ops[i].op)
|
||||
}
|
||||
// add an extra operation which should remain in the cache
|
||||
remaining := Operation{
|
||||
Name: "remaining-in-cache",
|
||||
Driver: "driver7",
|
||||
ResourceID: types.UID("uid7"),
|
||||
}
|
||||
mgr.OperationStart(remaining)
|
||||
|
||||
var wgMetrics sync.WaitGroup
|
||||
|
||||
for i := range ops {
|
||||
wgMetrics.Add(1)
|
||||
go func(i int) {
|
||||
defer wgMetrics.Done()
|
||||
if ops[i].desiredLatencyMs > 0 {
|
||||
time.Sleep(ops[i].desiredLatencyMs * time.Millisecond)
|
||||
}
|
||||
if ops[i].drop {
|
||||
mgr.DropOperation(ops[i].op)
|
||||
} else {
|
||||
mgr.RecordMetrics(ops[i].op, ops[i].status)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
wgMetrics.Wait()
|
||||
|
||||
// validate
|
||||
expected :=
|
||||
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
||||
# TYPE snapshot_controller_operation_total_seconds histogram
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.1"} 0
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.25"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="1"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="2.5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="5"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="10"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="15"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="30"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="60"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="120"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="300"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="600"} 1
|
||||
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="+Inf"} 1
|
||||
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 0.1
|
||||
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 1
|
||||
`
|
||||
if err := verifyMetric(expected, srvAddr); err != nil {
|
||||
t.Errorf("failed testing [%v]", err)
|
||||
}
|
||||
}
|
||||
|
||||
func verifyMetric(expected, srvAddr string) error {
|
||||
rsp, err := http.Get(srvAddr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if rsp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
|
||||
}
|
||||
r, err := ioutil.ReadAll(rsp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
format := expfmt.ResponseFormat(rsp.Header)
|
||||
reader := strings.NewReader(string(r))
|
||||
decoder := expfmt.NewDecoder(reader, format)
|
||||
mf := &cmg.MetricFamily{}
|
||||
if err := decoder.Decode(mf); err != nil {
|
||||
return err
|
||||
}
|
||||
reader = strings.NewReader(expected)
|
||||
decoder = expfmt.NewDecoder(reader, format)
|
||||
expectedMf := &cmg.MetricFamily{}
|
||||
if err := decoder.Decode(expectedMf); err != nil {
|
||||
return err
|
||||
}
|
||||
if !containsMetrics(expectedMf, mf) {
|
||||
return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func containsMetrics(expected, got *cmg.MetricFamily) bool {
|
||||
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
|
||||
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
|
||||
(got.Help == nil || *(got.Help) != *(expected.Help)) {
|
||||
return false
|
||||
}
|
||||
|
||||
numRecords := len(expected.Metric)
|
||||
if len(got.Metric) < numRecords {
|
||||
return false
|
||||
}
|
||||
matchCount := 0
|
||||
for i := 0; i < len(got.Metric); i++ {
|
||||
for j := 0; j < numRecords; j++ {
|
||||
// labels should be the same
|
||||
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
|
||||
continue
|
||||
}
|
||||
gh := got.Metric[i].Histogram
|
||||
eh := expected.Metric[j].Histogram
|
||||
if gh == nil {
|
||||
continue
|
||||
}
|
||||
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
|
||||
continue
|
||||
}
|
||||
// this is a sum record, expecting a latency which is more than the
|
||||
// expected one. If the sum is smaller than expected, it will be considered
|
||||
// as NOT a match
|
||||
if gh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
|
||||
continue
|
||||
}
|
||||
if gh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
|
||||
continue
|
||||
}
|
||||
// this is a match
|
||||
matchCount = matchCount + 1
|
||||
break
|
||||
}
|
||||
}
|
||||
return matchCount == numRecords
|
||||
}
|
1
v2/pkg/metrics/metrics.go
Symbolic link
1
v2/pkg/metrics/metrics.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../pkg/metrics/metrics.go
|
1
v2/pkg/metrics/metrics_test.go
Symbolic link
1
v2/pkg/metrics/metrics_test.go
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../pkg/metrics/metrics_test.go
|
Reference in New Issue
Block a user