Merge pull request #280 from yuxiangqian/metrics

add metric to common controller - basic metrics utilities
This commit is contained in:
Kubernetes Prow Robot
2020-05-26 14:08:41 -07:00
committed by GitHub
5 changed files with 739 additions and 0 deletions

4
go.mod
View File

@@ -10,11 +10,15 @@ require (
github.com/imdario/mergo v0.3.9 // indirect
github.com/kubernetes-csi/csi-lib-utils v0.7.0
github.com/kubernetes-csi/csi-test v2.0.0+incompatible
github.com/prometheus/client_golang v1.0.0
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.4.1
google.golang.org/grpc v1.28.0
k8s.io/api v0.18.0
k8s.io/apimachinery v0.18.0
k8s.io/client-go v0.18.0
k8s.io/code-generator v0.18.0
k8s.io/component-base v0.18.0
k8s.io/klog v1.0.0
k8s.io/kubernetes v1.18.0
)

177
pkg/metrics/metrics.go Normal file
View File

@@ -0,0 +1,177 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"fmt"
"net"
"net/http"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/apimachinery/pkg/types"
k8smetrics "k8s.io/component-base/metrics"
"k8s.io/klog"
)
const (
opStatusUnknown = "Unknown"
labelDriverName = "driver_name"
labelOperationName = "operation_name"
labelOperationStatus = "operation_status"
subSystem = "snapshot_controller"
metricName = "operation_total_seconds"
metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end"
)
// OperationStatus is the interface type for representing an operation's execution
// status, with the nil value representing an "Unknown" status of the operation.
type OperationStatus interface {
String() string
}
var metricBuckets = []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 300, 600}
type MetricsManager interface {
// StartMetricsEndpoint starts the metrics endpoint at the specified addr/pattern for
// metrics managed by this MetricsManager. It spawns a goroutine to listen to
// and serve HTTP requests received on addr/pattern.
// If the "pattern" is empty (i.e., ""), no endpoint will be started.
// An error will be returned if there is any.
StartMetricsEndpoint(pattern, addr string, logger promhttp.Logger, wg *sync.WaitGroup) (*http.Server, error)
// OperationStart takes in an operation and caches its start time.
// if the operation already exists, it's an no-op.
OperationStart(op Operation)
// DropOperation removes an operation from cache.
// if the operation does not exist, it's an no-op.
DropOperation(op Operation)
// RecordMetrics records a metric point. Note that it will be an no-op if an
// operation has NOT been marked "Started" previously via invoking "OperationStart".
// Invoking of RecordMetrics effectively removes the cached entry.
// op - the operation which the metric is associated with.
// status - the operation status, if not specified, i.e., status == nil, an
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op Operation, status OperationStatus)
}
// Operation is a structure which holds information to identify a snapshot
// related operation
type Operation struct {
// the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
Name string
// the name of the driver which executes the operation
Driver string
// the resource UID to which the operation has been executed against
ResourceID types.UID
}
type operationMetricsManager struct {
// cache is a concurrent-safe map which stores start timestamps for all
// ongoing operations.
// key is an Operation
// value is the timestamp of the start time of the operation
cache sync.Map
// registry is a wrapper around Prometheus Registry
registry k8smetrics.KubeRegistry
// opLatencyMetrics is a Histogram metrics
opLatencyMetrics *k8smetrics.HistogramVec
}
func NewMetricsManager() MetricsManager {
mgr := &operationMetricsManager{
cache: sync.Map{},
}
mgr.init()
return mgr
}
func (opMgr *operationMetricsManager) OperationStart(op Operation) {
opMgr.cache.LoadOrStore(op, time.Now())
}
func (opMgr *operationMetricsManager) DropOperation(op Operation) {
opMgr.cache.Delete(op)
}
func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) {
obj, exists := opMgr.cache.Load(op)
if !exists {
// the operation has not been cached, return directly
return
}
ts, ok := obj.(time.Time)
if !ok {
// the cached item is not a time.Time, should NEVER happen, clean and return
klog.Errorf("Invalid cache entry for key %v", op)
opMgr.cache.Delete(op)
return
}
strStatus := opStatusUnknown
if status != nil {
strStatus = status.String()
}
duration := time.Since(ts).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration)
opMgr.cache.Delete(op)
}
func (opMgr *operationMetricsManager) init() {
opMgr.registry = k8smetrics.NewKubeRegistry()
opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec(
&k8smetrics.HistogramOpts{
Subsystem: subSystem,
Name: metricName,
Help: metricHelpMsg,
Buckets: metricBuckets,
},
[]string{labelDriverName, labelOperationName, labelOperationStatus},
)
opMgr.registry.MustRegister(opMgr.opLatencyMetrics)
}
func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string, logger promhttp.Logger, wg *sync.WaitGroup) (*http.Server, error) {
if addr == "" {
return nil, fmt.Errorf("metrics endpoint will not be started as endpoint address is not specified")
}
// start listening
l, err := net.Listen("tcp", addr)
if err != nil {
return nil, fmt.Errorf("failed to listen on address[%s], error[%v]", addr, err)
}
mux := http.NewServeMux()
mux.Handle(pattern, k8smetrics.HandlerFor(
opMgr.registry,
k8smetrics.HandlerOpts{
ErrorLog: logger,
ErrorHandling: k8smetrics.ContinueOnError,
}))
srv := &http.Server{Addr: l.Addr().String(), Handler: mux}
// start serving the endpoint
go func() {
defer wg.Done()
if err := srv.Serve(l); err != http.ErrServerClosed {
klog.Fatalf("failed to start endpoint at:%s/%s, error: %v", addr, pattern, err)
}
}()
return srv, nil
}

556
pkg/metrics/metrics_test.go Normal file
View File

@@ -0,0 +1,556 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"context"
"fmt"
"io/ioutil"
"log"
"net/http"
"reflect"
"strings"
"sync"
"testing"
"time"
cmg "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"k8s.io/apimachinery/pkg/types"
)
var (
statusMap map[int]string = map[int]string{
0: "Success",
1: "Failure",
2: "Unknown",
}
)
const (
httpPattern = "/metrics"
addr = "localhost:0"
)
type fakeOpStatus struct {
statusCode int
}
func (s *fakeOpStatus) String() string {
if str, ok := statusMap[s.statusCode]; ok {
return str
}
return "Unknown"
}
func initMgr() (MetricsManager, *sync.WaitGroup, *http.Server) {
wg := &sync.WaitGroup{}
wg.Add(1)
mgr := NewMetricsManager()
srv, err := mgr.StartMetricsEndpoint(httpPattern, addr, nil, wg)
if err != nil {
log.Fatalf("failed to start serving [%v]", err)
}
return mgr, wg, srv
}
func shutdown(srv *http.Server, wg *sync.WaitGroup) {
if err := srv.Shutdown(context.Background()); err != nil {
panic(err)
}
wg.Wait()
}
func TestNew(t *testing.T) {
mgr, wg, srv := initMgr()
defer shutdown(srv, wg)
if mgr == nil {
t.Errorf("failed testing new")
}
}
func TestDropNonExistingOperation(t *testing.T) {
mgr, wg, srv := initMgr()
defer shutdown(srv, wg)
op := Operation{
Name: "drop-non-existing-operation-should-be-noop",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.DropOperation(op)
}
func TestRecordMetricsForNonExistingOperation(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
Name: "no-metrics-should-be-recorded-as-operation-did-not-start",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.RecordMetrics(op, nil)
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
t.Errorf("failed to get response from server %v, %v", err, rsp)
}
r, err := ioutil.ReadAll(rsp.Body)
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), op.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
}
}
func TestDropOperation(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
Name: "should-have-been-dropped",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.OperationStart(op)
mgr.DropOperation(op)
time.Sleep(300 * time.Millisecond)
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
t.Errorf("failed to get response from server %v, %v", err, rsp)
}
r, err := ioutil.ReadAll(rsp.Body)
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), op.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
}
// re-add with a different name
op.Name = "should-have-been-added"
mgr.OperationStart(op)
time.Sleep(300 * time.Millisecond)
opStatus := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(op, opStatus)
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing[%v]", err)
}
}
func TestUnknownStatus(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
Name: "unknown-status-operation",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.OperationStart(op)
// should create a Unknown data point with latency ~300ms
time.Sleep(300 * time.Millisecond)
mgr.RecordMetrics(op, nil)
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing[%v]", err)
}
}
func TestRecordMetrics(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
// add an operation
op := Operation{
Name: "op1",
Driver: "driver1",
ResourceID: types.UID("uid1"),
}
mgr.OperationStart(op)
// should create a Success data point with latency ~ 1100ms
time.Sleep(1100 * time.Millisecond)
success := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(op, success)
// add another operation metric
op.Name = "op2"
op.Driver = "driver2"
op.ResourceID = types.UID("uid2")
mgr.OperationStart(op)
// should create a Failure data point with latency ~ 100ms
time.Sleep(100 * time.Millisecond)
failure := &fakeOpStatus{
statusCode: 1,
}
mgr.RecordMetrics(op, failure)
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success"} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure"} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}
func TestConcurrency(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
success := &fakeOpStatus{
statusCode: 0,
}
failure := &fakeOpStatus{
statusCode: 1,
}
ops := []struct {
op Operation
desiredLatencyMs time.Duration
status OperationStatus
drop bool
}{
{
Operation{
Name: "success1",
Driver: "driver1",
ResourceID: types.UID("uid1"),
},
100,
success,
false,
},
{
Operation{
Name: "success2",
Driver: "driver2",
ResourceID: types.UID("uid2"),
},
100,
success,
false,
},
{
Operation{
Name: "failure1",
Driver: "driver3",
ResourceID: types.UID("uid3"),
},
100,
failure,
false,
},
{
Operation{
Name: "failure2",
Driver: "driver4",
ResourceID: types.UID("uid4"),
},
100,
failure,
false,
},
{
Operation{
Name: "unknown",
Driver: "driver5",
ResourceID: types.UID("uid5"),
},
100,
nil,
false,
},
{
Operation{
Name: "drop-from-cache",
Driver: "driver6",
ResourceID: types.UID("uid6"),
},
100,
nil,
true,
},
}
for i := range ops {
mgr.OperationStart(ops[i].op)
}
// add an extra operation which should remain in the cache
remaining := Operation{
Name: "remaining-in-cache",
Driver: "driver7",
ResourceID: types.UID("uid7"),
}
mgr.OperationStart(remaining)
var wgMetrics sync.WaitGroup
for i := range ops {
wgMetrics.Add(1)
go func(i int) {
defer wgMetrics.Done()
if ops[i].desiredLatencyMs > 0 {
time.Sleep(ops[i].desiredLatencyMs * time.Millisecond)
}
if ops[i].drop {
mgr.DropOperation(ops[i].op)
} else {
mgr.RecordMetrics(ops[i].op, ops[i].status)
}
}(i)
}
wgMetrics.Wait()
// validate
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}
func verifyMetric(expected, srvAddr string) error {
rsp, err := http.Get(srvAddr)
if err != nil {
return err
}
if rsp.StatusCode != http.StatusOK {
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
}
r, err := ioutil.ReadAll(rsp.Body)
if err != nil {
return err
}
format := expfmt.ResponseFormat(rsp.Header)
reader := strings.NewReader(string(r))
decoder := expfmt.NewDecoder(reader, format)
mf := &cmg.MetricFamily{}
if err := decoder.Decode(mf); err != nil {
return err
}
reader = strings.NewReader(expected)
decoder = expfmt.NewDecoder(reader, format)
expectedMf := &cmg.MetricFamily{}
if err := decoder.Decode(expectedMf); err != nil {
return err
}
if !containsMetrics(expectedMf, mf) {
return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r))
}
return nil
}
func containsMetrics(expected, got *cmg.MetricFamily) bool {
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
(got.Help == nil || *(got.Help) != *(expected.Help)) {
return false
}
numRecords := len(expected.Metric)
if len(got.Metric) < numRecords {
return false
}
matchCount := 0
for i := 0; i < len(got.Metric); i++ {
for j := 0; j < numRecords; j++ {
// labels should be the same
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
continue
}
gh := got.Metric[i].Histogram
eh := expected.Metric[j].Histogram
if gh == nil {
continue
}
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
continue
}
// this is a sum record, expecting a latency which is more than the
// expected one. If the sum is smaller than expected, it will be considered
// as NOT a match
if gh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
continue
}
if gh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
continue
}
// this is a match
matchCount = matchCount + 1
break
}
}
return matchCount == numRecords
}

1
v2/pkg/metrics/metrics.go Symbolic link
View File

@@ -0,0 +1 @@
../../../pkg/metrics/metrics.go

View File

@@ -0,0 +1 @@
../../../pkg/metrics/metrics_test.go