Files
external-snapshotter/pkg/metrics/metrics_test.go
2023-06-25 11:09:17 +08:00

741 lines
37 KiB
Go

/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"context"
"fmt"
"io"
"log"
"net"
"net/http"
"reflect"
"sort"
"strings"
"sync"
"testing"
"time"
cmg "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"k8s.io/apimachinery/pkg/types"
)
var (
statusMap map[int]string = map[int]string{
0: "Success",
1: "Failure",
2: "Unknown",
}
)
const (
httpPattern = "/metrics"
addr = "localhost:0"
processStartTimeMetric = "process_start_time_seconds"
)
type fakeOpStatus struct {
statusCode int
}
func (s *fakeOpStatus) String() string {
if str, ok := statusMap[s.statusCode]; ok {
return str
}
return "Unknown"
}
func initMgr() (MetricsManager, *http.Server) {
mgr := NewMetricsManager()
mux := http.NewServeMux()
err := mgr.PrepareMetricsPath(mux, httpPattern, nil)
if err != nil {
log.Fatalf("failed to start serving [%v]", err)
}
l, err := net.Listen("tcp", addr)
if err != nil {
log.Fatalf("failed to listen on address[%s], error[%v]", addr, err)
}
srv := &http.Server{Addr: l.Addr().String(), Handler: mux}
go func() {
if err := srv.Serve(l); err != http.ErrServerClosed {
log.Fatalf("failed to start endpoint at:%s/%s, error: %v", addr, httpPattern, err)
}
}()
return mgr, srv
}
func shutdown(srv *http.Server) {
if err := srv.Shutdown(context.Background()); err != nil {
panic(err)
}
}
func TestNew(t *testing.T) {
mgr, srv := initMgr()
defer shutdown(srv)
if mgr == nil {
t.Errorf("failed testing new")
}
}
func TestDropNonExistingOperation(t *testing.T) {
mgr, srv := initMgr()
defer shutdown(srv)
op := OperationKey{
Name: "drop-non-existing-operation-should-be-noop",
ResourceID: types.UID("uid"),
}
mgr.DropOperation(op)
}
func TestRecordMetricsForNonExistingOperation(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
opKey := OperationKey{
Name: "no-metrics-should-be-recorded-as-operation-did-not-start",
ResourceID: types.UID("uid"),
}
mgr.RecordMetrics(opKey, nil, "driver")
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
t.Errorf("failed to get response from server %v, %v", err, rsp)
}
r, err := io.ReadAll(rsp.Body)
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), opKey.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
}
}
func TestDropOperation(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
opKey := OperationKey{
Name: "should-have-been-dropped",
ResourceID: types.UID("uid"),
}
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
mgr.DropOperation(opKey)
time.Sleep(300 * time.Millisecond)
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
t.Errorf("failed to get response from server %v, %v", err, rsp)
}
r, err := io.ReadAll(rsp.Body)
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), opKey.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
}
// re-add with a different name
opKey.Name = "should-have-been-added"
mgr.OperationStart(opKey, opVal)
time.Sleep(300 * time.Millisecond)
opStatus := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(opKey, opStatus, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing[%v]", err)
}
}
func TestUnknownStatus(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
opKey := OperationKey{
Name: "unknown-status-operation",
ResourceID: types.UID("uid"),
}
mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType))
// should create a Unknown data point with latency ~300ms
time.Sleep(300 * time.Millisecond)
mgr.RecordMetrics(opKey, nil, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing[%v]", err)
}
}
func TestRecordMetrics(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
// add an operation
opKey := OperationKey{
Name: "op1",
ResourceID: types.UID("uid1"),
}
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
// should create a Success data point with latency ~ 1100ms
time.Sleep(1100 * time.Millisecond)
success := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(opKey, success, "driver")
// add another operation metric
opKey.Name = "op2"
opKey.ResourceID = types.UID("uid2")
mgr.OperationStart(opKey, opVal)
// should create a Failure data point with latency ~ 100ms
time.Sleep(100 * time.Millisecond)
failure := &fakeOpStatus{
statusCode: 1,
}
mgr.RecordMetrics(opKey, failure, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}
func TestConcurrency(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
success := &fakeOpStatus{
statusCode: 0,
}
failure := &fakeOpStatus{
statusCode: 1,
}
ops := []struct {
op OperationKey
desiredLatencyMs time.Duration
status OperationStatus
drop bool
}{
{
OperationKey{
Name: "success1",
ResourceID: types.UID("uid1"),
},
100,
success,
false,
},
{
OperationKey{
Name: "success2",
ResourceID: types.UID("uid2"),
},
100,
success,
false,
},
{
OperationKey{
Name: "failure1",
ResourceID: types.UID("uid3"),
},
100,
failure,
false,
},
{
OperationKey{
Name: "failure2",
ResourceID: types.UID("uid4"),
},
100,
failure,
false,
},
{
OperationKey{
Name: "unknown",
ResourceID: types.UID("uid5"),
},
100,
nil,
false,
},
{
OperationKey{
Name: "drop-from-cache",
ResourceID: types.UID("uid6"),
},
100,
nil,
true,
},
}
for i := range ops {
mgr.OperationStart(ops[i].op, OperationValue{
Driver: fmt.Sprintf("driver%v", i),
SnapshotType: string(DynamicSnapshotType),
})
}
// add an extra operation which should remain in the cache
remaining := OperationKey{
Name: "remaining-in-cache",
ResourceID: types.UID("uid7"),
}
mgr.OperationStart(remaining, OperationValue{
Driver: "driver7",
SnapshotType: string(DynamicSnapshotType),
})
var wgMetrics sync.WaitGroup
for i := range ops {
wgMetrics.Add(1)
go func(i int) {
defer wgMetrics.Done()
if ops[i].desiredLatencyMs > 0 {
time.Sleep(ops[i].desiredLatencyMs * time.Millisecond)
}
if ops[i].drop {
mgr.DropOperation(ops[i].op)
} else {
mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i))
}
}(i)
}
wgMetrics.Wait()
// validate
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}
func TestInFlightMetric(t *testing.T) {
inFlightCheckInterval = time.Millisecond * 50
mgr, srv := initMgr()
defer shutdown(srv)
srvAddr := "http://" + srv.Addr + httpPattern
// Start first operation, should be 1
opKey := OperationKey{
Name: "leaked",
ResourceID: types.UID("uid"),
}
opVal := NewOperationValue("driver", "test")
mgr.OperationStart(opKey, opVal)
time.Sleep(500 * time.Millisecond)
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 1`, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
// Start second operation, should be 2
opKey = OperationKey{
Name: "leaked2",
ResourceID: types.UID("uid"),
}
opVal = NewOperationValue("driver2", "test2")
mgr.OperationStart(opKey, opVal)
time.Sleep(500 * time.Millisecond)
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 2`, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
// Record, should be down to 1
mgr.RecordMetrics(opKey, nil, "driver")
time.Sleep(500 * time.Millisecond)
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 1`, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
// Start 50 operations, should be 51
for i := 0; i < 50; i++ {
opKey := OperationKey{
Name: fmt.Sprintf("op%d", i),
ResourceID: types.UID("uid%d"),
}
opVal := NewOperationValue("driver1", "test")
mgr.OperationStart(opKey, opVal)
}
time.Sleep(500 * time.Millisecond)
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 51`, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}
func verifyInFlightMetric(expected string, srvAddr string) error {
rsp, err := http.Get(srvAddr)
if err != nil {
return err
}
if rsp.StatusCode != http.StatusOK {
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
}
r, err := io.ReadAll(rsp.Body)
if err != nil {
return err
}
if !strings.Contains(string(r), expected) {
return fmt.Errorf("failed, not equal")
}
return nil
}
func verifyMetric(expected, srvAddr string) error {
rsp, err := http.Get(srvAddr)
if err != nil {
return err
}
if rsp.StatusCode != http.StatusOK {
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
}
r, err := io.ReadAll(rsp.Body)
if err != nil {
return err
}
format := expfmt.ResponseFormat(rsp.Header)
gotReader := strings.NewReader(string(r))
gotDecoder := expfmt.NewDecoder(gotReader, format)
expectedReader := strings.NewReader(expected)
expectedDecoder := expfmt.NewDecoder(expectedReader, format)
gotMfs := []*cmg.MetricFamily{}
expectedMfs := []*cmg.MetricFamily{}
for {
gotMf := &cmg.MetricFamily{}
gotErr := gotDecoder.Decode(gotMf)
expectedMf := &cmg.MetricFamily{}
if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil {
// return correctly if both are EOF
if expectedErr == io.EOF && gotErr == io.EOF {
break
} else {
return err
}
}
gotMfs = append(gotMfs, gotMf)
expectedMfs = append(expectedMfs, expectedMf)
}
if !containsMetrics(expectedMfs, gotMfs) {
return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r))
}
return nil
}
// sortMfs, sorts metric families in alphabetical order by type.
// currently only supports counter and histogram
func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily {
var sortedMfs []*cmg.MetricFamily
// Sort first by type
sort.Slice(mfs, func(i, j int) bool {
return *mfs[i].Type < *mfs[j].Type
})
// Next, sort by length of name
sort.Slice(mfs, func(i, j int) bool {
return len(*mfs[i].Name) < len(*mfs[j].Name)
})
return sortedMfs
}
func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool {
if len(gotMfs) != len(expectedMfs) {
fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs))
return false
}
// sort metric families for deterministic comparison.
sortedExpectedMfs := sortMfs(expectedMfs)
sortedGotMfs := sortMfs(gotMfs)
// compare expected vs. sorted actual metrics
for k, got := range sortedGotMfs {
matchCount := 0
expected := sortedExpectedMfs[k]
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
(got.Help == nil || *(got.Help) != *(expected.Help)) {
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name)
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type)
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help)
return false
}
numRecords := len(expected.Metric)
if len(got.Metric) < numRecords {
fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords)
return false
}
for i := 0; i < len(got.Metric); i++ {
for j := 0; j < numRecords; j++ {
if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil ||
got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil {
fmt.Printf("got metric and expected metric histogram type mismatch")
return false
}
// labels should be the same
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
continue
}
// metric type specific checks
switch {
case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil:
gh := got.Metric[i].Histogram
eh := expected.Metric[j].Histogram
if gh == nil || eh == nil {
continue
}
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
fmt.Println("got and expected histogram bucket not equal")
continue
}
// this is a sum record, expecting a latency which is more than the
// expected one. If the sum is smaller than expected, it will be considered
// as NOT a match
if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
fmt.Println("difference in sample sum")
continue
}
if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
fmt.Println("difference in sample count")
continue
}
case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil:
gc := got.Metric[i].Counter
ec := expected.Metric[j].Counter
if gc.Value == nil || *(gc.Value) != *(ec.Value) {
fmt.Println("difference in counter values")
continue
}
}
// this is a match
matchCount = matchCount + 1
break
}
}
if matchCount != numRecords {
fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords)
return false
}
}
return true
}
func TestProcessStartTimeMetricExist(t *testing.T) {
mgr, srv := initMgr()
defer shutdown(srv)
metricsFamilies, err := mgr.GetRegistry().Gather()
if err != nil {
t.Fatalf("Error fetching metrics: %v", err)
}
for _, metricsFamily := range metricsFamilies {
if metricsFamily.GetName() == processStartTimeMetric {
return
}
m := metricsFamily.GetMetric()
if m[0].GetGauge().GetValue() <= 0 {
t.Fatalf("Expected non zero timestamp for process start time")
}
}
t.Fatalf("Metrics does not contain %v. Scraped content: %v", processStartTimeMetric, metricsFamilies)
}