741 lines
37 KiB
Go
741 lines
37 KiB
Go
/*
|
|
Copyright 2020 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net"
|
|
"net/http"
|
|
"reflect"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
cmg "github.com/prometheus/client_model/go"
|
|
"github.com/prometheus/common/expfmt"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
)
|
|
|
|
var (
|
|
statusMap map[int]string = map[int]string{
|
|
0: "Success",
|
|
1: "Failure",
|
|
2: "Unknown",
|
|
}
|
|
)
|
|
|
|
const (
|
|
httpPattern = "/metrics"
|
|
addr = "localhost:0"
|
|
processStartTimeMetric = "process_start_time_seconds"
|
|
)
|
|
|
|
type fakeOpStatus struct {
|
|
statusCode int
|
|
}
|
|
|
|
func (s *fakeOpStatus) String() string {
|
|
if str, ok := statusMap[s.statusCode]; ok {
|
|
return str
|
|
}
|
|
return "Unknown"
|
|
}
|
|
|
|
func initMgr() (MetricsManager, *http.Server) {
|
|
mgr := NewMetricsManager()
|
|
mux := http.NewServeMux()
|
|
err := mgr.PrepareMetricsPath(mux, httpPattern, nil)
|
|
if err != nil {
|
|
log.Fatalf("failed to start serving [%v]", err)
|
|
}
|
|
l, err := net.Listen("tcp", addr)
|
|
if err != nil {
|
|
log.Fatalf("failed to listen on address[%s], error[%v]", addr, err)
|
|
}
|
|
srv := &http.Server{Addr: l.Addr().String(), Handler: mux}
|
|
go func() {
|
|
if err := srv.Serve(l); err != http.ErrServerClosed {
|
|
log.Fatalf("failed to start endpoint at:%s/%s, error: %v", addr, httpPattern, err)
|
|
}
|
|
}()
|
|
|
|
return mgr, srv
|
|
}
|
|
|
|
func shutdown(srv *http.Server) {
|
|
if err := srv.Shutdown(context.Background()); err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
|
|
func TestNew(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
defer shutdown(srv)
|
|
if mgr == nil {
|
|
t.Errorf("failed testing new")
|
|
}
|
|
}
|
|
|
|
func TestDropNonExistingOperation(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
defer shutdown(srv)
|
|
op := OperationKey{
|
|
Name: "drop-non-existing-operation-should-be-noop",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
mgr.DropOperation(op)
|
|
}
|
|
|
|
func TestRecordMetricsForNonExistingOperation(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
defer shutdown(srv)
|
|
opKey := OperationKey{
|
|
Name: "no-metrics-should-be-recorded-as-operation-did-not-start",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
mgr.RecordMetrics(opKey, nil, "driver")
|
|
rsp, err := http.Get(srvAddr)
|
|
if err != nil || rsp.StatusCode != http.StatusOK {
|
|
t.Errorf("failed to get response from server %v, %v", err, rsp)
|
|
}
|
|
r, err := io.ReadAll(rsp.Body)
|
|
if err != nil {
|
|
t.Errorf("failed to read response body %v", err)
|
|
}
|
|
if strings.Contains(string(r), opKey.Name) {
|
|
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
|
|
}
|
|
}
|
|
|
|
func TestDropOperation(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
defer shutdown(srv)
|
|
opKey := OperationKey{
|
|
Name: "should-have-been-dropped",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
opVal := NewOperationValue("driver", DynamicSnapshotType)
|
|
mgr.OperationStart(opKey, opVal)
|
|
mgr.DropOperation(opKey)
|
|
time.Sleep(300 * time.Millisecond)
|
|
rsp, err := http.Get(srvAddr)
|
|
if err != nil || rsp.StatusCode != http.StatusOK {
|
|
t.Errorf("failed to get response from server %v, %v", err, rsp)
|
|
}
|
|
r, err := io.ReadAll(rsp.Body)
|
|
if err != nil {
|
|
t.Errorf("failed to read response body %v", err)
|
|
}
|
|
if strings.Contains(string(r), opKey.Name) {
|
|
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
|
|
}
|
|
// re-add with a different name
|
|
opKey.Name = "should-have-been-added"
|
|
mgr.OperationStart(opKey, opVal)
|
|
time.Sleep(300 * time.Millisecond)
|
|
opStatus := &fakeOpStatus{
|
|
statusCode: 0,
|
|
}
|
|
mgr.RecordMetrics(opKey, opStatus, "driver")
|
|
expected :=
|
|
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
|
# TYPE snapshot_controller_operation_total_seconds histogram
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1
|
|
`
|
|
|
|
if err := verifyMetric(expected, srvAddr); err != nil {
|
|
t.Errorf("failed testing[%v]", err)
|
|
}
|
|
}
|
|
|
|
func TestUnknownStatus(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
defer shutdown(srv)
|
|
opKey := OperationKey{
|
|
Name: "unknown-status-operation",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType))
|
|
// should create a Unknown data point with latency ~300ms
|
|
time.Sleep(300 * time.Millisecond)
|
|
mgr.RecordMetrics(opKey, nil, "driver")
|
|
expected :=
|
|
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
|
# TYPE snapshot_controller_operation_total_seconds histogram
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1
|
|
`
|
|
|
|
if err := verifyMetric(expected, srvAddr); err != nil {
|
|
t.Errorf("failed testing[%v]", err)
|
|
}
|
|
}
|
|
|
|
func TestRecordMetrics(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
defer shutdown(srv)
|
|
// add an operation
|
|
opKey := OperationKey{
|
|
Name: "op1",
|
|
ResourceID: types.UID("uid1"),
|
|
}
|
|
opVal := NewOperationValue("driver", DynamicSnapshotType)
|
|
mgr.OperationStart(opKey, opVal)
|
|
// should create a Success data point with latency ~ 1100ms
|
|
time.Sleep(1100 * time.Millisecond)
|
|
success := &fakeOpStatus{
|
|
statusCode: 0,
|
|
}
|
|
mgr.RecordMetrics(opKey, success, "driver")
|
|
|
|
// add another operation metric
|
|
opKey.Name = "op2"
|
|
opKey.ResourceID = types.UID("uid2")
|
|
mgr.OperationStart(opKey, opVal)
|
|
// should create a Failure data point with latency ~ 100ms
|
|
time.Sleep(100 * time.Millisecond)
|
|
failure := &fakeOpStatus{
|
|
statusCode: 1,
|
|
}
|
|
mgr.RecordMetrics(opKey, failure, "driver")
|
|
|
|
expected :=
|
|
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
|
# TYPE snapshot_controller_operation_total_seconds histogram
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1
|
|
`
|
|
if err := verifyMetric(expected, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
}
|
|
|
|
func TestConcurrency(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
defer shutdown(srv)
|
|
success := &fakeOpStatus{
|
|
statusCode: 0,
|
|
}
|
|
failure := &fakeOpStatus{
|
|
statusCode: 1,
|
|
}
|
|
ops := []struct {
|
|
op OperationKey
|
|
desiredLatencyMs time.Duration
|
|
status OperationStatus
|
|
drop bool
|
|
}{
|
|
{
|
|
OperationKey{
|
|
Name: "success1",
|
|
ResourceID: types.UID("uid1"),
|
|
},
|
|
100,
|
|
success,
|
|
false,
|
|
},
|
|
{
|
|
OperationKey{
|
|
Name: "success2",
|
|
ResourceID: types.UID("uid2"),
|
|
},
|
|
100,
|
|
success,
|
|
false,
|
|
},
|
|
{
|
|
OperationKey{
|
|
Name: "failure1",
|
|
ResourceID: types.UID("uid3"),
|
|
},
|
|
100,
|
|
failure,
|
|
false,
|
|
},
|
|
{
|
|
OperationKey{
|
|
Name: "failure2",
|
|
ResourceID: types.UID("uid4"),
|
|
},
|
|
100,
|
|
failure,
|
|
false,
|
|
},
|
|
{
|
|
OperationKey{
|
|
Name: "unknown",
|
|
ResourceID: types.UID("uid5"),
|
|
},
|
|
100,
|
|
nil,
|
|
false,
|
|
},
|
|
{
|
|
OperationKey{
|
|
Name: "drop-from-cache",
|
|
ResourceID: types.UID("uid6"),
|
|
},
|
|
100,
|
|
nil,
|
|
true,
|
|
},
|
|
}
|
|
|
|
for i := range ops {
|
|
mgr.OperationStart(ops[i].op, OperationValue{
|
|
Driver: fmt.Sprintf("driver%v", i),
|
|
SnapshotType: string(DynamicSnapshotType),
|
|
})
|
|
}
|
|
// add an extra operation which should remain in the cache
|
|
remaining := OperationKey{
|
|
Name: "remaining-in-cache",
|
|
ResourceID: types.UID("uid7"),
|
|
}
|
|
mgr.OperationStart(remaining, OperationValue{
|
|
Driver: "driver7",
|
|
SnapshotType: string(DynamicSnapshotType),
|
|
})
|
|
|
|
var wgMetrics sync.WaitGroup
|
|
|
|
for i := range ops {
|
|
wgMetrics.Add(1)
|
|
go func(i int) {
|
|
defer wgMetrics.Done()
|
|
if ops[i].desiredLatencyMs > 0 {
|
|
time.Sleep(ops[i].desiredLatencyMs * time.Millisecond)
|
|
}
|
|
if ops[i].drop {
|
|
mgr.DropOperation(ops[i].op)
|
|
} else {
|
|
mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i))
|
|
}
|
|
}(i)
|
|
}
|
|
wgMetrics.Wait()
|
|
|
|
// validate
|
|
expected :=
|
|
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
|
|
# TYPE snapshot_controller_operation_total_seconds histogram
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1
|
|
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
|
|
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1
|
|
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1
|
|
`
|
|
if err := verifyMetric(expected, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
}
|
|
|
|
func TestInFlightMetric(t *testing.T) {
|
|
inFlightCheckInterval = time.Millisecond * 50
|
|
|
|
mgr, srv := initMgr()
|
|
defer shutdown(srv)
|
|
srvAddr := "http://" + srv.Addr + httpPattern
|
|
|
|
// Start first operation, should be 1
|
|
opKey := OperationKey{
|
|
Name: "leaked",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
opVal := NewOperationValue("driver", "test")
|
|
mgr.OperationStart(opKey, opVal)
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 1`, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
|
|
// Start second operation, should be 2
|
|
opKey = OperationKey{
|
|
Name: "leaked2",
|
|
ResourceID: types.UID("uid"),
|
|
}
|
|
opVal = NewOperationValue("driver2", "test2")
|
|
mgr.OperationStart(opKey, opVal)
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 2`, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
|
|
// Record, should be down to 1
|
|
mgr.RecordMetrics(opKey, nil, "driver")
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 1`, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
|
|
// Start 50 operations, should be 51
|
|
for i := 0; i < 50; i++ {
|
|
opKey := OperationKey{
|
|
Name: fmt.Sprintf("op%d", i),
|
|
ResourceID: types.UID("uid%d"),
|
|
}
|
|
opVal := NewOperationValue("driver1", "test")
|
|
mgr.OperationStart(opKey, opVal)
|
|
}
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
if err := verifyInFlightMetric(`snapshot_controller_operations_in_flight 51`, srvAddr); err != nil {
|
|
t.Errorf("failed testing [%v]", err)
|
|
}
|
|
}
|
|
|
|
func verifyInFlightMetric(expected string, srvAddr string) error {
|
|
rsp, err := http.Get(srvAddr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if rsp.StatusCode != http.StatusOK {
|
|
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
|
|
}
|
|
r, err := io.ReadAll(rsp.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !strings.Contains(string(r), expected) {
|
|
return fmt.Errorf("failed, not equal")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func verifyMetric(expected, srvAddr string) error {
|
|
rsp, err := http.Get(srvAddr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if rsp.StatusCode != http.StatusOK {
|
|
return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode))
|
|
}
|
|
r, err := io.ReadAll(rsp.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
format := expfmt.ResponseFormat(rsp.Header)
|
|
gotReader := strings.NewReader(string(r))
|
|
gotDecoder := expfmt.NewDecoder(gotReader, format)
|
|
expectedReader := strings.NewReader(expected)
|
|
expectedDecoder := expfmt.NewDecoder(expectedReader, format)
|
|
|
|
gotMfs := []*cmg.MetricFamily{}
|
|
expectedMfs := []*cmg.MetricFamily{}
|
|
for {
|
|
gotMf := &cmg.MetricFamily{}
|
|
gotErr := gotDecoder.Decode(gotMf)
|
|
expectedMf := &cmg.MetricFamily{}
|
|
if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil {
|
|
// return correctly if both are EOF
|
|
if expectedErr == io.EOF && gotErr == io.EOF {
|
|
break
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
gotMfs = append(gotMfs, gotMf)
|
|
expectedMfs = append(expectedMfs, expectedMf)
|
|
}
|
|
|
|
if !containsMetrics(expectedMfs, gotMfs) {
|
|
return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r))
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// sortMfs, sorts metric families in alphabetical order by type.
|
|
// currently only supports counter and histogram
|
|
func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily {
|
|
var sortedMfs []*cmg.MetricFamily
|
|
|
|
// Sort first by type
|
|
sort.Slice(mfs, func(i, j int) bool {
|
|
return *mfs[i].Type < *mfs[j].Type
|
|
})
|
|
|
|
// Next, sort by length of name
|
|
sort.Slice(mfs, func(i, j int) bool {
|
|
return len(*mfs[i].Name) < len(*mfs[j].Name)
|
|
})
|
|
|
|
return sortedMfs
|
|
}
|
|
|
|
func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool {
|
|
if len(gotMfs) != len(expectedMfs) {
|
|
fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs))
|
|
return false
|
|
}
|
|
|
|
// sort metric families for deterministic comparison.
|
|
sortedExpectedMfs := sortMfs(expectedMfs)
|
|
sortedGotMfs := sortMfs(gotMfs)
|
|
|
|
// compare expected vs. sorted actual metrics
|
|
for k, got := range sortedGotMfs {
|
|
matchCount := 0
|
|
expected := sortedExpectedMfs[k]
|
|
|
|
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
|
|
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
|
|
(got.Help == nil || *(got.Help) != *(expected.Help)) {
|
|
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name)
|
|
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type)
|
|
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help)
|
|
return false
|
|
}
|
|
|
|
numRecords := len(expected.Metric)
|
|
if len(got.Metric) < numRecords {
|
|
fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords)
|
|
return false
|
|
}
|
|
for i := 0; i < len(got.Metric); i++ {
|
|
for j := 0; j < numRecords; j++ {
|
|
if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil ||
|
|
got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil {
|
|
fmt.Printf("got metric and expected metric histogram type mismatch")
|
|
return false
|
|
}
|
|
|
|
// labels should be the same
|
|
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
|
|
continue
|
|
}
|
|
|
|
// metric type specific checks
|
|
switch {
|
|
case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil:
|
|
gh := got.Metric[i].Histogram
|
|
eh := expected.Metric[j].Histogram
|
|
if gh == nil || eh == nil {
|
|
continue
|
|
}
|
|
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
|
|
fmt.Println("got and expected histogram bucket not equal")
|
|
continue
|
|
}
|
|
|
|
// this is a sum record, expecting a latency which is more than the
|
|
// expected one. If the sum is smaller than expected, it will be considered
|
|
// as NOT a match
|
|
if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
|
|
fmt.Println("difference in sample sum")
|
|
continue
|
|
}
|
|
if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
|
|
fmt.Println("difference in sample count")
|
|
continue
|
|
}
|
|
|
|
case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil:
|
|
gc := got.Metric[i].Counter
|
|
ec := expected.Metric[j].Counter
|
|
if gc.Value == nil || *(gc.Value) != *(ec.Value) {
|
|
fmt.Println("difference in counter values")
|
|
continue
|
|
}
|
|
}
|
|
|
|
// this is a match
|
|
matchCount = matchCount + 1
|
|
break
|
|
}
|
|
}
|
|
|
|
if matchCount != numRecords {
|
|
fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords)
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func TestProcessStartTimeMetricExist(t *testing.T) {
|
|
mgr, srv := initMgr()
|
|
defer shutdown(srv)
|
|
metricsFamilies, err := mgr.GetRegistry().Gather()
|
|
if err != nil {
|
|
t.Fatalf("Error fetching metrics: %v", err)
|
|
}
|
|
|
|
for _, metricsFamily := range metricsFamilies {
|
|
if metricsFamily.GetName() == processStartTimeMetric {
|
|
return
|
|
}
|
|
m := metricsFamily.GetMetric()
|
|
if m[0].GetGauge().GetValue() <= 0 {
|
|
t.Fatalf("Expected non zero timestamp for process start time")
|
|
}
|
|
}
|
|
|
|
t.Fatalf("Metrics does not contain %v. Scraped content: %v", processStartTimeMetric, metricsFamilies)
|
|
}
|