Merge pull request #409 from ggriffiths/snapshot_controller_metrics

Add snapshot controller metrics
This commit is contained in:
Kubernetes Prow Robot
2020-12-02 22:49:00 -08:00
committed by GitHub
7 changed files with 655 additions and 282 deletions

View File

@@ -22,6 +22,7 @@ import (
"fmt"
"os"
"os/signal"
"sync"
"time"
"k8s.io/client-go/kubernetes"
@@ -32,6 +33,7 @@ import (
"github.com/kubernetes-csi/csi-lib-utils/leaderelection"
controller "github.com/kubernetes-csi/external-snapshotter/v3/pkg/common-controller"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned"
snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme"
@@ -48,6 +50,9 @@ var (
leaderElection = flag.Bool("leader-election", false, "Enables leader election.")
leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.")
httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.")
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
)
var (
@@ -87,6 +92,28 @@ func main() {
factory := informers.NewSharedInformerFactory(snapClient, *resyncPeriod)
coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, *resyncPeriod)
// Create and register metrics manager
metricsManager := metrics.NewMetricsManager()
wg := &sync.WaitGroup{}
wg.Add(1)
if *httpEndpoint != "" {
srv, err := metricsManager.StartMetricsEndpoint(*metricsPath, *httpEndpoint, promklog{}, wg)
if err != nil {
klog.Errorf("Failed to start metrics server: %s", err.Error())
os.Exit(1)
}
defer func() {
err := srv.Shutdown(context.Background())
if err != nil {
klog.Errorf("Failed to shutdown metrics server: %s", err.Error())
}
klog.Infof("Metrics server successfully shutdown")
wg.Done()
}()
klog.Infof("Metrics server successfully started on %s, %s", *httpEndpoint, *metricsPath)
}
// Add Snapshot types to the default Kubernetes so events can be logged for them
snapshotscheme.AddToScheme(scheme.Scheme)
@@ -99,6 +126,7 @@ func main() {
factory.Snapshot().V1beta1().VolumeSnapshotContents(),
factory.Snapshot().V1beta1().VolumeSnapshotClasses(),
coreFactory.Core().V1().PersistentVolumeClaims(),
metricsManager,
*resyncPeriod,
)
@@ -142,3 +170,9 @@ func buildConfig(kubeconfig string) (*rest.Config, error) {
}
return rest.InClusterConfig()
}
type promklog struct{}
func (pl promklog) Println(v ...interface{}) {
klog.Error(v...)
}

View File

@@ -34,6 +34,7 @@ import (
snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme"
informers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions"
storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
@@ -734,6 +735,10 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
}
coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, utils.NoResyncPeriodFunc())
metricsManager := metrics.NewMetricsManager()
wg := &sync.WaitGroup{}
wg.Add(1)
metricsManager.StartMetricsEndpoint("/metrics", "localhost:0", nil, wg)
ctrl := NewCSISnapshotCommonController(
clientset,
@@ -742,6 +747,7 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
informerFactory.Snapshot().V1beta1().VolumeSnapshotContents(),
informerFactory.Snapshot().V1beta1().VolumeSnapshotClasses(),
coreFactory.Core().V1().PersistentVolumeClaims(),
metricsManager,
60*time.Second,
)

View File

@@ -32,6 +32,7 @@ import (
klog "k8s.io/klog/v2"
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v3/apis/volumesnapshot/v1beta1"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
)
@@ -235,6 +236,20 @@ func (ctrl *csiSnapshotCommonController) syncSnapshot(snapshot *crdv1.VolumeSnap
// 2. Call checkandRemoveSnapshotFinalizersAndCheckandDeleteContent() with information obtained from step 1. This function name is very long but the name suggests what it does. It determines whether to remove finalizers on snapshot and whether to delete content.
func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(snapshot *crdv1.VolumeSnapshot) error {
klog.V(5).Infof("processSnapshotWithDeletionTimestamp VolumeSnapshot[%s]: %s", utils.SnapshotKey(snapshot), utils.GetSnapshotStatusForLogging(snapshot))
driverName, err := ctrl.getSnapshotDriverName(snapshot)
if err != nil {
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %v", utils.SnapshotKey(snapshot), err)
}
snapshotProvisionType := metrics.DynamicSnapshotType
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
snapshotProvisionType = metrics.PreProvisionedSnapshotType
}
// Processing delete, start operation metric
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID)
deleteOperationValue := metrics.NewOperationValue(driverName, snapshotProvisionType)
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)
var contentName string
if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil {
@@ -269,6 +284,7 @@ func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(sn
}
klog.V(5).Infof("processSnapshotWithDeletionTimestamp[%s]: delete snapshot content and remove finalizer from snapshot if needed", utils.SnapshotKey(snapshot))
return ctrl.checkandRemoveSnapshotFinalizersAndCheckandDeleteContent(snapshot, content, deleteContent)
}
@@ -388,6 +404,7 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum
// snapshot is bound but content is not pointing to the snapshot
return ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotMisbound", "VolumeSnapshotContent is not bound to the VolumeSnapshot correctly")
}
// everything is verified, return
return nil
}
@@ -396,6 +413,28 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum
func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.VolumeSnapshot) error {
uniqueSnapshotName := utils.SnapshotKey(snapshot)
klog.V(5).Infof("syncUnreadySnapshot %s", uniqueSnapshotName)
driverName, err := ctrl.getSnapshotDriverName(snapshot)
if err != nil {
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err)
}
snapshotProvisionType := metrics.DynamicSnapshotType
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
snapshotProvisionType = metrics.PreProvisionedSnapshotType
}
// Start metrics operations
if !utils.IsSnapshotCreated(snapshot) {
// Only start CreateSnapshot operation if the snapshot has not been cut
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID),
metrics.NewOperationValue(driverName, snapshotProvisionType),
)
}
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID),
metrics.NewOperationValue(driverName, snapshotProvisionType),
)
// Pre-provisioned snapshot
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
@@ -403,13 +442,16 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol
if err != nil {
return err
}
// if no content found yet, update status and return
if content == nil {
// can not find the desired VolumeSnapshotContent from cache store
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotContentMissing", "VolumeSnapshotContent is missing")
klog.V(4).Infof("syncUnreadySnapshot[%s]: snapshot content %q requested but not found, will try again", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName)
return fmt.Errorf("snapshot %s requests an non-existing content %s", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName)
}
// Set VolumeSnapshotRef UID
newContent, err := ctrl.checkandBindSnapshotContent(snapshot, content)
if err != nil {
@@ -426,8 +468,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err))
return err
}
return nil
}
// snapshot.Spec.Source.VolumeSnapshotContentName == nil - dynamically creating snapshot
klog.V(5).Infof("getDynamicallyProvisionedContentFromStore for snapshot %s", uniqueSnapshotName)
contentObj, err := ctrl.getDynamicallyProvisionedContentFromStore(snapshot)
@@ -1094,10 +1138,30 @@ func (ctrl *csiSnapshotCommonController) updateSnapshotStatus(snapshot *crdv1.Vo
if updated {
snapshotClone := snapshotObj.DeepCopy()
snapshotClone.Status = newStatus
// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
// Must meet the following criteria to emit a successful CreateSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
driverName := content.Spec.Driver
createOperationKey := metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID)
if !utils.IsSnapshotCreated(snapshotObj) && utils.IsSnapshotCreated(snapshotClone) {
ctrl.metricsManager.RecordMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}
// Must meet the following criteria to emit a successful CreateSnapshotAndReady status
// 1. Previous status was nil OR Previous status had a nil ReadyToUse OR Previous status had a false ReadyToUse
// 2. New status must be non-nil with a ReadyToUse as true
if !utils.IsSnapshotReady(snapshotObj) && utils.IsSnapshotReady(snapshotClone) {
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID)
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}
newSnapshotObj, err := ctrl.clientset.SnapshotV1beta1().VolumeSnapshots(snapshotClone.Namespace).UpdateStatus(context.TODO(), snapshotClone, metav1.UpdateOptions{})
if err != nil {
return nil, newControllerUpdateError(utils.SnapshotKey(snapshot), err.Error())
}
return newSnapshotObj, nil
}
@@ -1177,6 +1241,45 @@ func (ctrl *csiSnapshotCommonController) getSnapshotClass(className string) (*cr
return class, nil
}
// getSnapshotDriverName is a helper function to get snapshot driver from the VolumeSnapshot.
// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName.
func (ctrl *csiSnapshotCommonController) getSnapshotDriverName(vs *crdv1.VolumeSnapshot) (string, error) {
klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vs.Name)
var driverName string
// Pre-Provisioned snapshots have contentName as source
var contentName string
if vs.Spec.Source.VolumeSnapshotContentName != nil {
contentName = *vs.Spec.Source.VolumeSnapshotContentName
}
// Get Driver name from SnapshotContent if we found a contentName
if contentName != "" {
content, err := ctrl.contentLister.Get(contentName)
if err != nil {
klog.Errorf("getSnapshotDriverName: failed to get snapshotContent: %v", contentName)
} else {
driverName = content.Spec.Driver
}
if driverName != "" {
return driverName, nil
}
}
// Dynamic snapshots will have a snapshotclass with a driver
if vs.Spec.VolumeSnapshotClassName != nil {
class, err := ctrl.getSnapshotClass(*vs.Spec.VolumeSnapshotClassName)
if err != nil {
klog.Errorf("getSnapshotDriverName: failed to get snapshotClass: %v", *vs.Spec.VolumeSnapshotClassName)
} else {
driverName = class.Driver
}
}
return driverName, nil
}
// SetDefaultSnapshotClass is a helper function to figure out the default snapshot class.
// For pre-provisioned case, it's an no-op.
// For dynamic provisioning, it gets the default SnapshotClasses in the system if there is any(could be multiple),

View File

@@ -24,6 +24,7 @@ import (
clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned"
storageinformers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions/volumesnapshot/v1beta1"
storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
v1 "k8s.io/api/core/v1"
@@ -60,6 +61,8 @@ type csiSnapshotCommonController struct {
snapshotStore cache.Store
contentStore cache.Store
metricsManager metrics.MetricsManager
resyncPeriod time.Duration
}
@@ -71,6 +74,7 @@ func NewCSISnapshotCommonController(
volumeSnapshotContentInformer storageinformers.VolumeSnapshotContentInformer,
volumeSnapshotClassInformer storageinformers.VolumeSnapshotClassInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
metricsManager metrics.MetricsManager,
resyncPeriod time.Duration,
) *csiSnapshotCommonController {
broadcaster := record.NewBroadcaster()
@@ -80,14 +84,15 @@ func NewCSISnapshotCommonController(
eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: fmt.Sprintf("snapshot-controller")})
ctrl := &csiSnapshotCommonController{
clientset: clientset,
client: client,
eventRecorder: eventRecorder,
resyncPeriod: resyncPeriod,
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
clientset: clientset,
client: client,
eventRecorder: eventRecorder,
resyncPeriod: resyncPeriod,
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
metricsManager: metricsManager,
}
ctrl.pvcLister = pvcInformer.Lister()
@@ -363,6 +368,7 @@ func (ctrl *csiSnapshotCommonController) updateSnapshot(snapshot *crdv1.VolumeSn
if !newSnapshot {
return nil
}
err = ctrl.syncSnapshot(snapshot)
if err != nil {
if errors.IsConflict(err) {
@@ -407,6 +413,13 @@ func (ctrl *csiSnapshotCommonController) updateContent(content *crdv1.VolumeSnap
func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSnapshot) {
_ = ctrl.snapshotStore.Delete(snapshot)
klog.V(4).Infof("snapshot %q deleted", utils.SnapshotKey(snapshot))
driverName, err := ctrl.getSnapshotDriverName(snapshot)
if err != nil {
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err)
} else {
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID)
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}
snapshotContentName := ""
if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil {
@@ -416,6 +429,7 @@ func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSn
klog.V(5).Infof("deleteSnapshot[%q]: content not bound", utils.SnapshotKey(snapshot))
return
}
// sync the content when its snapshot is deleted. Explicitly sync'ing the
// content here in response to snapshot deletion prevents the content from
// waiting until the next sync period for its Release.

View File

@@ -30,13 +30,48 @@ import (
)
const (
opStatusUnknown = "Unknown"
labelDriverName = "driver_name"
labelOperationName = "operation_name"
labelOperationStatus = "operation_status"
subSystem = "snapshot_controller"
metricName = "operation_total_seconds"
metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end"
labelDriverName = "driver_name"
labelOperationName = "operation_name"
labelOperationStatus = "operation_status"
labelSnapshotType = "snapshot_type"
subSystem = "snapshot_controller"
operationLatencyMetricName = "operation_total_seconds"
operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation"
unknownDriverName = "unknown"
// CreateSnapshotOperationName is the operation that tracks how long the controller takes to create a snapshot.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeSnapshot CR to dynamically provision a snapshot
// - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil
CreateSnapshotOperationName = "CreateSnapshot"
// CreateSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a snapshot and for it to be ready.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeSnapshot CR(both dynamic and pre-provisioned cases)
// - End_time: controller notices that the CR has a status with Ready field set to be true
CreateSnapshotAndReadyOperationName = "CreateSnapshotAndReady"
// DeleteSnapshotOperationName is the operation that tracks how long a snapshot deletion takes.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeSnapshot CR and the CR is ready to be deleted. Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time.
// - End_time: controller removed all finalizers on the VolumeSnapshot CR such that the CR is ready to be removed in the API server.
DeleteSnapshotOperationName = "DeleteSnapshot"
// DynamicSnapshotType represents a snapshot that is being dynamically provisioned
DynamicSnapshotType = snapshotProvisionType("dynamic")
// PreProvisionedSnapshotType represents a snapshot that is pre-provisioned
PreProvisionedSnapshotType = snapshotProvisionType("pre-provisioned")
// SnapshotStatusTypeUnknown represents that the status is unknown
SnapshotStatusTypeUnknown snapshotStatusType = "unknown"
// Success and Cancel are statuses for operation time (operation_total_seconds) as seen by snapshot controller
// SnapshotStatusTypeSuccess represents that a CreateSnapshot, CreateSnapshotAndReady,
// or DeleteSnapshot has finished successfully.
// Individual reconciliations (reconciliation_total_seconds) also use this status.
SnapshotStatusTypeSuccess snapshotStatusType = "success"
// SnapshotStatusTypeCancel represents that a CreateSnapshot, CreateSnapshotAndReady,
// or DeleteSnapshot has been deleted before finishing.
SnapshotStatusTypeCancel snapshotStatusType = "cancel"
)
// OperationStatus is the interface type for representing an operation's execution
@@ -57,11 +92,11 @@ type MetricsManager interface {
// OperationStart takes in an operation and caches its start time.
// if the operation already exists, it's an no-op.
OperationStart(op Operation)
OperationStart(key OperationKey, val OperationValue)
// DropOperation removes an operation from cache.
// if the operation does not exist, it's an no-op.
DropOperation(op Operation)
DropOperation(op OperationKey)
// RecordMetrics records a metric point. Note that it will be an no-op if an
// operation has NOT been marked "Started" previously via invoking "OperationStart".
@@ -69,20 +104,49 @@ type MetricsManager interface {
// op - the operation which the metric is associated with.
// status - the operation status, if not specified, i.e., status == nil, an
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op Operation, status OperationStatus)
RecordMetrics(op OperationKey, status OperationStatus, driverName string)
}
// Operation is a structure which holds information to identify a snapshot
// related operation
type Operation struct {
// the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
// OperationKey is a structure which holds information to
// uniquely identify a snapshot related operation
type OperationKey struct {
// Name is the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
Name string
// the name of the driver which executes the operation
Driver string
// the resource UID to which the operation has been executed against
// ResourceID is the resource UID to which the operation has been executed against
ResourceID types.UID
}
// OperationValue is a structure which holds operation metadata
type OperationValue struct {
// Driver is the driver name which executes the operation
Driver string
// SnapshotType represents the snapshot type, for example: "dynamic", "pre-provisioned"
SnapshotType string
// startTime is the time when the operation first started
startTime time.Time
}
// NewOperationKey initializes a new OperationKey
func NewOperationKey(name string, snapshotUID types.UID) OperationKey {
return OperationKey{
Name: name,
ResourceID: snapshotUID,
}
}
// NewOperationValue initializes a new OperationValue
func NewOperationValue(driver string, snapshotType snapshotProvisionType) OperationValue {
if driver == "" {
driver = unknownDriverName
}
return OperationValue{
Driver: driver,
SnapshotType: string(snapshotType),
}
}
type operationMetricsManager struct {
// cache is a concurrent-safe map which stores start timestamps for all
// ongoing operations.
@@ -93,10 +157,11 @@ type operationMetricsManager struct {
// registry is a wrapper around Prometheus Registry
registry k8smetrics.KubeRegistry
// opLatencyMetrics is a Histogram metrics
// opLatencyMetrics is a Histogram metrics for operation time per request
opLatencyMetrics *k8smetrics.HistogramVec
}
// NewMetricsManager creates a new MetricsManager instance
func NewMetricsManager() MetricsManager {
mgr := &operationMetricsManager{
cache: sync.Map{},
@@ -105,34 +170,83 @@ func NewMetricsManager() MetricsManager {
return mgr
}
func (opMgr *operationMetricsManager) OperationStart(op Operation) {
opMgr.cache.LoadOrStore(op, time.Now())
// OperationStart starts a new operation
func (opMgr *operationMetricsManager) OperationStart(key OperationKey, val OperationValue) {
val.startTime = time.Now()
opMgr.cache.LoadOrStore(key, val)
}
func (opMgr *operationMetricsManager) DropOperation(op Operation) {
// OperationStart drops an operation
func (opMgr *operationMetricsManager) DropOperation(op OperationKey) {
opMgr.cache.Delete(op)
}
func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) {
obj, exists := opMgr.cache.Load(op)
// RecordMetrics emits operation metrics
func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
obj, exists := opMgr.cache.Load(opKey)
if !exists {
// the operation has not been cached, return directly
return
}
ts, ok := obj.(time.Time)
opVal, ok := obj.(OperationValue)
if !ok {
// the cached item is not a time.Time, should NEVER happen, clean and return
klog.Errorf("Invalid cache entry for key %v", op)
opMgr.cache.Delete(op)
// the cached item is not a OperationValue, should NEVER happen, clean and return
klog.Errorf("Invalid cache entry for key %v", opKey)
opMgr.cache.Delete(opKey)
return
}
strStatus := opStatusUnknown
if status != nil {
strStatus = status.String()
status := string(SnapshotStatusTypeUnknown)
if opStatus != nil {
status = opStatus.String()
}
duration := time.Since(ts).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration)
opMgr.cache.Delete(op)
// if we do not know the driverName while recording metrics,
// refer to the cached version instead.
if driverName == "" || driverName == unknownDriverName {
driverName = opVal.Driver
}
operationDuration := time.Since(opVal.startTime).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration)
// Report cancel metrics if we are deleting an unfinished VolumeSnapshot
if opKey.Name == DeleteSnapshotOperationName {
// check if we have a CreateSnapshot operation pending for this
createKey := NewOperationKey(CreateSnapshotOperationName, opKey.ResourceID)
obj, exists := opMgr.cache.Load(createKey)
if exists {
// record a cancel metric if found
opMgr.recordCancelMetric(obj, createKey, operationDuration)
}
// check if we have a CreateSnapshotAndReady operation pending for this
createAndReadyKey := NewOperationKey(CreateSnapshotAndReadyOperationName, opKey.ResourceID)
obj, exists = opMgr.cache.Load(createAndReadyKey)
if exists {
// record a cancel metric if found
opMgr.recordCancelMetric(obj, createAndReadyKey, operationDuration)
}
}
opMgr.cache.Delete(opKey)
}
// recordCancelMetric records a metric for a create operation that hasn't finished
func (opMgr *operationMetricsManager) recordCancelMetric(obj interface{}, key OperationKey, duration float64) {
// record a cancel metric if found
val, ok := obj.(OperationValue)
if !ok {
klog.Errorf("Invalid cache entry for key %v", key)
opMgr.cache.Delete(key)
return
}
opMgr.opLatencyMetrics.WithLabelValues(
val.Driver,
key.Name,
val.SnapshotType,
string(SnapshotStatusTypeCancel),
).Observe(duration)
opMgr.cache.Delete(key)
}
func (opMgr *operationMetricsManager) init() {
@@ -140,11 +254,11 @@ func (opMgr *operationMetricsManager) init() {
opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec(
&k8smetrics.HistogramOpts{
Subsystem: subSystem,
Name: metricName,
Help: metricHelpMsg,
Name: operationLatencyMetricName,
Help: operationLatencyMetricHelpMsg,
Buckets: metricBuckets,
},
[]string{labelDriverName, labelOperationName, labelOperationStatus},
[]string{labelDriverName, labelOperationName, labelSnapshotType, labelOperationStatus},
)
opMgr.registry.MustRegister(opMgr.opLatencyMetrics)
}
@@ -175,3 +289,25 @@ func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string,
}()
return srv, nil
}
// snapshotProvisionType represents which kind of snapshot a metric is
type snapshotProvisionType string
// snapshotStatusType represents the type of snapshot status to report
type snapshotStatusType string
// SnapshotOperationStatus represents the status for a snapshot controller operation
type SnapshotOperationStatus struct {
status snapshotStatusType
}
// NewSnapshotOperationStatus returns a new SnapshotOperationStatus
func NewSnapshotOperationStatus(status snapshotStatusType) SnapshotOperationStatus {
return SnapshotOperationStatus{
status: status,
}
}
func (sos SnapshotOperationStatus) String() string {
return string(sos.status)
}

View File

@@ -19,10 +19,12 @@ package metrics
import (
"context"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"reflect"
"sort"
"strings"
"sync"
"testing"
@@ -86,9 +88,8 @@ func TestNew(t *testing.T) {
func TestDropNonExistingOperation(t *testing.T) {
mgr, wg, srv := initMgr()
defer shutdown(srv, wg)
op := Operation{
op := OperationKey{
Name: "drop-non-existing-operation-should-be-noop",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.DropOperation(op)
@@ -98,12 +99,11 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
opKey := OperationKey{
Name: "no-metrics-should-be-recorded-as-operation-did-not-start",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.RecordMetrics(op, nil)
mgr.RecordMetrics(opKey, nil, "driver")
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
t.Errorf("failed to get response from server %v, %v", err, rsp)
@@ -112,8 +112,8 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) {
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), op.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
if strings.Contains(string(r), opKey.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
}
}
@@ -121,13 +121,13 @@ func TestDropOperation(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
opKey := OperationKey{
Name: "should-have-been-dropped",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.OperationStart(op)
mgr.DropOperation(op)
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
mgr.DropOperation(opKey)
time.Sleep(300 * time.Millisecond)
rsp, err := http.Get(srvAddr)
if err != nil || rsp.StatusCode != http.StatusOK {
@@ -137,36 +137,36 @@ func TestDropOperation(t *testing.T) {
if err != nil {
t.Errorf("failed to read response body %v", err)
}
if strings.Contains(string(r), op.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r))
if strings.Contains(string(r), opKey.Name) {
t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r))
}
// re-add with a different name
op.Name = "should-have-been-added"
mgr.OperationStart(op)
opKey.Name = "should-have-been-added"
mgr.OperationStart(opKey, opVal)
time.Sleep(300 * time.Millisecond)
opStatus := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(op, opStatus)
mgr.RecordMetrics(opKey, opStatus, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
@@ -178,34 +178,33 @@ func TestUnknownStatus(t *testing.T) {
mgr, wg, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
op := Operation{
opKey := OperationKey{
Name: "unknown-status-operation",
Driver: "driver",
ResourceID: types.UID("uid"),
}
mgr.OperationStart(op)
mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType))
// should create a Unknown data point with latency ~300ms
time.Sleep(300 * time.Millisecond)
mgr.RecordMetrics(op, nil)
mgr.RecordMetrics(opKey, nil, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3
snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
@@ -218,66 +217,65 @@ func TestRecordMetrics(t *testing.T) {
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv, wg)
// add an operation
op := Operation{
opKey := OperationKey{
Name: "op1",
Driver: "driver1",
ResourceID: types.UID("uid1"),
}
mgr.OperationStart(op)
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
// should create a Success data point with latency ~ 1100ms
time.Sleep(1100 * time.Millisecond)
success := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordMetrics(op, success)
mgr.RecordMetrics(opKey, success, "driver")
// add another operation metric
op.Name = "op2"
op.Driver = "driver2"
op.ResourceID = types.UID("uid2")
mgr.OperationStart(op)
opKey.Name = "op2"
opKey.ResourceID = types.UID("uid2")
mgr.OperationStart(opKey, opVal)
// should create a Failure data point with latency ~ 100ms
time.Sleep(100 * time.Millisecond)
failure := &fakeOpStatus{
statusCode: 1,
}
mgr.RecordMetrics(op, failure)
mgr.RecordMetrics(opKey, failure, "driver")
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success"} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
@@ -295,15 +293,14 @@ func TestConcurrency(t *testing.T) {
statusCode: 1,
}
ops := []struct {
op Operation
op OperationKey
desiredLatencyMs time.Duration
status OperationStatus
drop bool
}{
{
Operation{
OperationKey{
Name: "success1",
Driver: "driver1",
ResourceID: types.UID("uid1"),
},
100,
@@ -311,9 +308,8 @@ func TestConcurrency(t *testing.T) {
false,
},
{
Operation{
OperationKey{
Name: "success2",
Driver: "driver2",
ResourceID: types.UID("uid2"),
},
100,
@@ -321,9 +317,8 @@ func TestConcurrency(t *testing.T) {
false,
},
{
Operation{
OperationKey{
Name: "failure1",
Driver: "driver3",
ResourceID: types.UID("uid3"),
},
100,
@@ -331,9 +326,8 @@ func TestConcurrency(t *testing.T) {
false,
},
{
Operation{
OperationKey{
Name: "failure2",
Driver: "driver4",
ResourceID: types.UID("uid4"),
},
100,
@@ -341,9 +335,8 @@ func TestConcurrency(t *testing.T) {
false,
},
{
Operation{
OperationKey{
Name: "unknown",
Driver: "driver5",
ResourceID: types.UID("uid5"),
},
100,
@@ -351,9 +344,8 @@ func TestConcurrency(t *testing.T) {
false,
},
{
Operation{
OperationKey{
Name: "drop-from-cache",
Driver: "driver6",
ResourceID: types.UID("uid6"),
},
100,
@@ -363,15 +355,20 @@ func TestConcurrency(t *testing.T) {
}
for i := range ops {
mgr.OperationStart(ops[i].op)
mgr.OperationStart(ops[i].op, OperationValue{
Driver: fmt.Sprintf("driver%v", i),
SnapshotType: string(DynamicSnapshotType),
})
}
// add an extra operation which should remain in the cache
remaining := Operation{
remaining := OperationKey{
Name: "remaining-in-cache",
Driver: "driver7",
ResourceID: types.UID("uid7"),
}
mgr.OperationStart(remaining)
mgr.OperationStart(remaining, OperationValue{
Driver: "driver7",
SnapshotType: string(DynamicSnapshotType),
})
var wgMetrics sync.WaitGroup
@@ -385,7 +382,7 @@ func TestConcurrency(t *testing.T) {
if ops[i].drop {
mgr.DropOperation(ops[i].op)
} else {
mgr.RecordMetrics(ops[i].op, ops[i].status)
mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i))
}
}(i)
}
@@ -395,86 +392,86 @@ func TestConcurrency(t *testing.T) {
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
@@ -494,63 +491,141 @@ func verifyMetric(expected, srvAddr string) error {
return err
}
format := expfmt.ResponseFormat(rsp.Header)
reader := strings.NewReader(string(r))
decoder := expfmt.NewDecoder(reader, format)
mf := &cmg.MetricFamily{}
if err := decoder.Decode(mf); err != nil {
return err
gotReader := strings.NewReader(string(r))
gotDecoder := expfmt.NewDecoder(gotReader, format)
expectedReader := strings.NewReader(expected)
expectedDecoder := expfmt.NewDecoder(expectedReader, format)
gotMfs := []*cmg.MetricFamily{}
expectedMfs := []*cmg.MetricFamily{}
for {
gotMf := &cmg.MetricFamily{}
gotErr := gotDecoder.Decode(gotMf)
expectedMf := &cmg.MetricFamily{}
if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil {
// return correctly if both are EOF
if expectedErr == io.EOF && gotErr == io.EOF {
break
} else {
return err
}
}
gotMfs = append(gotMfs, gotMf)
expectedMfs = append(expectedMfs, expectedMf)
}
reader = strings.NewReader(expected)
decoder = expfmt.NewDecoder(reader, format)
expectedMf := &cmg.MetricFamily{}
if err := decoder.Decode(expectedMf); err != nil {
return err
}
if !containsMetrics(expectedMf, mf) {
if !containsMetrics(expectedMfs, gotMfs) {
return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r))
}
return nil
}
func containsMetrics(expected, got *cmg.MetricFamily) bool {
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
(got.Help == nil || *(got.Help) != *(expected.Help)) {
// sortMfs, sorts metric families in alphabetical order by type.
// currently only supports counter and histogram
func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily {
var sortedMfs []*cmg.MetricFamily
// Sort first by type
sort.Slice(mfs, func(i, j int) bool {
return *mfs[i].Type < *mfs[j].Type
})
// Next, sort by length of name
sort.Slice(mfs, func(i, j int) bool {
return len(*mfs[i].Name) < len(*mfs[j].Name)
})
return sortedMfs
}
func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool {
if len(gotMfs) != len(expectedMfs) {
fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs))
return false
}
numRecords := len(expected.Metric)
if len(got.Metric) < numRecords {
return false
}
matchCount := 0
for i := 0; i < len(got.Metric); i++ {
for j := 0; j < numRecords; j++ {
// labels should be the same
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
continue
// sort metric families for deterministic comparison.
sortedExpectedMfs := sortMfs(expectedMfs)
sortedGotMfs := sortMfs(gotMfs)
// compare expected vs. sorted actual metrics
for k, got := range sortedGotMfs {
matchCount := 0
expected := sortedExpectedMfs[k]
if (got.Name == nil || *(got.Name) != *(expected.Name)) ||
(got.Type == nil || *(got.Type) != *(expected.Type)) ||
(got.Help == nil || *(got.Help) != *(expected.Help)) {
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name)
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type)
fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help)
return false
}
numRecords := len(expected.Metric)
if len(got.Metric) < numRecords {
fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords)
return false
}
for i := 0; i < len(got.Metric); i++ {
for j := 0; j < numRecords; j++ {
if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil ||
got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil {
fmt.Printf("got metric and expected metric histogram type mismatch")
return false
}
// labels should be the same
if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) {
continue
}
// metric type specific checks
switch {
case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil:
gh := got.Metric[i].Histogram
eh := expected.Metric[j].Histogram
if gh == nil || eh == nil {
continue
}
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
fmt.Println("got and expected histogram bucket not equal")
continue
}
// this is a sum record, expecting a latency which is more than the
// expected one. If the sum is smaller than expected, it will be considered
// as NOT a match
if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
fmt.Println("difference in sample sum")
continue
}
if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
fmt.Println("difference in sample count")
continue
}
case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil:
gc := got.Metric[i].Counter
ec := expected.Metric[j].Counter
if gc.Value == nil || *(gc.Value) != *(ec.Value) {
fmt.Println("difference in counter values")
continue
}
}
// this is a match
matchCount = matchCount + 1
break
}
gh := got.Metric[i].Histogram
eh := expected.Metric[j].Histogram
if gh == nil {
continue
}
if !reflect.DeepEqual(gh.Bucket, eh.Bucket) {
continue
}
// this is a sum record, expecting a latency which is more than the
// expected one. If the sum is smaller than expected, it will be considered
// as NOT a match
if gh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) {
continue
}
if gh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) {
continue
}
// this is a match
matchCount = matchCount + 1
break
}
if matchCount != numRecords {
fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords)
return false
}
}
return matchCount == numRecords
return true
}

View File

@@ -519,3 +519,8 @@ func IsSnapshotReady(snapshot *crdv1.VolumeSnapshot) bool {
}
return true
}
// IsSnapshotCreated indicates that the snapshot has been cut on a storage system
func IsSnapshotCreated(snapshot *crdv1.VolumeSnapshot) bool {
return snapshot.Status != nil && snapshot.Status.CreationTime != nil
}