diff --git a/cmd/snapshot-controller/main.go b/cmd/snapshot-controller/main.go index 8df295cb..0b6e84f4 100644 --- a/cmd/snapshot-controller/main.go +++ b/cmd/snapshot-controller/main.go @@ -22,6 +22,7 @@ import ( "fmt" "os" "os/signal" + "sync" "time" "k8s.io/client-go/kubernetes" @@ -32,6 +33,7 @@ import ( "github.com/kubernetes-csi/csi-lib-utils/leaderelection" controller "github.com/kubernetes-csi/external-snapshotter/v3/pkg/common-controller" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned" snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme" @@ -48,6 +50,9 @@ var ( leaderElection = flag.Bool("leader-election", false, "Enables leader election.") leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.") + + httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.") + metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") ) var ( @@ -87,6 +92,28 @@ func main() { factory := informers.NewSharedInformerFactory(snapClient, *resyncPeriod) coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, *resyncPeriod) + // Create and register metrics manager + metricsManager := metrics.NewMetricsManager() + wg := &sync.WaitGroup{} + wg.Add(1) + if *httpEndpoint != "" { + srv, err := metricsManager.StartMetricsEndpoint(*metricsPath, *httpEndpoint, promklog{}, wg) + if err != nil { + klog.Errorf("Failed to start metrics server: %s", err.Error()) + os.Exit(1) + } + defer func() { + err := srv.Shutdown(context.Background()) + if err != nil { + klog.Errorf("Failed to shutdown metrics server: %s", err.Error()) + } + + klog.Infof("Metrics server successfully shutdown") + wg.Done() + }() + klog.Infof("Metrics server successfully started on %s, %s", *httpEndpoint, *metricsPath) + } + // Add Snapshot types to the default Kubernetes so events can be logged for them snapshotscheme.AddToScheme(scheme.Scheme) @@ -99,6 +126,7 @@ func main() { factory.Snapshot().V1beta1().VolumeSnapshotContents(), factory.Snapshot().V1beta1().VolumeSnapshotClasses(), coreFactory.Core().V1().PersistentVolumeClaims(), + metricsManager, *resyncPeriod, ) @@ -142,3 +170,9 @@ func buildConfig(kubeconfig string) (*rest.Config, error) { } return rest.InClusterConfig() } + +type promklog struct{} + +func (pl promklog) Println(v ...interface{}) { + klog.Error(v...) +} diff --git a/pkg/common-controller/framework_test.go b/pkg/common-controller/framework_test.go index 62c85d66..1660e1df 100644 --- a/pkg/common-controller/framework_test.go +++ b/pkg/common-controller/framework_test.go @@ -34,6 +34,7 @@ import ( snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme" informers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions" storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -734,6 +735,10 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte } coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, utils.NoResyncPeriodFunc()) + metricsManager := metrics.NewMetricsManager() + wg := &sync.WaitGroup{} + wg.Add(1) + metricsManager.StartMetricsEndpoint("/metrics", "localhost:0", nil, wg) ctrl := NewCSISnapshotCommonController( clientset, @@ -742,6 +747,7 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte informerFactory.Snapshot().V1beta1().VolumeSnapshotContents(), informerFactory.Snapshot().V1beta1().VolumeSnapshotClasses(), coreFactory.Core().V1().PersistentVolumeClaims(), + metricsManager, 60*time.Second, ) diff --git a/pkg/common-controller/snapshot_controller.go b/pkg/common-controller/snapshot_controller.go index 34b62774..63a2458d 100644 --- a/pkg/common-controller/snapshot_controller.go +++ b/pkg/common-controller/snapshot_controller.go @@ -32,6 +32,7 @@ import ( klog "k8s.io/klog/v2" crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v3/apis/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" ) @@ -235,6 +236,20 @@ func (ctrl *csiSnapshotCommonController) syncSnapshot(snapshot *crdv1.VolumeSnap // 2. Call checkandRemoveSnapshotFinalizersAndCheckandDeleteContent() with information obtained from step 1. This function name is very long but the name suggests what it does. It determines whether to remove finalizers on snapshot and whether to delete content. func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(snapshot *crdv1.VolumeSnapshot) error { klog.V(5).Infof("processSnapshotWithDeletionTimestamp VolumeSnapshot[%s]: %s", utils.SnapshotKey(snapshot), utils.GetSnapshotStatusForLogging(snapshot)) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %v", utils.SnapshotKey(snapshot), err) + } + + snapshotProvisionType := metrics.DynamicSnapshotType + if snapshot.Spec.Source.VolumeSnapshotContentName != nil { + snapshotProvisionType = metrics.PreProvisionedSnapshotType + } + + // Processing delete, start operation metric + deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID) + deleteOperationValue := metrics.NewOperationValue(driverName, snapshotProvisionType) + ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue) var contentName string if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil { @@ -269,6 +284,7 @@ func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(sn } klog.V(5).Infof("processSnapshotWithDeletionTimestamp[%s]: delete snapshot content and remove finalizer from snapshot if needed", utils.SnapshotKey(snapshot)) + return ctrl.checkandRemoveSnapshotFinalizersAndCheckandDeleteContent(snapshot, content, deleteContent) } @@ -388,6 +404,7 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum // snapshot is bound but content is not pointing to the snapshot return ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotMisbound", "VolumeSnapshotContent is not bound to the VolumeSnapshot correctly") } + // everything is verified, return return nil } @@ -396,6 +413,28 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.VolumeSnapshot) error { uniqueSnapshotName := utils.SnapshotKey(snapshot) klog.V(5).Infof("syncUnreadySnapshot %s", uniqueSnapshotName) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err) + } + + snapshotProvisionType := metrics.DynamicSnapshotType + if snapshot.Spec.Source.VolumeSnapshotContentName != nil { + snapshotProvisionType = metrics.PreProvisionedSnapshotType + } + + // Start metrics operations + if !utils.IsSnapshotCreated(snapshot) { + // Only start CreateSnapshot operation if the snapshot has not been cut + ctrl.metricsManager.OperationStart( + metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID), + metrics.NewOperationValue(driverName, snapshotProvisionType), + ) + } + ctrl.metricsManager.OperationStart( + metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID), + metrics.NewOperationValue(driverName, snapshotProvisionType), + ) // Pre-provisioned snapshot if snapshot.Spec.Source.VolumeSnapshotContentName != nil { @@ -403,13 +442,16 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol if err != nil { return err } + // if no content found yet, update status and return if content == nil { // can not find the desired VolumeSnapshotContent from cache store ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotContentMissing", "VolumeSnapshotContent is missing") klog.V(4).Infof("syncUnreadySnapshot[%s]: snapshot content %q requested but not found, will try again", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName) + return fmt.Errorf("snapshot %s requests an non-existing content %s", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName) } + // Set VolumeSnapshotRef UID newContent, err := ctrl.checkandBindSnapshotContent(snapshot, content) if err != nil { @@ -426,8 +468,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err)) return err } + return nil } + // snapshot.Spec.Source.VolumeSnapshotContentName == nil - dynamically creating snapshot klog.V(5).Infof("getDynamicallyProvisionedContentFromStore for snapshot %s", uniqueSnapshotName) contentObj, err := ctrl.getDynamicallyProvisionedContentFromStore(snapshot) @@ -1094,10 +1138,30 @@ func (ctrl *csiSnapshotCommonController) updateSnapshotStatus(snapshot *crdv1.Vo if updated { snapshotClone := snapshotObj.DeepCopy() snapshotClone.Status = newStatus + + // We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call. + // Must meet the following criteria to emit a successful CreateSnapshot status + // 1. Previous status was nil OR Previous status had a nil CreationTime + // 2. New status must be non-nil with a non-nil CreationTime + driverName := content.Spec.Driver + createOperationKey := metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID) + if !utils.IsSnapshotCreated(snapshotObj) && utils.IsSnapshotCreated(snapshotClone) { + ctrl.metricsManager.RecordMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } + + // Must meet the following criteria to emit a successful CreateSnapshotAndReady status + // 1. Previous status was nil OR Previous status had a nil ReadyToUse OR Previous status had a false ReadyToUse + // 2. New status must be non-nil with a ReadyToUse as true + if !utils.IsSnapshotReady(snapshotObj) && utils.IsSnapshotReady(snapshotClone) { + createAndReadyOperation := metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID) + ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } + newSnapshotObj, err := ctrl.clientset.SnapshotV1beta1().VolumeSnapshots(snapshotClone.Namespace).UpdateStatus(context.TODO(), snapshotClone, metav1.UpdateOptions{}) if err != nil { return nil, newControllerUpdateError(utils.SnapshotKey(snapshot), err.Error()) } + return newSnapshotObj, nil } @@ -1177,6 +1241,45 @@ func (ctrl *csiSnapshotCommonController) getSnapshotClass(className string) (*cr return class, nil } +// getSnapshotDriverName is a helper function to get snapshot driver from the VolumeSnapshot. +// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName. +func (ctrl *csiSnapshotCommonController) getSnapshotDriverName(vs *crdv1.VolumeSnapshot) (string, error) { + klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vs.Name) + var driverName string + + // Pre-Provisioned snapshots have contentName as source + var contentName string + if vs.Spec.Source.VolumeSnapshotContentName != nil { + contentName = *vs.Spec.Source.VolumeSnapshotContentName + } + + // Get Driver name from SnapshotContent if we found a contentName + if contentName != "" { + content, err := ctrl.contentLister.Get(contentName) + if err != nil { + klog.Errorf("getSnapshotDriverName: failed to get snapshotContent: %v", contentName) + } else { + driverName = content.Spec.Driver + } + + if driverName != "" { + return driverName, nil + } + } + + // Dynamic snapshots will have a snapshotclass with a driver + if vs.Spec.VolumeSnapshotClassName != nil { + class, err := ctrl.getSnapshotClass(*vs.Spec.VolumeSnapshotClassName) + if err != nil { + klog.Errorf("getSnapshotDriverName: failed to get snapshotClass: %v", *vs.Spec.VolumeSnapshotClassName) + } else { + driverName = class.Driver + } + } + + return driverName, nil +} + // SetDefaultSnapshotClass is a helper function to figure out the default snapshot class. // For pre-provisioned case, it's an no-op. // For dynamic provisioning, it gets the default SnapshotClasses in the system if there is any(could be multiple), diff --git a/pkg/common-controller/snapshot_controller_base.go b/pkg/common-controller/snapshot_controller_base.go index 2046b24c..dbd8f9d0 100644 --- a/pkg/common-controller/snapshot_controller_base.go +++ b/pkg/common-controller/snapshot_controller_base.go @@ -24,6 +24,7 @@ import ( clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned" storageinformers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions/volumesnapshot/v1beta1" storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" v1 "k8s.io/api/core/v1" @@ -60,6 +61,8 @@ type csiSnapshotCommonController struct { snapshotStore cache.Store contentStore cache.Store + metricsManager metrics.MetricsManager + resyncPeriod time.Duration } @@ -71,6 +74,7 @@ func NewCSISnapshotCommonController( volumeSnapshotContentInformer storageinformers.VolumeSnapshotContentInformer, volumeSnapshotClassInformer storageinformers.VolumeSnapshotClassInformer, pvcInformer coreinformers.PersistentVolumeClaimInformer, + metricsManager metrics.MetricsManager, resyncPeriod time.Duration, ) *csiSnapshotCommonController { broadcaster := record.NewBroadcaster() @@ -80,14 +84,15 @@ func NewCSISnapshotCommonController( eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: fmt.Sprintf("snapshot-controller")}) ctrl := &csiSnapshotCommonController{ - clientset: clientset, - client: client, - eventRecorder: eventRecorder, - resyncPeriod: resyncPeriod, - snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), - contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), - snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"), - contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"), + clientset: clientset, + client: client, + eventRecorder: eventRecorder, + resyncPeriod: resyncPeriod, + snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), + contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), + snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"), + contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"), + metricsManager: metricsManager, } ctrl.pvcLister = pvcInformer.Lister() @@ -363,6 +368,7 @@ func (ctrl *csiSnapshotCommonController) updateSnapshot(snapshot *crdv1.VolumeSn if !newSnapshot { return nil } + err = ctrl.syncSnapshot(snapshot) if err != nil { if errors.IsConflict(err) { @@ -407,6 +413,13 @@ func (ctrl *csiSnapshotCommonController) updateContent(content *crdv1.VolumeSnap func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSnapshot) { _ = ctrl.snapshotStore.Delete(snapshot) klog.V(4).Infof("snapshot %q deleted", utils.SnapshotKey(snapshot)) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err) + } else { + deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID) + ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } snapshotContentName := "" if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil { @@ -416,6 +429,7 @@ func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSn klog.V(5).Infof("deleteSnapshot[%q]: content not bound", utils.SnapshotKey(snapshot)) return } + // sync the content when its snapshot is deleted. Explicitly sync'ing the // content here in response to snapshot deletion prevents the content from // waiting until the next sync period for its Release. diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index c283caf5..b1f11ed0 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -30,13 +30,48 @@ import ( ) const ( - opStatusUnknown = "Unknown" - labelDriverName = "driver_name" - labelOperationName = "operation_name" - labelOperationStatus = "operation_status" - subSystem = "snapshot_controller" - metricName = "operation_total_seconds" - metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end" + labelDriverName = "driver_name" + labelOperationName = "operation_name" + labelOperationStatus = "operation_status" + labelSnapshotType = "snapshot_type" + subSystem = "snapshot_controller" + operationLatencyMetricName = "operation_total_seconds" + operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation" + unknownDriverName = "unknown" + + // CreateSnapshotOperationName is the operation that tracks how long the controller takes to create a snapshot. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a new VolumeSnapshot CR to dynamically provision a snapshot + // - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil + CreateSnapshotOperationName = "CreateSnapshot" + + // CreateSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a snapshot and for it to be ready. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a new VolumeSnapshot CR(both dynamic and pre-provisioned cases) + // - End_time: controller notices that the CR has a status with Ready field set to be true + CreateSnapshotAndReadyOperationName = "CreateSnapshotAndReady" + + // DeleteSnapshotOperationName is the operation that tracks how long a snapshot deletion takes. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeSnapshot CR and the CR is ready to be deleted. Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time. + // - End_time: controller removed all finalizers on the VolumeSnapshot CR such that the CR is ready to be removed in the API server. + DeleteSnapshotOperationName = "DeleteSnapshot" + + // DynamicSnapshotType represents a snapshot that is being dynamically provisioned + DynamicSnapshotType = snapshotProvisionType("dynamic") + // PreProvisionedSnapshotType represents a snapshot that is pre-provisioned + PreProvisionedSnapshotType = snapshotProvisionType("pre-provisioned") + + // SnapshotStatusTypeUnknown represents that the status is unknown + SnapshotStatusTypeUnknown snapshotStatusType = "unknown" + // Success and Cancel are statuses for operation time (operation_total_seconds) as seen by snapshot controller + // SnapshotStatusTypeSuccess represents that a CreateSnapshot, CreateSnapshotAndReady, + // or DeleteSnapshot has finished successfully. + // Individual reconciliations (reconciliation_total_seconds) also use this status. + SnapshotStatusTypeSuccess snapshotStatusType = "success" + // SnapshotStatusTypeCancel represents that a CreateSnapshot, CreateSnapshotAndReady, + // or DeleteSnapshot has been deleted before finishing. + SnapshotStatusTypeCancel snapshotStatusType = "cancel" ) // OperationStatus is the interface type for representing an operation's execution @@ -57,11 +92,11 @@ type MetricsManager interface { // OperationStart takes in an operation and caches its start time. // if the operation already exists, it's an no-op. - OperationStart(op Operation) + OperationStart(key OperationKey, val OperationValue) // DropOperation removes an operation from cache. // if the operation does not exist, it's an no-op. - DropOperation(op Operation) + DropOperation(op OperationKey) // RecordMetrics records a metric point. Note that it will be an no-op if an // operation has NOT been marked "Started" previously via invoking "OperationStart". @@ -69,20 +104,49 @@ type MetricsManager interface { // op - the operation which the metric is associated with. // status - the operation status, if not specified, i.e., status == nil, an // "Unknown" status of the passed-in operation is assumed. - RecordMetrics(op Operation, status OperationStatus) + RecordMetrics(op OperationKey, status OperationStatus, driverName string) } -// Operation is a structure which holds information to identify a snapshot -// related operation -type Operation struct { - // the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot" +// OperationKey is a structure which holds information to +// uniquely identify a snapshot related operation +type OperationKey struct { + // Name is the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot" Name string - // the name of the driver which executes the operation - Driver string - // the resource UID to which the operation has been executed against + // ResourceID is the resource UID to which the operation has been executed against ResourceID types.UID } +// OperationValue is a structure which holds operation metadata +type OperationValue struct { + // Driver is the driver name which executes the operation + Driver string + // SnapshotType represents the snapshot type, for example: "dynamic", "pre-provisioned" + SnapshotType string + + // startTime is the time when the operation first started + startTime time.Time +} + +// NewOperationKey initializes a new OperationKey +func NewOperationKey(name string, snapshotUID types.UID) OperationKey { + return OperationKey{ + Name: name, + ResourceID: snapshotUID, + } +} + +// NewOperationValue initializes a new OperationValue +func NewOperationValue(driver string, snapshotType snapshotProvisionType) OperationValue { + if driver == "" { + driver = unknownDriverName + } + + return OperationValue{ + Driver: driver, + SnapshotType: string(snapshotType), + } +} + type operationMetricsManager struct { // cache is a concurrent-safe map which stores start timestamps for all // ongoing operations. @@ -93,10 +157,11 @@ type operationMetricsManager struct { // registry is a wrapper around Prometheus Registry registry k8smetrics.KubeRegistry - // opLatencyMetrics is a Histogram metrics + // opLatencyMetrics is a Histogram metrics for operation time per request opLatencyMetrics *k8smetrics.HistogramVec } +// NewMetricsManager creates a new MetricsManager instance func NewMetricsManager() MetricsManager { mgr := &operationMetricsManager{ cache: sync.Map{}, @@ -105,34 +170,83 @@ func NewMetricsManager() MetricsManager { return mgr } -func (opMgr *operationMetricsManager) OperationStart(op Operation) { - opMgr.cache.LoadOrStore(op, time.Now()) +// OperationStart starts a new operation +func (opMgr *operationMetricsManager) OperationStart(key OperationKey, val OperationValue) { + val.startTime = time.Now() + opMgr.cache.LoadOrStore(key, val) } -func (opMgr *operationMetricsManager) DropOperation(op Operation) { +// OperationStart drops an operation +func (opMgr *operationMetricsManager) DropOperation(op OperationKey) { opMgr.cache.Delete(op) } -func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) { - obj, exists := opMgr.cache.Load(op) +// RecordMetrics emits operation metrics +func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) { + obj, exists := opMgr.cache.Load(opKey) if !exists { // the operation has not been cached, return directly return } - ts, ok := obj.(time.Time) + opVal, ok := obj.(OperationValue) if !ok { - // the cached item is not a time.Time, should NEVER happen, clean and return - klog.Errorf("Invalid cache entry for key %v", op) - opMgr.cache.Delete(op) + // the cached item is not a OperationValue, should NEVER happen, clean and return + klog.Errorf("Invalid cache entry for key %v", opKey) + opMgr.cache.Delete(opKey) return } - strStatus := opStatusUnknown - if status != nil { - strStatus = status.String() + status := string(SnapshotStatusTypeUnknown) + if opStatus != nil { + status = opStatus.String() } - duration := time.Since(ts).Seconds() - opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration) - opMgr.cache.Delete(op) + + // if we do not know the driverName while recording metrics, + // refer to the cached version instead. + if driverName == "" || driverName == unknownDriverName { + driverName = opVal.Driver + } + + operationDuration := time.Since(opVal.startTime).Seconds() + opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration) + + // Report cancel metrics if we are deleting an unfinished VolumeSnapshot + if opKey.Name == DeleteSnapshotOperationName { + // check if we have a CreateSnapshot operation pending for this + createKey := NewOperationKey(CreateSnapshotOperationName, opKey.ResourceID) + obj, exists := opMgr.cache.Load(createKey) + if exists { + // record a cancel metric if found + opMgr.recordCancelMetric(obj, createKey, operationDuration) + } + + // check if we have a CreateSnapshotAndReady operation pending for this + createAndReadyKey := NewOperationKey(CreateSnapshotAndReadyOperationName, opKey.ResourceID) + obj, exists = opMgr.cache.Load(createAndReadyKey) + if exists { + // record a cancel metric if found + opMgr.recordCancelMetric(obj, createAndReadyKey, operationDuration) + } + } + + opMgr.cache.Delete(opKey) +} + +// recordCancelMetric records a metric for a create operation that hasn't finished +func (opMgr *operationMetricsManager) recordCancelMetric(obj interface{}, key OperationKey, duration float64) { + // record a cancel metric if found + val, ok := obj.(OperationValue) + if !ok { + klog.Errorf("Invalid cache entry for key %v", key) + opMgr.cache.Delete(key) + return + } + opMgr.opLatencyMetrics.WithLabelValues( + val.Driver, + key.Name, + val.SnapshotType, + string(SnapshotStatusTypeCancel), + ).Observe(duration) + opMgr.cache.Delete(key) } func (opMgr *operationMetricsManager) init() { @@ -140,11 +254,11 @@ func (opMgr *operationMetricsManager) init() { opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec( &k8smetrics.HistogramOpts{ Subsystem: subSystem, - Name: metricName, - Help: metricHelpMsg, + Name: operationLatencyMetricName, + Help: operationLatencyMetricHelpMsg, Buckets: metricBuckets, }, - []string{labelDriverName, labelOperationName, labelOperationStatus}, + []string{labelDriverName, labelOperationName, labelSnapshotType, labelOperationStatus}, ) opMgr.registry.MustRegister(opMgr.opLatencyMetrics) } @@ -175,3 +289,25 @@ func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string, }() return srv, nil } + +// snapshotProvisionType represents which kind of snapshot a metric is +type snapshotProvisionType string + +// snapshotStatusType represents the type of snapshot status to report +type snapshotStatusType string + +// SnapshotOperationStatus represents the status for a snapshot controller operation +type SnapshotOperationStatus struct { + status snapshotStatusType +} + +// NewSnapshotOperationStatus returns a new SnapshotOperationStatus +func NewSnapshotOperationStatus(status snapshotStatusType) SnapshotOperationStatus { + return SnapshotOperationStatus{ + status: status, + } +} + +func (sos SnapshotOperationStatus) String() string { + return string(sos.status) +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 0360e928..aa766fb1 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -19,10 +19,12 @@ package metrics import ( "context" "fmt" + "io" "io/ioutil" "log" "net/http" "reflect" + "sort" "strings" "sync" "testing" @@ -86,9 +88,8 @@ func TestNew(t *testing.T) { func TestDropNonExistingOperation(t *testing.T) { mgr, wg, srv := initMgr() defer shutdown(srv, wg) - op := Operation{ + op := OperationKey{ Name: "drop-non-existing-operation-should-be-noop", - Driver: "driver", ResourceID: types.UID("uid"), } mgr.DropOperation(op) @@ -98,12 +99,11 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "no-metrics-should-be-recorded-as-operation-did-not-start", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.RecordMetrics(op, nil) + mgr.RecordMetrics(opKey, nil, "driver") rsp, err := http.Get(srvAddr) if err != nil || rsp.StatusCode != http.StatusOK { t.Errorf("failed to get response from server %v, %v", err, rsp) @@ -112,8 +112,8 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) { if err != nil { t.Errorf("failed to read response body %v", err) } - if strings.Contains(string(r), op.Name) { - t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r)) + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) } } @@ -121,13 +121,13 @@ func TestDropOperation(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "should-have-been-dropped", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.OperationStart(op) - mgr.DropOperation(op) + opVal := NewOperationValue("driver", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) + mgr.DropOperation(opKey) time.Sleep(300 * time.Millisecond) rsp, err := http.Get(srvAddr) if err != nil || rsp.StatusCode != http.StatusOK { @@ -137,36 +137,36 @@ func TestDropOperation(t *testing.T) { if err != nil { t.Errorf("failed to read response body %v", err) } - if strings.Contains(string(r), op.Name) { - t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r)) + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) } // re-add with a different name - op.Name = "should-have-been-added" - mgr.OperationStart(op) + opKey.Name = "should-have-been-added" + mgr.OperationStart(opKey, opVal) time.Sleep(300 * time.Millisecond) opStatus := &fakeOpStatus{ statusCode: 0, } - mgr.RecordMetrics(op, opStatus) + mgr.RecordMetrics(opKey, opStatus, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 0.3 -snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3 +snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { @@ -178,34 +178,33 @@ func TestUnknownStatus(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "unknown-status-operation", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.OperationStart(op) + mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType)) // should create a Unknown data point with latency ~300ms time.Sleep(300 * time.Millisecond) - mgr.RecordMetrics(op, nil) + mgr.RecordMetrics(opKey, nil, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 0.3 -snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3 +snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { @@ -218,66 +217,65 @@ func TestRecordMetrics(t *testing.T) { srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) // add an operation - op := Operation{ + opKey := OperationKey{ Name: "op1", - Driver: "driver1", ResourceID: types.UID("uid1"), } - mgr.OperationStart(op) + opVal := NewOperationValue("driver", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) // should create a Success data point with latency ~ 1100ms time.Sleep(1100 * time.Millisecond) success := &fakeOpStatus{ statusCode: 0, } - mgr.RecordMetrics(op, success) + mgr.RecordMetrics(opKey, success, "driver") // add another operation metric - op.Name = "op2" - op.Driver = "driver2" - op.ResourceID = types.UID("uid2") - mgr.OperationStart(op) + opKey.Name = "op2" + opKey.ResourceID = types.UID("uid2") + mgr.OperationStart(opKey, opVal) // should create a Failure data point with latency ~ 100ms time.Sleep(100 * time.Millisecond) failure := &fakeOpStatus{ statusCode: 1, } - mgr.RecordMetrics(op, failure) + mgr.RecordMetrics(opKey, failure, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.5"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success"} 1.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { t.Errorf("failed testing [%v]", err) @@ -295,15 +293,14 @@ func TestConcurrency(t *testing.T) { statusCode: 1, } ops := []struct { - op Operation + op OperationKey desiredLatencyMs time.Duration status OperationStatus drop bool }{ { - Operation{ + OperationKey{ Name: "success1", - Driver: "driver1", ResourceID: types.UID("uid1"), }, 100, @@ -311,9 +308,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "success2", - Driver: "driver2", ResourceID: types.UID("uid2"), }, 100, @@ -321,9 +317,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "failure1", - Driver: "driver3", ResourceID: types.UID("uid3"), }, 100, @@ -331,9 +326,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "failure2", - Driver: "driver4", ResourceID: types.UID("uid4"), }, 100, @@ -341,9 +335,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "unknown", - Driver: "driver5", ResourceID: types.UID("uid5"), }, 100, @@ -351,9 +344,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "drop-from-cache", - Driver: "driver6", ResourceID: types.UID("uid6"), }, 100, @@ -363,15 +355,20 @@ func TestConcurrency(t *testing.T) { } for i := range ops { - mgr.OperationStart(ops[i].op) + mgr.OperationStart(ops[i].op, OperationValue{ + Driver: fmt.Sprintf("driver%v", i), + SnapshotType: string(DynamicSnapshotType), + }) } // add an extra operation which should remain in the cache - remaining := Operation{ + remaining := OperationKey{ Name: "remaining-in-cache", - Driver: "driver7", ResourceID: types.UID("uid7"), } - mgr.OperationStart(remaining) + mgr.OperationStart(remaining, OperationValue{ + Driver: "driver7", + SnapshotType: string(DynamicSnapshotType), + }) var wgMetrics sync.WaitGroup @@ -385,7 +382,7 @@ func TestConcurrency(t *testing.T) { if ops[i].drop { mgr.DropOperation(ops[i].op) } else { - mgr.RecordMetrics(ops[i].op, ops[i].status) + mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i)) } }(i) } @@ -395,86 +392,86 @@ func TestConcurrency(t *testing.T) { expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { t.Errorf("failed testing [%v]", err) @@ -494,63 +491,141 @@ func verifyMetric(expected, srvAddr string) error { return err } format := expfmt.ResponseFormat(rsp.Header) - reader := strings.NewReader(string(r)) - decoder := expfmt.NewDecoder(reader, format) - mf := &cmg.MetricFamily{} - if err := decoder.Decode(mf); err != nil { - return err + gotReader := strings.NewReader(string(r)) + gotDecoder := expfmt.NewDecoder(gotReader, format) + expectedReader := strings.NewReader(expected) + expectedDecoder := expfmt.NewDecoder(expectedReader, format) + + gotMfs := []*cmg.MetricFamily{} + expectedMfs := []*cmg.MetricFamily{} + for { + gotMf := &cmg.MetricFamily{} + gotErr := gotDecoder.Decode(gotMf) + expectedMf := &cmg.MetricFamily{} + if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil { + // return correctly if both are EOF + if expectedErr == io.EOF && gotErr == io.EOF { + break + } else { + return err + } + } + gotMfs = append(gotMfs, gotMf) + expectedMfs = append(expectedMfs, expectedMf) } - reader = strings.NewReader(expected) - decoder = expfmt.NewDecoder(reader, format) - expectedMf := &cmg.MetricFamily{} - if err := decoder.Decode(expectedMf); err != nil { - return err - } - if !containsMetrics(expectedMf, mf) { + + if !containsMetrics(expectedMfs, gotMfs) { return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r)) } + return nil } -func containsMetrics(expected, got *cmg.MetricFamily) bool { - if (got.Name == nil || *(got.Name) != *(expected.Name)) || - (got.Type == nil || *(got.Type) != *(expected.Type)) || - (got.Help == nil || *(got.Help) != *(expected.Help)) { +// sortMfs, sorts metric families in alphabetical order by type. +// currently only supports counter and histogram +func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily { + var sortedMfs []*cmg.MetricFamily + + // Sort first by type + sort.Slice(mfs, func(i, j int) bool { + return *mfs[i].Type < *mfs[j].Type + }) + + // Next, sort by length of name + sort.Slice(mfs, func(i, j int) bool { + return len(*mfs[i].Name) < len(*mfs[j].Name) + }) + + return sortedMfs +} + +func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool { + if len(gotMfs) != len(expectedMfs) { + fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs)) return false } - numRecords := len(expected.Metric) - if len(got.Metric) < numRecords { - return false - } - matchCount := 0 - for i := 0; i < len(got.Metric); i++ { - for j := 0; j < numRecords; j++ { - // labels should be the same - if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) { - continue + // sort metric families for deterministic comparison. + sortedExpectedMfs := sortMfs(expectedMfs) + sortedGotMfs := sortMfs(gotMfs) + + // compare expected vs. sorted actual metrics + for k, got := range sortedGotMfs { + matchCount := 0 + expected := sortedExpectedMfs[k] + + if (got.Name == nil || *(got.Name) != *(expected.Name)) || + (got.Type == nil || *(got.Type) != *(expected.Type)) || + (got.Help == nil || *(got.Help) != *(expected.Help)) { + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help) + return false + } + + numRecords := len(expected.Metric) + if len(got.Metric) < numRecords { + fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords) + return false + } + for i := 0; i < len(got.Metric); i++ { + for j := 0; j < numRecords; j++ { + if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil || + got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil { + fmt.Printf("got metric and expected metric histogram type mismatch") + return false + } + + // labels should be the same + if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) { + continue + } + + // metric type specific checks + switch { + case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil: + gh := got.Metric[i].Histogram + eh := expected.Metric[j].Histogram + if gh == nil || eh == nil { + continue + } + if !reflect.DeepEqual(gh.Bucket, eh.Bucket) { + fmt.Println("got and expected histogram bucket not equal") + continue + } + + // this is a sum record, expecting a latency which is more than the + // expected one. If the sum is smaller than expected, it will be considered + // as NOT a match + if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) { + fmt.Println("difference in sample sum") + continue + } + if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) { + fmt.Println("difference in sample count") + continue + } + + case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil: + gc := got.Metric[i].Counter + ec := expected.Metric[j].Counter + if gc.Value == nil || *(gc.Value) != *(ec.Value) { + fmt.Println("difference in counter values") + continue + } + } + + // this is a match + matchCount = matchCount + 1 + break } - gh := got.Metric[i].Histogram - eh := expected.Metric[j].Histogram - if gh == nil { - continue - } - if !reflect.DeepEqual(gh.Bucket, eh.Bucket) { - continue - } - // this is a sum record, expecting a latency which is more than the - // expected one. If the sum is smaller than expected, it will be considered - // as NOT a match - if gh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) { - continue - } - if gh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) { - continue - } - // this is a match - matchCount = matchCount + 1 - break + } + + if matchCount != numRecords { + fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords) + return false } } - return matchCount == numRecords + + return true } diff --git a/pkg/utils/util.go b/pkg/utils/util.go index c4744855..b563ca01 100644 --- a/pkg/utils/util.go +++ b/pkg/utils/util.go @@ -519,3 +519,8 @@ func IsSnapshotReady(snapshot *crdv1.VolumeSnapshot) bool { } return true } + +// IsSnapshotCreated indicates that the snapshot has been cut on a storage system +func IsSnapshotCreated(snapshot *crdv1.VolumeSnapshot) bool { + return snapshot.Status != nil && snapshot.Status.CreationTime != nil +}