Add snapshot controller metrics

Signed-off-by: Grant Griffiths <grant@portworx.com>
This commit is contained in:
Grant Griffiths
2020-10-07 14:44:02 -07:00
parent 684a1d3566
commit 941821bf99
7 changed files with 655 additions and 282 deletions

View File

@@ -30,13 +30,48 @@ import (
)
const (
opStatusUnknown = "Unknown"
labelDriverName = "driver_name"
labelOperationName = "operation_name"
labelOperationStatus = "operation_status"
subSystem = "snapshot_controller"
metricName = "operation_total_seconds"
metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end"
labelDriverName = "driver_name"
labelOperationName = "operation_name"
labelOperationStatus = "operation_status"
labelSnapshotType = "snapshot_type"
subSystem = "snapshot_controller"
operationLatencyMetricName = "operation_total_seconds"
operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation"
unknownDriverName = "unknown"
// CreateSnapshotOperationName is the operation that tracks how long the controller takes to create a snapshot.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeSnapshot CR to dynamically provision a snapshot
// - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil
CreateSnapshotOperationName = "CreateSnapshot"
// CreateSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a snapshot and for it to be ready.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeSnapshot CR(both dynamic and pre-provisioned cases)
// - End_time: controller notices that the CR has a status with Ready field set to be true
CreateSnapshotAndReadyOperationName = "CreateSnapshotAndReady"
// DeleteSnapshotOperationName is the operation that tracks how long a snapshot deletion takes.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeSnapshot CR and the CR is ready to be deleted. Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time.
// - End_time: controller removed all finalizers on the VolumeSnapshot CR such that the CR is ready to be removed in the API server.
DeleteSnapshotOperationName = "DeleteSnapshot"
// DynamicSnapshotType represents a snapshot that is being dynamically provisioned
DynamicSnapshotType = snapshotProvisionType("dynamic")
// PreProvisionedSnapshotType represents a snapshot that is pre-provisioned
PreProvisionedSnapshotType = snapshotProvisionType("pre-provisioned")
// SnapshotStatusTypeUnknown represents that the status is unknown
SnapshotStatusTypeUnknown snapshotStatusType = "unknown"
// Success and Cancel are statuses for operation time (operation_total_seconds) as seen by snapshot controller
// SnapshotStatusTypeSuccess represents that a CreateSnapshot, CreateSnapshotAndReady,
// or DeleteSnapshot has finished successfully.
// Individual reconciliations (reconciliation_total_seconds) also use this status.
SnapshotStatusTypeSuccess snapshotStatusType = "success"
// SnapshotStatusTypeCancel represents that a CreateSnapshot, CreateSnapshotAndReady,
// or DeleteSnapshot has been deleted before finishing.
SnapshotStatusTypeCancel snapshotStatusType = "cancel"
)
// OperationStatus is the interface type for representing an operation's execution
@@ -57,11 +92,11 @@ type MetricsManager interface {
// OperationStart takes in an operation and caches its start time.
// if the operation already exists, it's an no-op.
OperationStart(op Operation)
OperationStart(key OperationKey, val OperationValue)
// DropOperation removes an operation from cache.
// if the operation does not exist, it's an no-op.
DropOperation(op Operation)
DropOperation(op OperationKey)
// RecordMetrics records a metric point. Note that it will be an no-op if an
// operation has NOT been marked "Started" previously via invoking "OperationStart".
@@ -69,20 +104,49 @@ type MetricsManager interface {
// op - the operation which the metric is associated with.
// status - the operation status, if not specified, i.e., status == nil, an
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op Operation, status OperationStatus)
RecordMetrics(op OperationKey, status OperationStatus, driverName string)
}
// Operation is a structure which holds information to identify a snapshot
// related operation
type Operation struct {
// the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
// OperationKey is a structure which holds information to
// uniquely identify a snapshot related operation
type OperationKey struct {
// Name is the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot"
Name string
// the name of the driver which executes the operation
Driver string
// the resource UID to which the operation has been executed against
// ResourceID is the resource UID to which the operation has been executed against
ResourceID types.UID
}
// OperationValue is a structure which holds operation metadata
type OperationValue struct {
// Driver is the driver name which executes the operation
Driver string
// SnapshotType represents the snapshot type, for example: "dynamic", "pre-provisioned"
SnapshotType string
// startTime is the time when the operation first started
startTime time.Time
}
// NewOperationKey initializes a new OperationKey
func NewOperationKey(name string, snapshotUID types.UID) OperationKey {
return OperationKey{
Name: name,
ResourceID: snapshotUID,
}
}
// NewOperationValue initializes a new OperationValue
func NewOperationValue(driver string, snapshotType snapshotProvisionType) OperationValue {
if driver == "" {
driver = unknownDriverName
}
return OperationValue{
Driver: driver,
SnapshotType: string(snapshotType),
}
}
type operationMetricsManager struct {
// cache is a concurrent-safe map which stores start timestamps for all
// ongoing operations.
@@ -93,10 +157,11 @@ type operationMetricsManager struct {
// registry is a wrapper around Prometheus Registry
registry k8smetrics.KubeRegistry
// opLatencyMetrics is a Histogram metrics
// opLatencyMetrics is a Histogram metrics for operation time per request
opLatencyMetrics *k8smetrics.HistogramVec
}
// NewMetricsManager creates a new MetricsManager instance
func NewMetricsManager() MetricsManager {
mgr := &operationMetricsManager{
cache: sync.Map{},
@@ -105,34 +170,83 @@ func NewMetricsManager() MetricsManager {
return mgr
}
func (opMgr *operationMetricsManager) OperationStart(op Operation) {
opMgr.cache.LoadOrStore(op, time.Now())
// OperationStart starts a new operation
func (opMgr *operationMetricsManager) OperationStart(key OperationKey, val OperationValue) {
val.startTime = time.Now()
opMgr.cache.LoadOrStore(key, val)
}
func (opMgr *operationMetricsManager) DropOperation(op Operation) {
// OperationStart drops an operation
func (opMgr *operationMetricsManager) DropOperation(op OperationKey) {
opMgr.cache.Delete(op)
}
func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) {
obj, exists := opMgr.cache.Load(op)
// RecordMetrics emits operation metrics
func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
obj, exists := opMgr.cache.Load(opKey)
if !exists {
// the operation has not been cached, return directly
return
}
ts, ok := obj.(time.Time)
opVal, ok := obj.(OperationValue)
if !ok {
// the cached item is not a time.Time, should NEVER happen, clean and return
klog.Errorf("Invalid cache entry for key %v", op)
opMgr.cache.Delete(op)
// the cached item is not a OperationValue, should NEVER happen, clean and return
klog.Errorf("Invalid cache entry for key %v", opKey)
opMgr.cache.Delete(opKey)
return
}
strStatus := opStatusUnknown
if status != nil {
strStatus = status.String()
status := string(SnapshotStatusTypeUnknown)
if opStatus != nil {
status = opStatus.String()
}
duration := time.Since(ts).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration)
opMgr.cache.Delete(op)
// if we do not know the driverName while recording metrics,
// refer to the cached version instead.
if driverName == "" || driverName == unknownDriverName {
driverName = opVal.Driver
}
operationDuration := time.Since(opVal.startTime).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration)
// Report cancel metrics if we are deleting an unfinished VolumeSnapshot
if opKey.Name == DeleteSnapshotOperationName {
// check if we have a CreateSnapshot operation pending for this
createKey := NewOperationKey(CreateSnapshotOperationName, opKey.ResourceID)
obj, exists := opMgr.cache.Load(createKey)
if exists {
// record a cancel metric if found
opMgr.recordCancelMetric(obj, createKey, operationDuration)
}
// check if we have a CreateSnapshotAndReady operation pending for this
createAndReadyKey := NewOperationKey(CreateSnapshotAndReadyOperationName, opKey.ResourceID)
obj, exists = opMgr.cache.Load(createAndReadyKey)
if exists {
// record a cancel metric if found
opMgr.recordCancelMetric(obj, createAndReadyKey, operationDuration)
}
}
opMgr.cache.Delete(opKey)
}
// recordCancelMetric records a metric for a create operation that hasn't finished
func (opMgr *operationMetricsManager) recordCancelMetric(obj interface{}, key OperationKey, duration float64) {
// record a cancel metric if found
val, ok := obj.(OperationValue)
if !ok {
klog.Errorf("Invalid cache entry for key %v", key)
opMgr.cache.Delete(key)
return
}
opMgr.opLatencyMetrics.WithLabelValues(
val.Driver,
key.Name,
val.SnapshotType,
string(SnapshotStatusTypeCancel),
).Observe(duration)
opMgr.cache.Delete(key)
}
func (opMgr *operationMetricsManager) init() {
@@ -140,11 +254,11 @@ func (opMgr *operationMetricsManager) init() {
opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec(
&k8smetrics.HistogramOpts{
Subsystem: subSystem,
Name: metricName,
Help: metricHelpMsg,
Name: operationLatencyMetricName,
Help: operationLatencyMetricHelpMsg,
Buckets: metricBuckets,
},
[]string{labelDriverName, labelOperationName, labelOperationStatus},
[]string{labelDriverName, labelOperationName, labelSnapshotType, labelOperationStatus},
)
opMgr.registry.MustRegister(opMgr.opLatencyMetrics)
}
@@ -175,3 +289,25 @@ func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string,
}()
return srv, nil
}
// snapshotProvisionType represents which kind of snapshot a metric is
type snapshotProvisionType string
// snapshotStatusType represents the type of snapshot status to report
type snapshotStatusType string
// SnapshotOperationStatus represents the status for a snapshot controller operation
type SnapshotOperationStatus struct {
status snapshotStatusType
}
// NewSnapshotOperationStatus returns a new SnapshotOperationStatus
func NewSnapshotOperationStatus(status snapshotStatusType) SnapshotOperationStatus {
return SnapshotOperationStatus{
status: status,
}
}
func (sos SnapshotOperationStatus) String() string {
return string(sos.status)
}