From 206fecb9f156b60b919347de37c091f9e68948f3 Mon Sep 17 00:00:00 2001
From: Humble Chirammal <hchiramm@redhat.com>
Date: Fri, 11 Jun 2021 20:56:51 +0530
Subject: [PATCH] Allow tuning Common-controller Ratelimiter with
 `retryIntervalStart & retryIntervalMax`

This patch adds two new parameters `retryIntervalStart & retryIntervalMax`
which can be configured to adjust the ratelimiters of snapshotqueue and contentqueue
in the controller.

Signed-off-by: Humble Chirammal <hchiramm@redhat.com>

```release-note
 `retry-interval-start` and `retry-interval-max` arguments are added to common-controller
  which controls retry interval of failed volume snapshot creation and deletion.
  These values set the ratelimiter for snapshot and content queues.
```

Signed-off-by: Humble Chirammal <hchiramm@redhat.com>
---
 README.md                                         | 3 +++
 cmd/csi-snapshotter/main.go                       | 4 ++--
 cmd/snapshot-controller/main.go                   | 9 +++++++--
 pkg/common-controller/framework_test.go           | 3 +++
 pkg/common-controller/snapshot_controller_base.go | 6 ++++--
 5 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 8bacf3dd..343c0046 100644
--- a/README.md
+++ b/README.md
@@ -156,6 +156,9 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh
 
 * `--worker-threads`: Number of worker threads for running create snapshot and delete snapshot operations. Default value is 10.
 
+* `--retry-interval-start`: Initial retry interval of failed volume snapshot creation or deletion. It doubles with each failure, up to retry-interval-max. Default value is 1 second.
+
+*`--retry-interval-max`: Maximum retry interval of failed volume snapshot creation or deletion. Default value is 5 minutes.
 #### Other recognized arguments
 * `--kubeconfig <path>`: Path to Kubernetes client configuration that the CSI external-snapshotter uses to connect to Kubernetes API server. When omitted, default token provided by Kubernetes will be used. This option is useful only when the external-snapshotter does not run as a Kubernetes pod, e.g. for debugging.
 
diff --git a/cmd/csi-snapshotter/main.go b/cmd/csi-snapshotter/main.go
index a3287549..7e499c36 100644
--- a/cmd/csi-snapshotter/main.go
+++ b/cmd/csi-snapshotter/main.go
@@ -72,8 +72,8 @@ var (
 	metricsAddress     = flag.String("metrics-address", "", "(deprecated) The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set.")
 	httpEndpoint       = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics and leader election health check, will listen (example: `:8080`). The default is empty string, which means the server is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set.")
 	metricsPath        = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
-	retryIntervalStart = flag.Duration("retry-interval-start", time.Second, "Initial retry interval of failed volume snapshot creation or deletion. It doubles with each failure, up to retry-interval-max. Default is 1 second")
-	retryIntervalMax   = flag.Duration("retry-interval-max", 5*time.Minute, "Maximum retry interval of failed volume snapshot creation or deletion. Default is 5 minutes")
+	retryIntervalStart = flag.Duration("retry-interval-start", time.Second, "Initial retry interval of failed volume snapshot creation or deletion. It doubles with each failure, up to retry-interval-max. Default is 1 second.")
+	retryIntervalMax   = flag.Duration("retry-interval-max", 5*time.Minute, "Maximum retry interval of failed volume snapshot creation or deletion. Default is 5 minutes.")
 )
 
 var (
diff --git a/cmd/snapshot-controller/main.go b/cmd/snapshot-controller/main.go
index 29afc328..4f516117 100644
--- a/cmd/snapshot-controller/main.go
+++ b/cmd/snapshot-controller/main.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"flag"
 	"fmt"
+	"k8s.io/client-go/util/workqueue"
 	"os"
 	"os/signal"
 	"sync"
@@ -55,8 +56,10 @@ var (
 	leaderElection          = flag.Bool("leader-election", false, "Enables leader election.")
 	leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.")
 
-	httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.")
-	metricsPath  = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
+	httpEndpoint       = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.")
+	metricsPath        = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
+	retryIntervalStart = flag.Duration("retry-interval-start", time.Second, "Initial retry interval of failed volume snapshot creation or deletion. It doubles with each failure, up to retry-interval-max. Default is 1 second.")
+	retryIntervalMax   = flag.Duration("retry-interval-max", 5*time.Minute, "Maximum retry interval of failed volume snapshot creation or deletion. Default is 5 minutes.")
 )
 
 var (
@@ -170,6 +173,8 @@ func main() {
 		coreFactory.Core().V1().PersistentVolumeClaims(),
 		metricsManager,
 		*resyncPeriod,
+		workqueue.NewItemExponentialFailureRateLimiter(*retryIntervalStart, *retryIntervalMax),
+		workqueue.NewItemExponentialFailureRateLimiter(*retryIntervalStart, *retryIntervalMax),
 	)
 
 	if err := ensureCustomResourceDefinitionsExist(snapClient); err != nil {
diff --git a/pkg/common-controller/framework_test.go b/pkg/common-controller/framework_test.go
index 81b340c3..414cfcc4 100644
--- a/pkg/common-controller/framework_test.go
+++ b/pkg/common-controller/framework_test.go
@@ -19,6 +19,7 @@ package common_controller
 import (
 	"errors"
 	"fmt"
+	"k8s.io/client-go/util/workqueue"
 	"reflect"
 	sysruntime "runtime"
 	"strconv"
@@ -749,6 +750,8 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
 		coreFactory.Core().V1().PersistentVolumeClaims(),
 		metricsManager,
 		60*time.Second,
+		workqueue.NewItemExponentialFailureRateLimiter(1*time.Millisecond, 1*time.Minute),
+		workqueue.NewItemExponentialFailureRateLimiter(1*time.Millisecond, 1*time.Minute),
 	)
 
 	ctrl.eventRecorder = record.NewFakeRecorder(1000)
diff --git a/pkg/common-controller/snapshot_controller_base.go b/pkg/common-controller/snapshot_controller_base.go
index 556c30e9..0d5b46d0 100644
--- a/pkg/common-controller/snapshot_controller_base.go
+++ b/pkg/common-controller/snapshot_controller_base.go
@@ -76,6 +76,8 @@ func NewCSISnapshotCommonController(
 	pvcInformer coreinformers.PersistentVolumeClaimInformer,
 	metricsManager metrics.MetricsManager,
 	resyncPeriod time.Duration,
+	snapshotRateLimiter workqueue.RateLimiter,
+	contentRateLimiter workqueue.RateLimiter,
 ) *csiSnapshotCommonController {
 	broadcaster := record.NewBroadcaster()
 	broadcaster.StartLogging(klog.Infof)
@@ -90,8 +92,8 @@ func NewCSISnapshotCommonController(
 		resyncPeriod:   resyncPeriod,
 		snapshotStore:  cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
 		contentStore:   cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
-		snapshotQueue:  workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
-		contentQueue:   workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
+		snapshotQueue:  workqueue.NewNamedRateLimitingQueue(snapshotRateLimiter, "snapshot-controller-snapshot"),
+		contentQueue:   workqueue.NewNamedRateLimitingQueue(contentRateLimiter, "snapshot-controller-content"),
 		metricsManager: metricsManager,
 	}