Merge pull request #987 from DataDog/fricounet/upstream/fix-controller-startup-time
Fix the max duration to wait for CRDs in the snapshot-controller
@@ -131,7 +131,7 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh

##### Volume Snapshot Classes

* There can only be a single default volume snapshot class for a particular driver.

### Distributed Snapshotting

@@ -139,11 +139,11 @@ The distributed snapshotting feature is provided to handle snapshot operations f

#### Snapshot controller option

* `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default.

#### CSI external snapshotter sidecar option

* `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default.

Other than this, the NODE_NAME environment variable must be set where the CSI snapshotter sidecar is deployed. The value of NODE_NAME should be the name of the node where the sidecar is running.
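For illustration only (a sketch, not the sidecar's actual startup code), a sidecar running with `--node-deployment` has to resolve its own node from that variable, roughly like this:

```go
// Hypothetical sketch: read NODE_NAME and refuse to start without it, since a
// node-deployed snapshotter cannot scope its work without knowing its node.
package main

import (
	"log"
	"os"
)

func main() {
	nodeName := os.Getenv("NODE_NAME")
	if nodeName == "" {
		log.Fatal("NODE_NAME must be set when --node-deployment is enabled")
	}
	log.Printf("handling snapshots for volumes local to node %q", nodeName)
}
```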
@@ -174,7 +174,7 @@ Other than this, the NODE_NAME environment variable must be set where the CSI sn

* `--retry-interval-max`: Maximum retry interval of failed volume snapshot creation or deletion. Default value is 5 minutes.

-* `--retry-crd-interval-max`: Maximum retry interval for detecting the snapshot CRDs on controller startup. Default is 5 seconds.
+* `--retry-crd-interval-max`: Maximum retry duration for detecting the snapshot CRDs on controller startup. Default is 30 seconds.

* `--enable-distributed-snapshotting` : Enables each node to handle snapshots for the volumes local to that node. Off by default. It should be set to true only if `--node-deployment` parameter for the csi external snapshotter sidecar is set to true. See https://github.com/kubernetes-csi/external-snapshotter/blob/master/README.md#distributed-snapshotting for details.
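To make the new semantics concrete: the controller retries the CRD check on an exponential schedule (100 ms initial delay, factor 1.5, as the code change below shows), and `--retry-crd-interval-max` now caps the total time spent waiting rather than any single retry interval. A standalone sketch of that schedule (illustrative values, not controller code):

```go
// Prints the approximate retry schedule for the CRD check: delays start at
// 100ms and grow by 1.5x, and the cumulative wait is capped by the value of
// --retry-crd-interval-max (30s by default after this change).
package main

import (
	"fmt"
	"time"
)

func main() {
	maxWait := 30 * time.Second     // mirrors the new --retry-crd-interval-max default
	delay := 100 * time.Millisecond // initial backoff used by the controller
	var total time.Duration
	for attempt := 1; total+delay <= maxWait; attempt++ {
		total += delay
		fmt.Printf("attempt %2d after ~%v of cumulative waiting\n", attempt, total.Round(time.Millisecond))
		delay = time.Duration(float64(delay) * 1.5)
	}
	fmt.Printf("no more retries fit under the %v cap (~%v used)\n", maxWait, total.Round(time.Millisecond))
}
```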
@@ -74,46 +74,49 @@ var (
    preventVolumeModeConversion = flag.Bool("prevent-volume-mode-conversion", true, "Prevents an unauthorised user from modifying the volume mode when creating a PVC from an existing VolumeSnapshot.")
    enableVolumeGroupSnapshots = flag.Bool("enable-volume-group-snapshots", false, "Enables the volume group snapshot feature, allowing the user to create a snapshot of a group of volumes.")

-   retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 5*time.Second, "Maximum retry interval to wait for CRDs to appear. The default is 5 seconds.")
+   retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 30*time.Second, "Maximum time to wait for CRDs to appear. The default is 30 seconds.")
)

var version = "unknown"

-// Checks that the VolumeSnapshot v1 CRDs exist.
+// Checks that the VolumeSnapshot v1 CRDs exist. It will wait at most the duration specified by retryCRDIntervalMax
func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVolumeGroupSnapshots bool) error {
-   condition := func() (bool, error) {
+   condition := func(ctx context.Context) (bool, error) {
        var err error
+       // List calls should return faster with a limit of 0.
+       // We do not care about what is returned and just want to make sure the CRDs exist.
+       listOptions := metav1.ListOptions{Limit: 0}

        // scoping to an empty namespace makes `List` work across all namespaces
-       _, err = client.SnapshotV1().VolumeSnapshots("").List(context.TODO(), metav1.ListOptions{})
+       _, err = client.SnapshotV1().VolumeSnapshots("").List(ctx, listOptions)
        if err != nil {
            klog.Errorf("Failed to list v1 volumesnapshots with error=%+v", err)
            return false, nil
        }

-       _, err = client.SnapshotV1().VolumeSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+       _, err = client.SnapshotV1().VolumeSnapshotClasses().List(ctx, listOptions)
        if err != nil {
            klog.Errorf("Failed to list v1 volumesnapshotclasses with error=%+v", err)
            return false, nil
        }
-       _, err = client.SnapshotV1().VolumeSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+       _, err = client.SnapshotV1().VolumeSnapshotContents().List(ctx, listOptions)
        if err != nil {
            klog.Errorf("Failed to list v1 volumesnapshotcontents with error=%+v", err)
            return false, nil
        }
        if enableVolumeGroupSnapshots {
-           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(context.TODO(), metav1.ListOptions{})
+           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(ctx, listOptions)
            if err != nil {
                klog.Errorf("Failed to list v1alpha1 volumegroupsnapshots with error=%+v", err)
                return false, nil
            }

-           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(ctx, listOptions)
            if err != nil {
                klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotclasses with error=%+v", err)
                return false, nil
            }
-           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+           _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(ctx, listOptions)
            if err != nil {
                klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotcontents with error=%+v", err)
                return false, nil

@@ -123,24 +126,19 @@ func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVol
        return true, nil
    }

-   // The maximum retry duration = initial duration * retry factor ^ # steps. Rearranging, this gives
-   // # steps = log(maximum retry / initial duration) / log(retry factor).
    const retryFactor = 1.5
-   const initialDurationMs = 100
-   maxMs := retryCRDIntervalMax.Milliseconds()
-   if maxMs < initialDurationMs {
-       maxMs = initialDurationMs
-   }
-   steps := int(math.Ceil(math.Log(float64(maxMs)/initialDurationMs) / math.Log(retryFactor)))
-   if steps < 1 {
-       steps = 1
-   }
+   const initialDuration = 100 * time.Millisecond
    backoff := wait.Backoff{
-       Duration: initialDurationMs * time.Millisecond,
+       Duration: initialDuration,
        Factor:   retryFactor,
-       Steps:    steps,
+       Steps:    math.MaxInt32, // effectively no limit until the timeout is reached
    }
-   if err := wait.ExponentialBackoff(backoff, condition); err != nil {
+
+   // Sanity check to make sure we have a minimum duration of 1 second to work with
+   maxBackoffDuration := max(*retryCRDIntervalMax, 1*time.Second)
+   ctx, cancel := context.WithTimeout(context.Background(), maxBackoffDuration)
+   defer cancel()
+   if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil {
        return err
    }
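To see the new control flow in isolation: a minimal, runnable sketch of the pattern adopted above, assuming a recent `k8s.io/apimachinery` whose `wait.ExponentialBackoffWithContext` takes a condition of the form `func(context.Context) (bool, error)` (as the hunk does). The condition here is a stand-in for the CRD check; the backoff drives the per-attempt delays while the context timeout, which plays the role of `--retry-crd-interval-max`, bounds the total wait:

```go
package main

import (
	"context"
	"fmt"
	"math"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	backoff := wait.Backoff{
		Duration: 100 * time.Millisecond, // first retry delay
		Factor:   1.5,                    // each delay is 1.5x the previous one
		Steps:    math.MaxInt32,          // effectively unlimited attempts; the context ends the wait
	}

	// Stands in for max(*retryCRDIntervalMax, 1*time.Second) in the controller.
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	attempts := 0
	condition := func(ctx context.Context) (bool, error) {
		attempts++
		return false, nil // pretend the CRDs never become available
	}

	err := wait.ExponentialBackoffWithContext(ctx, backoff, condition)
	fmt.Printf("gave up after %d attempts: %v\n", attempts, err)
}
```

This is also why the manifest change below raises `minReadySeconds` above the flag's default: a rolling update should not treat a pod as available before the controller has had its full window to find the CRDs (and exit if it cannot).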
@@ -16,10 +16,11 @@ spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: snapshot-controller
-  # the snapshot controller won't be marked as ready if the v1 CRDs are unavailable
-  # in #504 the snapshot-controller will exit after around 7.5 seconds if it
-  # can't find the v1 CRDs so this value should be greater than that
-  minReadySeconds: 15
+  # The snapshot controller won't be marked as ready if the v1 CRDs are unavailable.
+  # The flag --retry-crd-interval-max is used to determine how long the controller
+  # will wait for the CRDs to become available before exiting. The default is 30 seconds
+  # so minReadySeconds should be set slightly higher than the flag value.
+  minReadySeconds: 35
  strategy:
    rollingUpdate:
      maxSurge: 0