diff --git a/README.md b/README.md index eeb4378c..a68179e3 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh ##### Volume Snapshot Classes -* There can only be a single default volume snapshot class for a particular driver. +* There can only be a single default volume snapshot class for a particular driver. ### Distributed Snapshotting @@ -139,11 +139,11 @@ The distributed snapshotting feature is provided to handle snapshot operations f #### Snapshot controller option -* `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default. +* `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default. #### CSI external snapshotter sidecar option -* `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default. +* `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default. Other than this, the NODE_NAME environment variable must be set where the CSI snapshotter sidecar is deployed. The value of NODE_NAME should be the name of the node where the sidecar is running. @@ -174,7 +174,7 @@ Other than this, the NODE_NAME environment variable must be set where the CSI sn * `--retry-interval-max`: Maximum retry interval of failed volume snapshot creation or deletion. Default value is 5 minutes. -* `--retry-crd-interval-max`: Maximum retry interval for detecting the snapshot CRDs on controller startup. Default is 5 seconds. +* `--retry-crd-interval-max`: Maximum retry duration for detecting the snapshot CRDs on controller startup. Default is 30 seconds. * `--enable-distributed-snapshotting` : Enables each node to handle snapshots for the volumes local to that node. Off by default. It should be set to true only if `--node-deployment` parameter for the csi external snapshotter sidecar is set to true. See https://github.com/kubernetes-csi/external-snapshotter/blob/master/README.md#distributed-snapshotting for details. diff --git a/cmd/snapshot-controller/main.go b/cmd/snapshot-controller/main.go index 09ddb6c1..e336ba0a 100644 --- a/cmd/snapshot-controller/main.go +++ b/cmd/snapshot-controller/main.go @@ -74,46 +74,49 @@ var ( preventVolumeModeConversion = flag.Bool("prevent-volume-mode-conversion", true, "Prevents an unauthorised user from modifying the volume mode when creating a PVC from an existing VolumeSnapshot.") enableVolumeGroupSnapshots = flag.Bool("enable-volume-group-snapshots", false, "Enables the volume group snapshot feature, allowing the user to create a snapshot of a group of volumes.") - retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 5*time.Second, "Maximum retry interval to wait for CRDs to appear. The default is 5 seconds.") + retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 30*time.Second, "Maximum time to wait for CRDs to appear. The default is 30 seconds.") ) var version = "unknown" -// Checks that the VolumeSnapshot v1 CRDs exist. +// Checks that the VolumeSnapshot v1 CRDs exist. It will wait at most the duration specified by retryCRDIntervalMax func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVolumeGroupSnapshots bool) error { - condition := func() (bool, error) { + condition := func(ctx context.Context) (bool, error) { var err error + // List calls should return faster with a limit of 0. + // We do not care about what is returned and just want to make sure the CRDs exist. + listOptions := metav1.ListOptions{Limit: 0} // scoping to an empty namespace makes `List` work across all namespaces - _, err = client.SnapshotV1().VolumeSnapshots("").List(context.TODO(), metav1.ListOptions{}) + _, err = client.SnapshotV1().VolumeSnapshots("").List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1 volumesnapshots with error=%+v", err) return false, nil } - _, err = client.SnapshotV1().VolumeSnapshotClasses().List(context.TODO(), metav1.ListOptions{}) + _, err = client.SnapshotV1().VolumeSnapshotClasses().List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1 volumesnapshotclasses with error=%+v", err) return false, nil } - _, err = client.SnapshotV1().VolumeSnapshotContents().List(context.TODO(), metav1.ListOptions{}) + _, err = client.SnapshotV1().VolumeSnapshotContents().List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1 volumesnapshotcontents with error=%+v", err) return false, nil } if enableVolumeGroupSnapshots { - _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(context.TODO(), metav1.ListOptions{}) + _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1alpha1 volumegroupsnapshots with error=%+v", err) return false, nil } - _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(context.TODO(), metav1.ListOptions{}) + _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotclasses with error=%+v", err) return false, nil } - _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(context.TODO(), metav1.ListOptions{}) + _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(ctx, listOptions) if err != nil { klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotcontents with error=%+v", err) return false, nil @@ -123,24 +126,19 @@ func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVol return true, nil } - // The maximum retry duration = initial duration * retry factor ^ # steps. Rearranging, this gives - // # steps = log(maximum retry / initial duration) / log(retry factor). const retryFactor = 1.5 - const initialDurationMs = 100 - maxMs := retryCRDIntervalMax.Milliseconds() - if maxMs < initialDurationMs { - maxMs = initialDurationMs - } - steps := int(math.Ceil(math.Log(float64(maxMs)/initialDurationMs) / math.Log(retryFactor))) - if steps < 1 { - steps = 1 - } + const initialDuration = 100 * time.Millisecond backoff := wait.Backoff{ - Duration: initialDurationMs * time.Millisecond, + Duration: initialDuration, Factor: retryFactor, - Steps: steps, + Steps: math.MaxInt32, // effectively no limit until the timeout is reached } - if err := wait.ExponentialBackoff(backoff, condition); err != nil { + + // Sanity check to make sure we have a minimum duration of 1 second to work with + maxBackoffDuration := max(*retryCRDIntervalMax, 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), maxBackoffDuration) + defer cancel() + if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil { return err } diff --git a/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml b/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml index 40dfefdd..5ac76262 100644 --- a/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml +++ b/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml @@ -16,10 +16,11 @@ spec: selector: matchLabels: app.kubernetes.io/name: snapshot-controller - # the snapshot controller won't be marked as ready if the v1 CRDs are unavailable - # in #504 the snapshot-controller will exit after around 7.5 seconds if it - # can't find the v1 CRDs so this value should be greater than that - minReadySeconds: 15 + # The snapshot controller won't be marked as ready if the v1 CRDs are unavailable. + # The flag --retry-crd-interval-max is used to determine how long the controller + # will wait for the CRDs to become available before exiting. The default is 30 seconds + # so minReadySeconds should be set slightly higher than the flag value. + minReadySeconds: 35 strategy: rollingUpdate: maxSurge: 0