[snapshot-controller] Fix wait for CRDs duration
Ensure that `ensureCustomResourceDefinitionsExist` returns once the duration specified by the `--retry-crd-interval-max` flag has elapsed. This works by passing a context to the backoff so that we can time out via `context.WithTimeout`. The new (correct) default for the flag is 30s, which gives a bit more time when the controller and the CRDs are created at the same time. Also set the limit on resources returned by the list calls to 0 to speed up the operations, since we only care that the CRDs exist.
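For context, here is a minimal, self-contained sketch of the retry pattern this commit adopts: an exponential backoff whose overall lifetime is bounded by a context timeout rather than a precomputed step count. The probe function and the 30s timeout mirror the shape of the real code but are stand-ins, not the controller's actual implementation.

```go
package main

import (
	"context"
	"fmt"
	"math"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Stand-in for the real condition: the controller issues List calls
	// against the snapshot CRDs and returns true once they all succeed.
	condition := func(ctx context.Context) (bool, error) {
		return false, nil // pretend the CRDs never appear
	}

	backoff := wait.Backoff{
		Duration: 100 * time.Millisecond, // first retry interval
		Factor:   1.5,                    // grow the interval 1.5x per step
		Steps:    math.MaxInt32,          // no step limit; the context bounds the retries
	}

	// The context timeout is what actually ends the wait.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil {
		fmt.Printf("gave up waiting for the CRDs: %v\n", err)
	}
}
```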
```diff
@@ -131,7 +131,7 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh
 
 ##### Volume Snapshot Classes
 
 * There can only be a single default volume snapshot class for a particular driver.
 
 ### Distributed Snapshotting
 
```
```diff
@@ -139,11 +139,11 @@ The distributed snapshotting feature is provided to handle snapshot operations f
 
 #### Snapshot controller option
 
 * `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default.
 
 #### CSI external snapshotter sidecar option
 
 * `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default.
 
 Other than this, the NODE_NAME environment variable must be set where the CSI snapshotter sidecar is deployed. The value of NODE_NAME should be the name of the node where the sidecar is running.
 
```
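To make the NODE_NAME requirement above concrete, here is a sketch of the relevant fragment of a sidecar pod spec, assuming the standard downward-API wiring; the container name and image tag are illustrative placeholders:

```yaml
# Sketch of a csi-snapshotter sidecar container; name and tag are placeholders.
containers:
  - name: csi-snapshotter
    image: registry.k8s.io/sig-storage/csi-snapshotter:v6.3.0  # placeholder tag
    args:
      - "--node-deployment"
    env:
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: spec.nodeName  # injects the node's name at runtime
```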
```diff
@@ -174,7 +174,7 @@ Other than this, the NODE_NAME environment variable must be set where the CSI sn
 
 * `--retry-interval-max`: Maximum retry interval of failed volume snapshot creation or deletion. Default value is 5 minutes.
 
-* `--retry-crd-interval-max`: Maximum retry interval for detecting the snapshot CRDs on controller startup. Default is 5 seconds.
+* `--retry-crd-interval-max`: Maximum retry duration for detecting the snapshot CRDs on controller startup. Default is 30 seconds.
 
 * `--enable-distributed-snapshotting` : Enables each node to handle snapshots for the volumes local to that node. Off by default. It should be set to true only if `--node-deployment` parameter for the csi external snapshotter sidecar is set to true. See https://github.com/kubernetes-csi/external-snapshotter/blob/master/README.md#distributed-snapshotting for details.
 
```
```diff
@@ -74,46 +74,49 @@ var (
 	preventVolumeModeConversion = flag.Bool("prevent-volume-mode-conversion", true, "Prevents an unauthorised user from modifying the volume mode when creating a PVC from an existing VolumeSnapshot.")
 	enableVolumeGroupSnapshots  = flag.Bool("enable-volume-group-snapshots", false, "Enables the volume group snapshot feature, allowing the user to create a snapshot of a group of volumes.")
 
-	retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 5*time.Second, "Maximum retry interval to wait for CRDs to appear. The default is 5 seconds.")
+	retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 30*time.Second, "Maximum time to wait for CRDs to appear. The default is 30 seconds.")
 )
 
 var version = "unknown"
 
-// Checks that the VolumeSnapshot v1 CRDs exist.
+// Checks that the VolumeSnapshot v1 CRDs exist. It will wait at most the duration specified by retryCRDIntervalMax
 func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVolumeGroupSnapshots bool) error {
-	condition := func() (bool, error) {
+	condition := func(ctx context.Context) (bool, error) {
 		var err error
+		// List calls should return faster with a limit of 0.
+		// We do not care about what is returned and just want to make sure the CRDs exist.
+		listOptions := metav1.ListOptions{Limit: 0}
 
 		// scoping to an empty namespace makes `List` work across all namespaces
-		_, err = client.SnapshotV1().VolumeSnapshots("").List(context.TODO(), metav1.ListOptions{})
+		_, err = client.SnapshotV1().VolumeSnapshots("").List(ctx, listOptions)
 		if err != nil {
 			klog.Errorf("Failed to list v1 volumesnapshots with error=%+v", err)
 			return false, nil
 		}
 
-		_, err = client.SnapshotV1().VolumeSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+		_, err = client.SnapshotV1().VolumeSnapshotClasses().List(ctx, listOptions)
 		if err != nil {
 			klog.Errorf("Failed to list v1 volumesnapshotclasses with error=%+v", err)
 			return false, nil
 		}
-		_, err = client.SnapshotV1().VolumeSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+		_, err = client.SnapshotV1().VolumeSnapshotContents().List(ctx, listOptions)
 		if err != nil {
 			klog.Errorf("Failed to list v1 volumesnapshotcontents with error=%+v", err)
 			return false, nil
 		}
 		if enableVolumeGroupSnapshots {
-			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(context.TODO(), metav1.ListOptions{})
+			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(ctx, listOptions)
 			if err != nil {
 				klog.Errorf("Failed to list v1alpha1 volumegroupsnapshots with error=%+v", err)
 				return false, nil
 			}
 
-			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(ctx, listOptions)
 			if err != nil {
 				klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotclasses with error=%+v", err)
 				return false, nil
 			}
-			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+			_, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(ctx, listOptions)
 			if err != nil {
 				klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotcontents with error=%+v", err)
 				return false, nil
```
```diff
@@ -123,24 +126,19 @@ func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVol
 		return true, nil
 	}
 
-	// The maximum retry duration = initial duration * retry factor ^ # steps. Rearranging, this gives
-	// # steps = log(maximum retry / initial duration) / log(retry factor).
 	const retryFactor = 1.5
-	const initialDurationMs = 100
-	maxMs := retryCRDIntervalMax.Milliseconds()
-	if maxMs < initialDurationMs {
-		maxMs = initialDurationMs
-	}
-	steps := int(math.Ceil(math.Log(float64(maxMs)/initialDurationMs) / math.Log(retryFactor)))
-	if steps < 1 {
-		steps = 1
-	}
+	const initialDuration = 100 * time.Millisecond
 	backoff := wait.Backoff{
-		Duration: initialDurationMs * time.Millisecond,
+		Duration: initialDuration,
 		Factor:   retryFactor,
-		Steps:    steps,
+		Steps:    math.MaxInt32, // effectively no limit until the timeout is reached
 	}
-	if err := wait.ExponentialBackoff(backoff, condition); err != nil {
+
+	// Sanity check to make sure we have a minimum duration of 1 second to work with
+	maxBackoffDuration := max(*retryCRDIntervalMax, 1*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), maxBackoffDuration)
+	defer cancel()
+	if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil {
 		return err
 	}
 
```
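To see why the removed step computation gave up early, plug in the old defaults: it treated the flag as a cap on the per-retry interval, not on the total wait. With an initial interval of 100ms, a factor of 1.5, and a 5s cap, it computed steps = ceil(log(5000/100) / log(1.5)) = ceil(9.65) = 10 attempts, whose intervening waits of 100ms · 1.5^i sum to about 100ms · (1.5^9 − 1) / 0.5 ≈ 7.5s; that is the "around 7.5 seconds" figure cited in the old deployment manifest comment below. After this change, the flag is the total timeout itself, enforced by the context.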
```diff
@@ -16,10 +16,11 @@ spec:
   selector:
     matchLabels:
       app.kubernetes.io/name: snapshot-controller
-  # the snapshot controller won't be marked as ready if the v1 CRDs are unavailable
-  # in #504 the snapshot-controller will exit after around 7.5 seconds if it
-  # can't find the v1 CRDs so this value should be greater than that
-  minReadySeconds: 15
+  # The snapshot controller won't be marked as ready if the v1 CRDs are unavailable.
+  # The flag --retry-crd-interval-max is used to determine how long the controller
+  # will wait for the CRDs to become available before exiting. The default is 30 seconds
+  # so minReadySeconds should be set slightly higher than the flag value.
+  minReadySeconds: 35
   strategy:
     rollingUpdate:
       maxSurge: 0
```
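The manifest comment above encodes a coupling worth preserving when tuning: `minReadySeconds` should stay above the CRD wait duration so that a pod about to exit because the CRDs are missing is never counted as available. A hypothetical override that waits up to 60 seconds might look like this (values are illustrative, not from this repository):

```yaml
# Hypothetical tuning; keep minReadySeconds above --retry-crd-interval-max.
spec:
  minReadySeconds: 65
  template:
    spec:
      containers:
        - name: snapshot-controller
          args:
            - "--retry-crd-interval-max=60s"
```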