Bumping k8s dependencies to 1.13

2018-11-16 14:08:25 -08:00
parent 305407125c
commit b4c0b68ec7
8002 changed files with 884099 additions and 276228 deletions
--- a/vendor/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager.go
+++ b/vendor/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager.go
@@ -39,6 +39,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/metrics"
+	watcher "k8s.io/kubernetes/pkg/kubelet/util/pluginwatcher"
 	schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
 )

@@ -48,14 +49,14 @@ type ActivePodsFunc func() []*v1.Pod
 // monitorCallback is the function called when a device's health state changes,
 // or new devices are reported, or old devices are deleted.
 // Updated contains the most recent state of the Device.
-type monitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device)
+type monitorCallback func(resourceName string, devices []pluginapi.Device)

 // ManagerImpl is the structure in charge of managing Device Plugins.
 type ManagerImpl struct {
 	socketname string
 	socketdir  string

-	endpoints map[string]endpoint // Key is ResourceName
+	endpoints map[string]endpointInfo // Key is ResourceName
 	mutex     sync.Mutex

 	server *grpc.Server
@@ -85,10 +86,14 @@ type ManagerImpl struct {

 	// podDevices contains pod to allocated device mapping.
 	podDevices        podDevices
-	pluginOpts        map[string]*pluginapi.DevicePluginOptions
 	checkpointManager checkpointmanager.CheckpointManager
 }

+type endpointInfo struct {
+	e    endpoint
+	opts *pluginapi.DevicePluginOptions
+}
+
 type sourcesReadyStub struct{}

 func (s *sourcesReadyStub) AddSource(source string) {}
@@ -103,18 +108,18 @@ func newManagerImpl(socketPath string) (*ManagerImpl, error) {
 	glog.V(2).Infof("Creating Device Plugin manager at %s", socketPath)

 	if socketPath == "" || !filepath.IsAbs(socketPath) {
-		return nil, fmt.Errorf(errBadSocket+" %v", socketPath)
+		return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
 	}

 	dir, file := filepath.Split(socketPath)
 	manager := &ManagerImpl{
-		endpoints:        make(map[string]endpoint),
+		endpoints: make(map[string]endpointInfo),
+
 		socketname:       file,
 		socketdir:        dir,
 		healthyDevices:   make(map[string]sets.String),
 		unhealthyDevices: make(map[string]sets.String),
 		allocatedDevices: make(map[string]sets.String),
-		pluginOpts:       make(map[string]*pluginapi.DevicePluginOptions),
 		podDevices:       make(podDevices),
 	}
 	manager.callback = manager.genericDeviceUpdateCallback
@@ -125,35 +130,24 @@ func newManagerImpl(socketPath string) (*ManagerImpl, error) {
 	manager.sourcesReady = &sourcesReadyStub{}
 	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
 	if err != nil {
-		return nil, fmt.Errorf("failed to initialize checkpoint manager: %+v", err)
+		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
 	}
 	manager.checkpointManager = checkpointManager

 	return manager, nil
 }

-func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, added, updated, deleted []pluginapi.Device) {
-	kept := append(updated, added...)
+func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
 	m.mutex.Lock()
-	if _, ok := m.healthyDevices[resourceName]; !ok {
-		m.healthyDevices[resourceName] = sets.NewString()
-	}
-	if _, ok := m.unhealthyDevices[resourceName]; !ok {
-		m.unhealthyDevices[resourceName] = sets.NewString()
-	}
-	for _, dev := range kept {
+	m.healthyDevices[resourceName] = sets.NewString()
+	m.unhealthyDevices[resourceName] = sets.NewString()
+	for _, dev := range devices {
 		if dev.Health == pluginapi.Healthy {
 			m.healthyDevices[resourceName].Insert(dev.ID)
-			m.unhealthyDevices[resourceName].Delete(dev.ID)
 		} else {
 			m.unhealthyDevices[resourceName].Insert(dev.ID)
-			m.healthyDevices[resourceName].Delete(dev.ID)
 		}
 	}
-	for _, dev := range deleted {
-		m.healthyDevices[resourceName].Delete(dev.ID)
-		m.unhealthyDevices[resourceName].Delete(dev.ID)
-	}
 	m.mutex.Unlock()
 	m.writeCheckpoint()
 }
@@ -175,7 +169,7 @@ func (m *ManagerImpl) removeContents(dir string) error {
 		}
 		stat, err := os.Stat(filePath)
 		if err != nil {
-			glog.Errorf("Failed to stat file %v: %v", filePath, err)
+			glog.Errorf("Failed to stat file %s: %v", filePath, err)
 			continue
 		}
 		if stat.IsDir() {
@@ -199,7 +193,6 @@ func (m *ManagerImpl) checkpointFile() string {
 // starts device plugin registration service.
 func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
 	glog.V(2).Infof("Starting Device Plugin manager")
-	fmt.Println("Starting Device Plugin manager")

 	m.activePods = activePods
 	m.sourcesReady = sourcesReady
@@ -216,12 +209,12 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
 	// Removes all stale sockets in m.socketdir. Device plugins can monitor
 	// this and use it as a signal to re-register with the new Kubelet.
 	if err := m.removeContents(m.socketdir); err != nil {
-		glog.Errorf("Fail to clean up stale contents under %s: %+v", m.socketdir, err)
+		glog.Errorf("Fail to clean up stale contents under %s: %v", m.socketdir, err)
 	}

 	s, err := net.Listen("unix", socketPath)
 	if err != nil {
-		glog.Errorf(errListenSocket+" %+v", err)
+		glog.Errorf(errListenSocket+" %v", err)
 		return err
 	}

@@ -239,19 +232,82 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
 	return nil
 }

-// Devices is the map of devices that are known by the Device
-// Plugin manager with the kind of the devices as key
-func (m *ManagerImpl) Devices() map[string][]pluginapi.Device {
+// GetWatcherHandler returns the plugin handler
+func (m *ManagerImpl) GetWatcherHandler() watcher.PluginHandler {
+	if f, err := os.Create(m.socketdir + "DEPRECATION"); err != nil {
+		glog.Errorf("Failed to create deprecation file at %s", m.socketdir)
+	} else {
+		f.Close()
+		glog.V(4).Infof("created deprecation file %s", f.Name())
+	}
+
+	return watcher.PluginHandler(m)
+}
+
+// ValidatePlugin validates a plugin if the version is correct and the name has the format of an extended resource
+func (m *ManagerImpl) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
+	glog.V(2).Infof("Got Plugin %s at endpoint %s with versions %v", pluginName, endpoint, versions)
+
+	if !m.isVersionCompatibleWithPlugin(versions) {
+		return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", pluginapi.Version, versions)
+	}
+
+	if !v1helper.IsExtendedResourceName(v1.ResourceName(pluginName)) {
+		return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
+	}
+
+	return nil
+}
+
+// RegisterPlugin starts the endpoint and registers it
+// TODO: Start the endpoint and wait for the First ListAndWatch call
+//       before registering the plugin
+func (m *ManagerImpl) RegisterPlugin(pluginName string, endpoint string) error {
+	glog.V(2).Infof("Registering Plugin %s at endpoint %s", pluginName, endpoint)
+
+	e, err := newEndpointImpl(endpoint, pluginName, m.callback)
+	if err != nil {
+		return fmt.Errorf("Failed to dial device plugin with socketPath %s: %v", endpoint, err)
+	}
+
+	options, err := e.client.GetDevicePluginOptions(context.Background(), &pluginapi.Empty{})
+	if err != nil {
+		return fmt.Errorf("Failed to get device plugin options: %v", err)
+	}
+
+	m.registerEndpoint(pluginName, options, e)
+	go m.runEndpoint(pluginName, e)
+
+	return nil
+}
+
+// DeRegisterPlugin deregisters the plugin
+// TODO work on the behavior for deregistering plugins
+// e.g: Should we delete the resource
+func (m *ManagerImpl) DeRegisterPlugin(pluginName string) {
 	m.mutex.Lock()
 	defer m.mutex.Unlock()

-	devs := make(map[string][]pluginapi.Device)
-	for k, e := range m.endpoints {
-		glog.V(3).Infof("Endpoint: %+v: %p", k, e)
-		devs[k] = e.getDevices()
+	// Note: This will mark the resource unhealthy as per the behavior
+	// in runEndpoint
+	if eI, ok := m.endpoints[pluginName]; ok {
+		eI.e.stop()
 	}
+}

-	return devs
+func (m *ManagerImpl) isVersionCompatibleWithPlugin(versions []string) bool {
+	// TODO(vikasc): Currently this is fine as we only have a single supported version. When we do need to support
+	// multiple versions in the future, we may need to extend this function to return a supported version.
+	// E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
+	// this function should return v1beta1
+	for _, version := range versions {
+		for _, supportedVersion := range pluginapi.SupportedVersions {
+			if version == supportedVersion {
+				return true
+			}
+		}
+	}
+	return false
 }

 // Allocate is the call that you can use to allocate a set of devices
@@ -259,7 +315,6 @@ func (m *ManagerImpl) Devices() map[string][]pluginapi.Device {
 func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
 	pod := attrs.Pod
 	devicesToReuse := make(map[string]sets.String)
-	// TODO: Reuse devices between init containers and regular containers.
 	for _, container := range pod.Spec.InitContainers {
 		if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
 			return err
@@ -298,13 +353,13 @@ func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest
 	}
 	if !versionCompatible {
 		errorString := fmt.Sprintf(errUnsupportedVersion, r.Version, pluginapi.SupportedVersions)
-		glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
+		glog.Infof("Bad registration request from device plugin with resource name %q: %s", r.ResourceName, errorString)
 		return &pluginapi.Empty{}, fmt.Errorf(errorString)
 	}

 	if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
 		errorString := fmt.Sprintf(errInvalidResourceName, r.ResourceName)
-		glog.Infof("Bad registration request from device plugin: %v", errorString)
+		glog.Infof("Bad registration request from device plugin: %s", errorString)
 		return &pluginapi.Empty{}, fmt.Errorf(errorString)
 	}

@@ -323,8 +378,8 @@ func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest
 func (m *ManagerImpl) Stop() error {
 	m.mutex.Lock()
 	defer m.mutex.Unlock()
-	for _, e := range m.endpoints {
-		e.stop()
+	for _, eI := range m.endpoints {
+		eI.e.stop()
 	}

 	if m.server == nil {
@@ -336,61 +391,37 @@ func (m *ManagerImpl) Stop() error {
 	return nil
 }

-func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
-	existingDevs := make(map[string]pluginapi.Device)
+func (m *ManagerImpl) registerEndpoint(resourceName string, options *pluginapi.DevicePluginOptions, e endpoint) {
 	m.mutex.Lock()
-	old, ok := m.endpoints[r.ResourceName]
-	if ok && old != nil {
-		// Pass devices of previous endpoint into re-registered one,
-		// to avoid potential orphaned devices upon re-registration
-		devices := make(map[string]pluginapi.Device)
-		for _, device := range old.getDevices() {
-			device.Health = pluginapi.Unhealthy
-			devices[device.ID] = device
-		}
-		existingDevs = devices
-	}
-	m.mutex.Unlock()
+	defer m.mutex.Unlock()

-	socketPath := filepath.Join(m.socketdir, r.Endpoint)
-	e, err := newEndpointImpl(socketPath, r.ResourceName, existingDevs, m.callback)
+	m.endpoints[resourceName] = endpointInfo{e: e, opts: options}
+	glog.V(2).Infof("Registered endpoint %v", e)
+}
+
+func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
+	e.run()
+	e.stop()
+
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+
+	if old, ok := m.endpoints[resourceName]; ok && old.e == e {
+		m.markResourceUnhealthy(resourceName)
+	}
+
+	glog.V(2).Infof("Endpoint (%s, %v) became unhealthy", resourceName, e)
+}
+
+func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
+	new, err := newEndpointImpl(filepath.Join(m.socketdir, r.Endpoint), r.ResourceName, m.callback)
 	if err != nil {
 		glog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
 		return
 	}
-	m.mutex.Lock()
-	if r.Options != nil {
-		m.pluginOpts[r.ResourceName] = r.Options
-	}
-	// Check for potential re-registration during the initialization of new endpoint,
-	// and skip updating if re-registration happens.
-	// TODO: simplify the part once we have a better way to handle registered devices
-	ext := m.endpoints[r.ResourceName]
-	if ext != old {
-		glog.Warningf("Some other endpoint %v is added while endpoint %v is initialized", ext, e)
-		m.mutex.Unlock()
-		e.stop()
-		return
-	}
-	// Associates the newly created endpoint with the corresponding resource name.
-	// Stops existing endpoint if there is any.
-	m.endpoints[r.ResourceName] = e
-	glog.V(2).Infof("Registered endpoint %v", e)
-	m.mutex.Unlock()
-
-	if old != nil {
-		old.stop()
-	}
-
+	m.registerEndpoint(r.ResourceName, r.Options, new)
 	go func() {
-		e.run()
-		e.stop()
-		m.mutex.Lock()
-		if old, ok := m.endpoints[r.ResourceName]; ok && old == e {
-			m.markResourceUnhealthy(r.ResourceName)
-		}
-		glog.V(2).Infof("Unregistered endpoint %v", e)
-		m.mutex.Unlock()
+		m.runEndpoint(r.ResourceName, new)
 	}()
 }

@@ -426,8 +457,8 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 	deletedResources := sets.NewString()
 	m.mutex.Lock()
 	for resourceName, devices := range m.healthyDevices {
-		e, ok := m.endpoints[resourceName]
-		if (ok && e.stopGracePeriodExpired()) || !ok {
+		eI, ok := m.endpoints[resourceName]
+		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
 			// The resources contained in endpoints and (un)healthyDevices
 			// should always be consistent. Otherwise, we run with the risk
 			// of failing to garbage collect non-existing resources or devices.
@@ -444,8 +475,8 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 		}
 	}
 	for resourceName, devices := range m.unhealthyDevices {
-		e, ok := m.endpoints[resourceName]
-		if (ok && e.stopGracePeriodExpired()) || !ok {
+		eI, ok := m.endpoints[resourceName]
+		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
 			if !ok {
 				glog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
 			}
@@ -508,7 +539,7 @@ func (m *ManagerImpl) readCheckpoint() error {
 		// will stay zero till the corresponding device plugin re-registers.
 		m.healthyDevices[resource] = sets.NewString()
 		m.unhealthyDevices[resource] = sets.NewString()
-		m.endpoints[resource] = newStoppedEndpointImpl(resource, make(map[string]pluginapi.Device))
+		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
 	}
 	return nil
 }
@@ -551,17 +582,17 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 		// A pod's resource is not expected to change once admitted by the API server,
 		// so just fail loudly here. We can revisit this part if this no longer holds.
 		if needed != 0 {
-			return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required)
+			return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", podUID, contName, resource, devices.Len(), required)
 		}
 	}
 	if needed == 0 {
 		// No change, no work.
 		return nil, nil
 	}
-	glog.V(3).Infof("Needs to allocate %v %v for pod %q container %q", needed, resource, podUID, contName)
+	glog.V(3).Infof("Needs to allocate %d %q for pod %q container %q", needed, resource, podUID, contName)
 	// Needs to allocate additional devices.
 	if _, ok := m.healthyDevices[resource]; !ok {
-		return nil, fmt.Errorf("can't allocate unregistered device %v", resource)
+		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
 	}
 	devices = sets.NewString()
 	// Allocates from reusableDevices list first.
@@ -641,7 +672,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		// plugin Allocate grpc calls if it becomes common that a container may require
 		// resources from multiple device plugins.
 		m.mutex.Lock()
-		e, ok := m.endpoints[resource]
+		eI, ok := m.endpoints[resource]
 		m.mutex.Unlock()
 		if !ok {
 			m.mutex.Lock()
@@ -654,7 +685,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		// TODO: refactor this part of code to just append a ContainerAllocationRequest
 		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
 		glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
-		resp, err := e.allocate(devs)
+		resp, err := eI.e.allocate(devs)
 		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
 		if err != nil {
 			// In case of allocation failure, we want to restore m.allocatedDevices
@@ -665,6 +696,10 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 			return err
 		}

+		if len(resp.ContainerResponses) == 0 {
+			return fmt.Errorf("No containers return in allocation response %v", resp)
+		}
+
 		// Update internal cached podDevices state.
 		m.mutex.Lock()
 		m.podDevices.insert(podUID, contName, resource, allocDevices, resp.ContainerResponses[0])
@@ -700,16 +735,15 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
 // with PreStartRequired option set.
 func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource string) error {
 	m.mutex.Lock()
-	opts, ok := m.pluginOpts[resource]
+	eI, ok := m.endpoints[resource]
 	if !ok {
 		m.mutex.Unlock()
-		glog.V(4).Infof("Plugin options not found in cache for resource: %s. Skip PreStartContainer", resource)
-		return nil
+		return fmt.Errorf("endpoint not found in cache for a registered resource: %s", resource)
 	}

-	if !opts.PreStartRequired {
+	if eI.opts == nil || !eI.opts.PreStartRequired {
 		m.mutex.Unlock()
-		glog.V(4).Infof("Plugin options indicate to skip PreStartContainer for resource, %v", resource)
+		glog.V(4).Infof("Plugin options indicate to skip PreStartContainer for resource: %s", resource)
 		return nil
 	}

@@ -719,16 +753,10 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
 		return fmt.Errorf("no devices found allocated in local cache for pod %s, container %s, resource %s", podUID, contName, resource)
 	}

-	e, ok := m.endpoints[resource]
-	if !ok {
-		m.mutex.Unlock()
-		return fmt.Errorf("endpoint not found in cache for a registered resource: %s", resource)
-	}
-
 	m.mutex.Unlock()
 	devs := devices.UnsortedList()
 	glog.V(4).Infof("Issuing an PreStartContainer call for container, %s, of pod %s", contName, podUID)
-	_, err := e.preStartContainer(devs)
+	_, err := eI.e.preStartContainer(devs)
 	if err != nil {
 		return fmt.Errorf("device plugin PreStartContainer rpc failed with err: %v", err)
 	}