From d64afb1f31d2b17375691c09abcbb3a0ca420d70 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Mon, 2 Dec 2024 12:58:42 -0800 Subject: [PATCH 1/9] vfio-pci device config API Signed-off-by: Varun Ramachandra Sekar --- .../resource/gpu/v1alpha1/driverconfig.go | 60 +++++++++++++++++++ .../resource/gpu/v1alpha1/gpuconfig.go | 37 ++++++++++-- .../gpu/v1alpha1/zz_generated.deepcopy.go | 21 +++++++ 3 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go diff --git a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go new file mode 100644 index 00000000..27349726 --- /dev/null +++ b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import "fmt" + +// GpuDriver encodes the gpu driver as a string. +type GpuDriver string + +const ( + NvidiaDriver GpuDriver = "nvidia" + VfioPciDriver GpuDriver = "vfio-pci" +) + +// GpuDriverConfig holds the set of parameters for configuring a GPU with a driver. +type GpuDriverConfig struct { + Driver GpuDriver `json:"driver"` +} + +// DefaultGpuDriverConfig provides the default configuration of a GPU with a driver. 
+func DefaultGpuDriverConfig() *GpuDriverConfig { + return &GpuDriverConfig{ + Driver: NvidiaDriver, + } +} + +// Normalize updates a GpuDriverConfig config with implied default values based on other settings. +func (c *GpuDriverConfig) Normalize() error { + if c.Driver == "" { + c.Driver = NvidiaDriver + } + return nil +} + +// Validate ensures that GpuDriverConfig has a valid set of values. +func (c *GpuDriverConfig) Validate() error { + switch c.Driver { + case NvidiaDriver: + fallthrough + case VfioPciDriver: + break + default: + return fmt.Errorf("invalid driver specified in gpu driver configuration") + } + return nil +} diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go index d14699fd..bf6b5765 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go @@ -29,7 +29,8 @@ import ( // GpuConfig holds the set of parameters for configuring a GPU. type GpuConfig struct { metav1.TypeMeta `json:",inline"` - Sharing *GpuSharing `json:"sharing,omitempty"` + Sharing *GpuSharing `json:"sharing,omitempty"` + DriverConfig *GpuDriverConfig `json:"driverConfig,omitempty"` } // DefaultGpuConfig provides the default GPU configuration. @@ -45,11 +46,26 @@ func DefaultGpuConfig() *GpuConfig { Interval: ptr.To(DefaultTimeSlice), }, }, + DriverConfig: &GpuDriverConfig{ + Driver: NvidiaDriver, + }, } } // Normalize updates a GpuConfig config with implied default values based on other settings. func (c *GpuConfig) Normalize() error { + if c.DriverConfig == nil { + c.DriverConfig = DefaultGpuDriverConfig() + } else { + if err := c.DriverConfig.Normalize(); err != nil { + return err + } + } + // If driver is not Nvidia, don't proceed with normalizing sharing configuration. 
+ if c.DriverConfig.Driver != NvidiaDriver { + return nil + } + if c.Sharing == nil { c.Sharing = &GpuSharing{ Strategy: TimeSlicingStrategy, @@ -68,8 +84,21 @@ func (c *GpuConfig) Normalize() error { // Validate ensures that GpuConfig has a valid set of values. func (c *GpuConfig) Validate() error { - if c.Sharing == nil { - return fmt.Errorf("no sharing strategy set") + if c.DriverConfig.Driver == NvidiaDriver { + if c.Sharing == nil { + return fmt.Errorf("no sharing strategy set") + } + if err := c.Sharing.Validate(); err != nil { + return err + } + } else { + if c.Sharing != nil { + return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver") + } } - return c.Sharing.Validate() + if err := c.DriverConfig.Validate(); err != nil { + return err + } + + return nil } diff --git a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go index 86a9f407..7cf34e00 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go @@ -1,4 +1,5 @@ //go:build !ignore_autogenerated +// +build !ignore_autogenerated /* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. @@ -33,6 +34,11 @@ func (in *GpuConfig) DeepCopyInto(out *GpuConfig) { *out = new(GpuSharing) (*in).DeepCopyInto(*out) } + if in.DriverConfig != nil { + in, out := &in.DriverConfig, &out.DriverConfig + *out = new(GpuDriverConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuConfig. @@ -53,6 +59,21 @@ func (in *GpuConfig) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GpuDriverConfig) DeepCopyInto(out *GpuDriverConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuDriverConfig. 
+func (in *GpuDriverConfig) DeepCopy() *GpuDriverConfig { + if in == nil { + return nil + } + out := new(GpuDriverConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GpuSharing) DeepCopyInto(out *GpuSharing) { *out = *in From 2559d025c0d7e87f825c5f4d47fcac8a543ff3e3 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Mon, 2 Dec 2024 14:05:23 -0800 Subject: [PATCH 2/9] vfio-pci gpu configuration Signed-off-by: Varun Ramachandra Sekar --- cmd/nvidia-dra-plugin/allocatable.go | 12 +- cmd/nvidia-dra-plugin/device_state.go | 131 +++++++++++++----- cmd/nvidia-dra-plugin/deviceinfo.go | 4 + cmd/nvidia-dra-plugin/main.go | 3 + cmd/nvidia-dra-plugin/mutex.go | 43 ++++++ cmd/nvidia-dra-plugin/nvlib.go | 12 +- cmd/nvidia-dra-plugin/prepared.go | 20 +++ cmd/nvidia-dra-plugin/vfio-device.go | 192 ++++++++++++++++++++++++++ scripts/bind_to_driver.sh | 38 +++++ scripts/unbind_from_driver.sh | 53 +++++++ 10 files changed, 470 insertions(+), 38 deletions(-) create mode 100644 cmd/nvidia-dra-plugin/mutex.go create mode 100644 cmd/nvidia-dra-plugin/vfio-device.go create mode 100644 scripts/bind_to_driver.sh create mode 100644 scripts/unbind_from_driver.sh diff --git a/cmd/nvidia-dra-plugin/allocatable.go b/cmd/nvidia-dra-plugin/allocatable.go index 3350716c..b9bdbb46 100644 --- a/cmd/nvidia-dra-plugin/allocatable.go +++ b/cmd/nvidia-dra-plugin/allocatable.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -106,3 +106,13 @@ func (d AllocatableDevices) UUIDs() []string { slices.Sort(uuids) return uuids } + +func (d AllocatableDevices) PciAddresses() []string { + var pciAddresses []string + for _, device := range d { + if device.Type() == GpuDeviceType { + pciAddresses = append(pciAddresses, device.Gpu.PciAddress) + } + } + return pciAddresses +} diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go index 0ecd3423..6f851c49 100644 --- a/cmd/nvidia-dra-plugin/device_state.go +++ b/cmd/nvidia-dra-plugin/device_state.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,17 +38,19 @@ type OpaqueDeviceConfig struct { } type DeviceConfigState struct { - MpsControlDaemonID string `json:"mpsControlDaemonID"` + MpsControlDaemonID string `json:"mpsControlDaemonID"` + GpuConfig *configapi.GpuConfig `json:"deviceConfig,omitempty"` containerEdits *cdiapi.ContainerEdits } type DeviceState struct { sync.Mutex - cdi *CDIHandler - tsManager *TimeSlicingManager - mpsManager *MpsManager - allocatable AllocatableDevices - config *Config + cdi *CDIHandler + tsManager *TimeSlicingManager + mpsManager *MpsManager + vfioPciManager *VfioPciManager + allocatable AllocatableDevices + config *Config nvdevlib *deviceLib checkpointManager checkpointmanager.CheckpointManager @@ -87,6 +89,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { tsManager := NewTimeSlicingManager(nvdevlib) mpsManager := NewMpsManager(config, nvdevlib, MpsRoot, hostDriverRoot, MpsControlDaemonTemplatePath) + vfioPciManager := NewVfioPciManager() + if err := cdi.CreateStandardDeviceSpecFile(allocatable); err != nil { return nil, fmt.Errorf("unable to create base CDI spec file: %v", err) } @@ -100,12 +104,16 @@ 
func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { cdi: cdi, tsManager: tsManager, mpsManager: mpsManager, + vfioPciManager: vfioPciManager, allocatable: allocatable, config: config, nvdevlib: nvdevlib, checkpointManager: checkpointManager, } + // Initialize the vfio-pci driver manager. + vfioPciManager.Init() + checkpoints, err := state.checkpointManager.ListCheckpoints() if err != nil { return nil, fmt.Errorf("unable to list checkpoints: %v", err) @@ -349,35 +357,70 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error { for _, group := range devices { - // Stop any MPS control daemons started for each group of prepared devices. - mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(claimUID, group) - if err := mpsControlDaemon.Stop(ctx); err != nil { - return fmt.Errorf("error stopping MPS control daemon: %w", err) + var err error + if group.ConfigState.GpuConfig != nil { + err = s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus()) } - - // Go back to default time-slicing for all full GPUs. - tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig - if err := s.tsManager.SetTimeSlice(group.Devices.Gpus(), tsc); err != nil { - return fmt.Errorf("error setting timeslice for devices: %w", err) + if err != nil { + return err + } + } + return nil +} +func (s *DeviceState) unprepareGpus(ctx context.Context, config *configapi.GpuConfig, devices PreparedDeviceList) error { + if config.DriverConfig.Driver == configapi.VfioPciDriver { + for _, device := range devices { + if err := s.vfioPciManager.Unconfigure(device.Gpu.Info); err != nil { + return fmt.Errorf("error unconfiguring vfio-pci device: %w", err) + } } } + // Go back to default time-slicing for all full GPUs. 
+ tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig + if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil { + return fmt.Errorf("error setting timeslice for devices: %w", err) + } return nil } func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interface, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) { + var err error + var configState DeviceConfigState + switch castConfig := config.(type) { case *configapi.GpuConfig: - return s.applySharingConfig(ctx, castConfig.Sharing, claim, results) + configState.GpuConfig = castConfig + err = s.applyGpuConfig(ctx, castConfig, claim, results, &configState) case *configapi.MigDeviceConfig: - return s.applySharingConfig(ctx, castConfig.Sharing, claim, results) + err = s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState) case *configapi.ImexChannelConfig: - return s.applyImexChannelConfig(ctx, castConfig, claim, results) + err = s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState) default: - return nil, fmt.Errorf("unknown config type: %T", castConfig) + err = fmt.Errorf("unknown config type: %T", castConfig) + } + if err != nil { + return nil, err + } + return &configState, nil +} + +func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { + if config.Sharing != nil { + err := s.applySharingConfig(ctx, config.Sharing, claim, results, configState) + if err != nil { + return err + } + } + if config.DriverConfig != nil { + err := s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState) + if err != nil { + return err + } } + return nil } -func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) 
(*DeviceConfigState, error) { +func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { // Get the list of claim requests this config is being applied over. var requests []string for _, r := range results { @@ -390,19 +433,16 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S allocatableDevices[r.Device] = s.allocatable[r.Device] } - // Declare a device group state object to populate. - var configState DeviceConfigState - // Apply time-slicing settings (if available). if config.IsTimeSlicing() { tsc, err := config.GetTimeSlicingConfig() if err != nil { - return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) + return fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) } if tsc != nil { err = s.tsManager.SetTimeSlice(allocatableDevices, tsc) if err != nil { - return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) + return fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) } } } @@ -411,36 +451,55 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S if config.IsMps() { mpsc, err := config.GetMpsConfig() if err != nil { - return nil, fmt.Errorf("error getting MPS configuration: %w", err) + return fmt.Errorf("error getting MPS configuration: %w", err) } mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices) if err := mpsControlDaemon.Start(ctx, mpsc); err != nil { - return nil, fmt.Errorf("error starting MPS control daemon: %w", err) + return fmt.Errorf("error starting MPS control daemon: %w", err) } if err := mpsControlDaemon.AssertReady(ctx); err != nil { - return nil, 
fmt.Errorf("MPS control daemon is not yet ready: %w", err) + return fmt.Errorf("MPS control daemon is not yet ready: %w", err) } configState.MpsControlDaemonID = mpsControlDaemon.GetID() configState.containerEdits = mpsControlDaemon.GetCDIContainerEdits() } - return &configState, nil + return nil } -func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) { - // Declare a device group state object to populate. - var configState DeviceConfigState - +func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { // Create any necessary IMEX channels and gather their CDI container edits. for _, r := range results { imexChannel := s.allocatable[r.Device].ImexChannel if err := s.nvdevlib.createImexChannelDevice(imexChannel.Channel); err != nil { - return nil, fmt.Errorf("error creating IMEX channel device: %w", err) + return fmt.Errorf("error creating IMEX channel device: %w", err) } configState.containerEdits = configState.containerEdits.Append(s.cdi.GetImexChannelContainerEdits(imexChannel)) } - return &configState, nil + return nil +} + +func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { + // Get the list of allocatable devices this config is being applied over. + allocatableDevices := make(AllocatableDevices) + for _, r := range results { + allocatableDevices[r.Device] = s.allocatable[r.Device] + } + + if config.Driver == configapi.VfioPciDriver { + // Apply vfio-pci driver settings. 
+ for _, r := range results { + info := allocatableDevices[r.Device] + err := s.vfioPciManager.Configure(info.Gpu) + if err != nil { + return err + } + configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu)) + } + } + + return nil } // GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver. diff --git a/cmd/nvidia-dra-plugin/deviceinfo.go b/cmd/nvidia-dra-plugin/deviceinfo.go index cc899c44..8098cc07 100644 --- a/cmd/nvidia-dra-plugin/deviceinfo.go +++ b/cmd/nvidia-dra-plugin/deviceinfo.go @@ -40,6 +40,7 @@ type GpuInfo struct { driverVersion string cudaDriverVersion string migProfiles []*MigProfileInfo + PciAddress string `json:"pciAddress"` } type MigDeviceInfo struct { @@ -130,6 +131,9 @@ func (d *GpuInfo) GetDevice() resourceapi.Device { "cudaDriverVersion": { VersionValue: ptr.To(semver.MustParse(d.cudaDriverVersion).String()), }, + "pciAddress": { + StringValue: &d.PciAddress, + }, }, Capacity: map[resourceapi.QualifiedName]resource.Quantity{ "memory": *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI), diff --git a/cmd/nvidia-dra-plugin/main.go b/cmd/nvidia-dra-plugin/main.go index ab5bd585..889be930 100644 --- a/cmd/nvidia-dra-plugin/main.go +++ b/cmd/nvidia-dra-plugin/main.go @@ -52,6 +52,9 @@ type Flags struct { hostDriverRoot string nvidiaCTKPath string deviceClasses sets.Set[string] + pciDevicesRoot string + sysModulesRoot string + vfioDevicesRoot string } type Config struct { diff --git a/cmd/nvidia-dra-plugin/mutex.go b/cmd/nvidia-dra-plugin/mutex.go new file mode 100644 index 00000000..e98fed15 --- /dev/null +++ b/cmd/nvidia-dra-plugin/mutex.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "sync" +) + +type PerGPUMutex struct { + sync.Mutex + submutex map[string]*sync.Mutex +} + +var perGpuLock *PerGPUMutex + +func init() { + perGpuLock = &PerGPUMutex{ + submutex: make(map[string]*sync.Mutex), + } +} + +func (pgm *PerGPUMutex) Get(gpu string) *sync.Mutex { + pgm.Mutex.Lock() + defer pgm.Mutex.Unlock() + if pgm.submutex[gpu] == nil { + pgm.submutex[gpu] = &sync.Mutex{} + } + return pgm.submutex[gpu] +} diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go index 421e7a50..45634431 100644 --- a/cmd/nvidia-dra-plugin/nvlib.go +++ b/cmd/nvidia-dra-plugin/nvlib.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -199,6 +199,10 @@ func (l deviceLib) enumerateImexChannels(config *Config) (AllocatableDevices, er return devices, nil } +func getPciAddressFromNvmlPciInfo(info nvml.PciInfo) string { + return fmt.Sprintf("%04x:%02x:%02x.0", info.Domain, info.Bus, info.Device) +} + func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) { minor, ret := device.GetMinorNumber() if ret != nvml.SUCCESS { @@ -240,6 +244,11 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting CUDA driver version: %w", err) } + pciInfo, ret := l.nvmllib.DeviceGetPciInfo(device) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting PCI info for device %d: %w", index, ret) + } + pciAddress := getPciAddressFromNvmlPciInfo(pciInfo) var migProfiles []*MigProfileInfo for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { @@ -307,6 +316,7 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) driverVersion: driverVersion, cudaDriverVersion: fmt.Sprintf("%v.%v", cudaDriverVersion/1000, (cudaDriverVersion%1000)/10), migProfiles: migProfiles, + PciAddress: pciAddress, } return gpuInfo, nil diff --git a/cmd/nvidia-dra-plugin/prepared.go b/cmd/nvidia-dra-plugin/prepared.go index edb369fe..f939e77b 100644 --- a/cmd/nvidia-dra-plugin/prepared.go +++ b/cmd/nvidia-dra-plugin/prepared.go @@ -203,3 +203,23 @@ func (d PreparedDevices) MigDeviceUUIDs() []string { slices.Sort(uuids) return uuids } + +func (l PreparedDeviceList) PciAddresses() []string { + var pciAddresses []string + for _, device := range l.Gpus() { + pciAddresses = append(pciAddresses, device.Gpu.Info.PciAddress) + } + return pciAddresses +} + +func (g *PreparedDeviceGroup) PciAddresses() []string { + return g.Devices.Gpus().PciAddresses() +} + +func (d PreparedDevices) PciAddresses() []string { + var pciAddresses []string + for _, group := range d { + pciAddresses = append(pciAddresses, 
group.PciAddresses()...) + } + return pciAddresses +} diff --git a/cmd/nvidia-dra-plugin/vfio-device.go b/cmd/nvidia-dra-plugin/vfio-device.go new file mode 100644 index 00000000..fb5076dd --- /dev/null +++ b/cmd/nvidia-dra-plugin/vfio-device.go @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "os" + "os/exec" + "path/filepath" + + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" + cdispec "tags.cncf.io/container-device-interface/specs-go" +) + +const ( + vfioPciModule = "vfio_pci" + vfioPciDriver = "vfio-pci" + nvidiaDriver = "nvidia" + unbindFromDriverScript = "/usr/bin/unbind_from_driver.sh" + bindToDriverScript = "/usr/bin/bind_to_driver.sh" + driverResetRetries = "5" +) + +type VfioPciManager struct { + pciDevicesRoot string + vfioDevicesRoot string + sysModulesRoot string + driver string + vfioPciModule string +} + +func NewVfioPciManager() *VfioPciManager { + return &VfioPciManager{ + pciDevicesRoot: "/sys/bus/pci/devices", + vfioDevicesRoot: "/dev/vfio", + sysModulesRoot: "/sys/module", + driver: vfioPciDriver, + vfioPciModule: vfioPciModule, + } +} + +// Init ensures the vfio-pci module is loaded on the host. 
+func (vm *VfioPciManager) Init() error { + if !vm.isVfioPCIModuleLoaded() { + err := vm.loadVfioPciModule() + if err != nil { + return err + } + } + return nil +} + +func (vm *VfioPciManager) isVfioPCIModuleLoaded() bool { + modules, err := os.ReadDir(vm.sysModulesRoot) + if err != nil { + return false + } + + for _, module := range modules { + if module.Name() == vm.vfioPciModule { + return true + } + } + + return false + +} + +func (vm *VfioPciManager) loadVfioPciModule() error { + cmd := exec.Command("modprobe", vm.vfioPciModule) //nolint:gosec + _, err := cmd.CombinedOutput() + if err != nil { + return err + } + + return nil +} + +// Configure binds the GPU to the vfio-pci driver. +func (vm *VfioPciManager) Configure(info *GpuInfo) error { + perGpuLock.Get(info.PciAddress).Lock() + defer perGpuLock.Get(info.PciAddress).Unlock() + + driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress) + if err != nil { + return err + } + if driver == vm.driver { + return nil + } + err = changeDriver(info.PciAddress, vm.driver) + if err != nil { + return err + } + return nil +} + +// Unconfigure binds the GPU to the nvidia driver. 
+func (vm *VfioPciManager) Unconfigure(info *GpuInfo) error { + perGpuLock.Get(info.PciAddress).Lock() + defer perGpuLock.Get(info.PciAddress).Unlock() + + driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress) + if err != nil { + return err + } + if driver == nvidiaDriver { + return nil + } + err = changeDriver(info.PciAddress, nvidiaDriver) + if err != nil { + return err + } + return nil +} + +func getDriver(pciDevicesRoot, pciAddress string) (string, error) { + driverPath, err := os.Readlink(filepath.Join(pciDevicesRoot, pciAddress, "driver")) + if err != nil { + return "", err + } + _, driver := filepath.Split(driverPath) + return driver, nil +} + +func changeDriver(pciAddress, driver string) error { + err := unbindFromDriver(pciAddress, driverResetRetries) + if err != nil { + return err + } + err = bindToDriver(pciAddress, driver) + if err != nil { + return err + } + return nil +} + +func unbindFromDriver(pciAddress, driverResetRetries string) error { + cmd := exec.Command(unbindFromDriverScript, pciAddress, driverResetRetries) //nolint:gosec + _, err := cmd.CombinedOutput() + if err != nil { + return err + } + return nil +} + +func bindToDriver(pciAddress, driver string) error { + cmd := exec.Command(bindToDriverScript, pciAddress, driver) //nolint:gosec + _, err := cmd.CombinedOutput() + if err != nil { + return err + } + return nil +} + +func (vm *VfioPciManager) getIommuGroupForVfioPciDevice(pciAddress string) string { + iommuGroup, err := os.Readlink(filepath.Join(vm.pciDevicesRoot, pciAddress, "iommu_group")) + if err != nil { + return "" + } + _, file := filepath.Split(iommuGroup) + return file + +} + +// GetCDIContainerEdits returns the CDI spec for a container to have access to the GPU while bound on vfio-pci driver. 
+func (vm *VfioPciManager) GetCDIContainerEdits(info *GpuInfo) *cdiapi.ContainerEdits { + iommuGroup := vm.getIommuGroupForVfioPciDevice(info.PciAddress) + vfioDevicePath := filepath.Join(vm.vfioDevicesRoot, iommuGroup) + return &cdiapi.ContainerEdits{ + ContainerEdits: &cdispec.ContainerEdits{ + DeviceNodes: []*cdispec.DeviceNode{ + { + Path: vfioDevicePath, + }, + }, + }, + } +} diff --git a/scripts/bind_to_driver.sh b/scripts/bind_to_driver.sh new file mode 100644 index 00000000..26840cd4 --- /dev/null +++ b/scripts/bind_to_driver.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Usage: ./bind_to_driver.sh +# Bind the GPU specified by the PCI_ID=ssss:bb:dd.f to the given driver. + +bind_to_driver() +{ + local gpu=$1 + local driver=$2 + local drivers_path="/sys/bus/pci/drivers" + local driver_override_file="/sys/bus/pci/devices/$gpu/driver_override" + local bind_file="$drivers_path/$driver/bind" + + if [ ! -e "$driver_override_file" ]; then + echo "'$driver_override_file' file does not exist" >&2 + return 1 + fi + + echo "$driver" > "$driver_override_file" + if [ $? -ne 0 ]; then + echo "failed to write '$driver' to $driver_override_file" >&2 + return 1 + fi + + if [ ! -e "$bind_file" ]; then + echo "'$bind_file' file does not exist" >&2 + return 1 + fi + + echo "$gpu" > "$bind_file" + if [ $? -ne 0 ]; then + echo "failed to write '$gpu' to $bind_file" >&2 + echo "" > "$driver_override_file" + return 1 + fi +} + +bind_to_driver "$1" "$2" || exit 1 \ No newline at end of file diff --git a/scripts/unbind_from_driver.sh b/scripts/unbind_from_driver.sh new file mode 100644 index 00000000..c7653679 --- /dev/null +++ b/scripts/unbind_from_driver.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Usage: ./unbind_from_driver.sh +# Unbind the GPU specified by the PCI_ID=ssss:bb:dd.f from the driver its bound to. +# Attempt to acquire the unbindLock within the retries specified before unbinding the device from its driver. 
+ +acquire_unbind_lock() +{ + local gpu=$1 + local lock_retries=5 + local unbind_lock_file="/proc/driver/nvidia/gpus/$gpu/unbindLock" + local unbind_lock=0 + local attempt=1 + + if [ ! -e "${unbind_lock_file}" ]; then + return 0 + fi + + while [[ $attempt -le ${lock_retries} ]]; do + echo "[retry $attempt/${lock_retries}] Attempting to acquire unbindLock for $gpu" >&1 + + echo 1 > "${unbind_lock_file}" + read -r unbind_lock < "${unbind_lock_file}" + if [ ${unbind_lock} -eq 1 ]; then + echo "UnbindLock acquired for $gpu" >&1 + return 0 + fi + + sleep $attempt + attempt=$((attempt + 1)) + done + + echo "cannot obtain unbindLock for $gpu" >&2 + return 1 +} + +unbind_from_driver() +{ + local gpu=$1 + local existing_driver + local existing_driver_name + + [ -e "/sys/bus/pci/devices/$gpu/driver" ] || return 0 + existing_driver=$(readlink -f "/sys/bus/pci/devices/$gpu/driver") + existing_driver_name=$(basename "${existing_driver}") + if [ "${existing_driver_name}" == "nvidia" ]; then + acquire_unbind_lock "$gpu" || return 1 + fi + echo "$gpu" > "${existing_driver}/unbind" + return 0 +} + +unbind_from_driver "$1" || exit 1 \ No newline at end of file From f92e72fff3545c4dfe36013f6621aa68d898de4c Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Mon, 2 Dec 2024 13:00:27 -0800 Subject: [PATCH 3/9] Dockerfile changes for vfio-pci device config Signed-off-by: Varun Ramachandra Sekar --- deployments/container/Dockerfile.ubi8 | 7 ++++--- deployments/container/Dockerfile.ubuntu | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/deployments/container/Dockerfile.ubi8 b/deployments/container/Dockerfile.ubi8 index c365f123..ab52a352 100644 --- a/deployments/container/Dockerfile.ubi8 +++ b/deployments/container/Dockerfile.ubi8 @@ -59,9 +59,10 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE -COPY --from=build
/artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller -COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin -COPY --from=build /build/templates /templates +COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin +COPY --from=build /build/templates /templates +COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh +COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES diff --git a/deployments/container/Dockerfile.ubuntu b/deployments/container/Dockerfile.ubuntu index ea5d58a3..b815bcd1 100644 --- a/deployments/container/Dockerfile.ubuntu +++ b/deployments/container/Dockerfile.ubuntu @@ -64,9 +64,10 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE -COPY --from=build /artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller -COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin -COPY --from=build /build/templates /templates +COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin +COPY --from=build /build/templates /templates +COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh +COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES From 229cc20e00d06185cb6ecf900fcde1185c73a9d1 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 24 Sep 2024 18:42:22 -0700 Subject: [PATCH 4/9] vfio-pci gpu deviceclass Signed-off-by: Varun Ramachandra Sekar --- demo/clusters/kind/install-dra-driver.sh | 2 +- .../templates/deviceclass-vfiopci.yaml | 19 +++++++++++++++++++ deployments/helm/k8s-dra-driver/values.yaml | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) create mode 
100644 deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh index ece8cdf1..b48aafd6 100755 --- a/demo/clusters/kind/install-dra-driver.sh +++ b/demo/clusters/kind/install-dra-driver.sh @@ -24,7 +24,7 @@ source "${CURRENT_DIR}/scripts/common.sh" kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/gpu.present=true -deviceClasses=${1:-"gpu,mig,imex"} +deviceClasses=${1:-"gpu,mig,imex,vfiopci"} helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ --set deviceClasses="{${deviceClasses}}" \ ${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \ diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml new file mode 100644 index 00000000..9fb1cb31 --- /dev/null +++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml @@ -0,0 +1,19 @@ +{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: DeviceClass +metadata: + name: vfiopci.nvidia.com +spec: + config: + - opaque: + driver: gpu.nvidia.com + parameters: + apiVersion: gpu.nvidia.com/v1alpha1 + kind: GpuConfig + driverConfig: + driver: vfio-pci + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml index 76ff38ca..986e7ad5 100644 --- a/deployments/helm/k8s-dra-driver/values.yaml +++ b/deployments/helm/k8s-dra-driver/values.yaml @@ -34,7 +34,7 @@ selectorLabelsOverride: {} allowDefaultNamespace: false -deviceClasses: ["gpu", "mig", "imex"] +deviceClasses: ["gpu", "mig", "imex", "vfiopci"] # Masking of the params file is typically done to allow nvkind to # 
selectively exclude certain GPUs from being visible to the From 2bda933d90a1a40bd635f8f058467eaac4469527 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 15 Oct 2024 09:18:24 -0700 Subject: [PATCH 5/9] kind cluster changes for vfio-pci device config support --- demo/clusters/kind/scripts/kind-cluster-config.yaml | 2 ++ .../helm/k8s-dra-driver/templates/kubeletplugin.yaml | 12 ++++++++++++ deployments/helm/k8s-dra-driver/values.yaml | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/demo/clusters/kind/scripts/kind-cluster-config.yaml b/demo/clusters/kind/scripts/kind-cluster-config.yaml index f1a34a1c..890b0b71 100644 --- a/demo/clusters/kind/scripts/kind-cluster-config.yaml +++ b/demo/clusters/kind/scripts/kind-cluster-config.yaml @@ -66,3 +66,5 @@ nodes: # on the kind nodes. - hostPath: /usr/bin/nvidia-ctk containerPath: /usr/bin/nvidia-ctk + - hostPath: /sys + containerPath: /sys \ No newline at end of file diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml index 0b9b09b0..e79e70b1 100644 --- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml +++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml @@ -103,6 +103,12 @@ spec: - name: driver-root mountPath: /driver-root readOnly: true + - name: sysfs + mountPath: /sys + readOnly: false + - name: dev-vfio + mountPath: /dev/vfio + readOnly: false volumes: - name: plugins-registry hostPath: @@ -116,6 +122,12 @@ spec: - name: driver-root hostPath: path: {{ .Values.nvidiaDriverRoot }} + - name: sysfs + hostPath: + path: /sys + - name: dev-vfio + hostPath: + path: /dev/vfio {{- with .Values.kubeletPlugin.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml index 986e7ad5..a5c7ee91 100644 --- a/deployments/helm/k8s-dra-driver/values.yaml +++ b/deployments/helm/k8s-dra-driver/values.yaml @@ -96,6 +96,10 @@ kubeletPlugin: plugin: securityContext: privileged: true + allowPrivilegeEscalation: true + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 resources: {} affinity: nodeAffinity: From 5ce0763f0d7e20e5d2a0b1e760157ee49183db85 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Thu, 17 Oct 2024 02:09:49 -0700 Subject: [PATCH 6/9] vfio-pci gpu claim example --- demo/specs/quickstart/gpu-test-vfiopci.yaml | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 demo/specs/quickstart/gpu-test-vfiopci.yaml diff --git a/demo/specs/quickstart/gpu-test-vfiopci.yaml b/demo/specs/quickstart/gpu-test-vfiopci.yaml new file mode 100644 index 00000000..75f6cfbe --- /dev/null +++ b/demo/specs/quickstart/gpu-test-vfiopci.yaml @@ -0,0 +1,41 @@ +# One pod, one container asking for 1 distinct GPU + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gpu-test-vfiopci + +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test-vfiopci + name: single-gpu +spec: + spec: + devices: + requests: + - name: gpu + deviceClassName: vfiopci.nvidia.com + +--- +apiVersion: v1 +kind: Pod +metadata: + namespace: gpu-test-vfiopci + name: pod1 + labels: + app: pod +spec: + containers: + - name: ctr + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["sleep 9999 & wait"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimTemplateName: single-gpu From f1527355d8927ba91daaadd6c7632df23bd5f47e Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Wed, 4 Dec 2024 16:33:40 -0800 Subject: [PATCH 7/9] address review comments Signed-off-by: Varun Ramachandra Sekar --- .../resource/gpu/v1alpha1/driverconfig.go | 2 +- 
.../resource/gpu/v1alpha1/gpuconfig.go | 33 +++--- cmd/nvidia-dra-plugin/allocatable.go | 1 + cmd/nvidia-dra-plugin/device_state.go | 107 +++++++++--------- cmd/nvidia-dra-plugin/deviceinfo.go | 2 +- cmd/nvidia-dra-plugin/main.go | 3 - cmd/nvidia-dra-plugin/nvlib.go | 12 +- .../k8s-dra-driver/templates/_helpers.tpl | 18 +++ .../templates/kubeletplugin.yaml | 18 +-- .../k8s-dra-driver/templates/validation.yaml | 2 +- 10 files changed, 100 insertions(+), 98 deletions(-) diff --git a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go index 27349726..2dd59390 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go @@ -54,7 +54,7 @@ func (c *GpuDriverConfig) Validate() error { case VfioPciDriver: break default: - return fmt.Errorf("invalid driver specified in gpu driver configuration") + return fmt.Errorf("invalid driver '%s' specified in gpu driver configuration", c.Driver) } return nil } diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go index bf6b5765..d61db48c 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go @@ -56,13 +56,14 @@ func DefaultGpuConfig() *GpuConfig { func (c *GpuConfig) Normalize() error { if c.DriverConfig == nil { c.DriverConfig = DefaultGpuDriverConfig() - } else { - if err := c.DriverConfig.Normalize(); err != nil { - return err - } } - // If driver is not Nvidia, don't proceed with normalizing sharing configuration. - if c.DriverConfig.Driver != NvidiaDriver { + + if err := c.DriverConfig.Normalize(); err != nil { + return err + } + + // If sharing is not supported, don't proceed with normalizing its configuration. + if !c.SupportsSharing() { return nil } @@ -84,21 +85,23 @@ func (c *GpuConfig) Normalize() error { // Validate ensures that GpuConfig has a valid set of values. 
func (c *GpuConfig) Validate() error { - if c.DriverConfig.Driver == NvidiaDriver { + if err := c.DriverConfig.Validate(); err != nil { + return err + } + + if c.SupportsSharing() { if c.Sharing == nil { return fmt.Errorf("no sharing strategy set") } if err := c.Sharing.Validate(); err != nil { return err } - } else { - if c.Sharing != nil { - return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver") - } - } - if err := c.DriverConfig.Validate(); err != nil { - return err + } else if c.Sharing != nil { + return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver") } - return nil } + +func (c *GpuConfig) SupportsSharing() bool { + return c.DriverConfig.Driver == NvidiaDriver +} diff --git a/cmd/nvidia-dra-plugin/allocatable.go b/cmd/nvidia-dra-plugin/allocatable.go index b9bdbb46..68a54241 100644 --- a/cmd/nvidia-dra-plugin/allocatable.go +++ b/cmd/nvidia-dra-plugin/allocatable.go @@ -114,5 +114,6 @@ func (d AllocatableDevices) PciAddresses() []string { pciAddresses = append(pciAddresses, device.Gpu.PciAddress) } } + slices.Sort(pciAddresses) return pciAddresses } diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go index 6f851c49..9e7407cb 100644 --- a/cmd/nvidia-dra-plugin/device_state.go +++ b/cmd/nvidia-dra-plugin/device_state.go @@ -39,7 +39,7 @@ type OpaqueDeviceConfig struct { type DeviceConfigState struct { MpsControlDaemonID string `json:"mpsControlDaemonID"` - GpuConfig *configapi.GpuConfig `json:"deviceConfig,omitempty"` + GpuConfig *configapi.GpuConfig `json:"gpuConfig,omitempty"` containerEdits *cdiapi.ContainerEdits } @@ -112,7 +112,9 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { } // Initialize the vfio-pci driver manager. 
- vfioPciManager.Init() + if err := vfioPciManager.Init(); err != nil { + return nil, fmt.Errorf("unable to initialize vfio-pci manager: %v", err) + } checkpoints, err := state.checkpointManager.ListCheckpoints() if err != nil { @@ -357,12 +359,21 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error { for _, group := range devices { - var err error if group.ConfigState.GpuConfig != nil { - err = s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus()) + err := s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus()) + if err != nil { + return err + } } - if err != nil { - return err + // Stop any MPS control daemons started for each group of prepared devices. + mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(claimUID, group) + if err := mpsControlDaemon.Stop(ctx); err != nil { + return fmt.Errorf("error stopping MPS control daemon: %w", err) + } + // Go back to default time-slicing for all full GPUs. + tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig + if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil { + return fmt.Errorf("error setting timeslice for devices: %w", err) } } return nil @@ -375,52 +386,40 @@ func (s *DeviceState) unprepareGpus(ctx context.Context, config *configapi.GpuCo } } } - // Go back to default time-slicing for all full GPUs. 
- tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig - if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil { - return fmt.Errorf("error setting timeslice for devices: %w", err) - } return nil } func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interface, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) { - var err error var configState DeviceConfigState - switch castConfig := config.(type) { case *configapi.GpuConfig: configState.GpuConfig = castConfig - err = s.applyGpuConfig(ctx, castConfig, claim, results, &configState) + return s.applyGpuConfig(ctx, castConfig, claim, results, &configState) case *configapi.MigDeviceConfig: - err = s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState) + return s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState) case *configapi.ImexChannelConfig: - err = s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState) + return s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState) default: - err = fmt.Errorf("unknown config type: %T", castConfig) - } - if err != nil { - return nil, err + return nil, fmt.Errorf("unknown config type: %T", castConfig) } - return &configState, nil } -func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { - if config.Sharing != nil { - err := s.applySharingConfig(ctx, config.Sharing, claim, results, configState) - if err != nil { - return err - } +func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { + var err error + configState, err = s.applyGpuDriverConfig(ctx, config.DriverConfig, 
results, configState) + if err != nil { + return nil, err } - if config.DriverConfig != nil { - err := s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState) + if config.SupportsSharing() { + configState, err = s.applySharingConfig(ctx, config.Sharing, claim, results, configState) if err != nil { - return err + return nil, err } } - return nil + return configState, nil } -func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { +func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { // Get the list of claim requests this config is being applied over. var requests []string for _, r := range results { @@ -437,12 +436,12 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S if config.IsTimeSlicing() { tsc, err := config.GetTimeSlicingConfig() if err != nil { - return fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) + return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) } if tsc != nil { err = s.tsManager.SetTimeSlice(allocatableDevices, tsc) if err != nil { - return fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) + return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err) } } } @@ -451,55 +450,51 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S if config.IsMps() { mpsc, err := config.GetMpsConfig() if err != nil { - return fmt.Errorf("error getting MPS configuration: %w", err) + return nil, fmt.Errorf("error 
getting MPS configuration: %w", err) } mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices) if err := mpsControlDaemon.Start(ctx, mpsc); err != nil { - return fmt.Errorf("error starting MPS control daemon: %w", err) + return nil, fmt.Errorf("error starting MPS control daemon: %w", err) } if err := mpsControlDaemon.AssertReady(ctx); err != nil { - return fmt.Errorf("MPS control daemon is not yet ready: %w", err) + return nil, fmt.Errorf("MPS control daemon is not yet ready: %w", err) } configState.MpsControlDaemonID = mpsControlDaemon.GetID() configState.containerEdits = mpsControlDaemon.GetCDIContainerEdits() } - return nil + return configState, nil } -func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { +func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { // Create any necessary IMEX channels and gather their CDI container edits. for _, r := range results { imexChannel := s.allocatable[r.Device].ImexChannel if err := s.nvdevlib.createImexChannelDevice(imexChannel.Channel); err != nil { - return fmt.Errorf("error creating IMEX channel device: %w", err) + return nil, fmt.Errorf("error creating IMEX channel device: %w", err) } configState.containerEdits = configState.containerEdits.Append(s.cdi.GetImexChannelContainerEdits(imexChannel)) } - return nil + return configState, nil } -func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error { - // Get the list of allocatable devices this config is being applied over. 
- allocatableDevices := make(AllocatableDevices) - for _, r := range results { - allocatableDevices[r.Device] = s.allocatable[r.Device] +func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { + if config.Driver != configapi.VfioPciDriver { + return configState, nil } - if config.Driver == configapi.VfioPciDriver { - // Apply vfio-pci driver settings. - for _, r := range results { - info := allocatableDevices[r.Device] - err := s.vfioPciManager.Configure(info.Gpu) - if err != nil { - return err - } - configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu)) + // Apply vfio-pci driver settings. + for _, r := range results { + info := s.allocatable[r.Device] + err := s.vfioPciManager.Configure(info.Gpu) + if err != nil { + return nil, err } + configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu)) } - return nil + return configState, nil } // GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver. 
diff --git a/cmd/nvidia-dra-plugin/deviceinfo.go b/cmd/nvidia-dra-plugin/deviceinfo.go index 8098cc07..194742ea 100644 --- a/cmd/nvidia-dra-plugin/deviceinfo.go +++ b/cmd/nvidia-dra-plugin/deviceinfo.go @@ -29,6 +29,7 @@ import ( type GpuInfo struct { UUID string `json:"uuid"` + PciAddress string `json:"pciAddress"` index int minor int migEnabled bool @@ -40,7 +41,6 @@ type GpuInfo struct { driverVersion string cudaDriverVersion string migProfiles []*MigProfileInfo - PciAddress string `json:"pciAddress"` } type MigDeviceInfo struct { diff --git a/cmd/nvidia-dra-plugin/main.go b/cmd/nvidia-dra-plugin/main.go index 889be930..ab5bd585 100644 --- a/cmd/nvidia-dra-plugin/main.go +++ b/cmd/nvidia-dra-plugin/main.go @@ -52,9 +52,6 @@ type Flags struct { hostDriverRoot string nvidiaCTKPath string deviceClasses sets.Set[string] - pciDevicesRoot string - sysModulesRoot string - vfioDevicesRoot string } type Config struct { diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go index 45634431..37639e92 100644 --- a/cmd/nvidia-dra-plugin/nvlib.go +++ b/cmd/nvidia-dra-plugin/nvlib.go @@ -199,10 +199,6 @@ func (l deviceLib) enumerateImexChannels(config *Config) (AllocatableDevices, er return devices, nil } -func getPciAddressFromNvmlPciInfo(info nvml.PciInfo) string { - return fmt.Sprintf("%04x:%02x:%02x.0", info.Domain, info.Bus, info.Device) -} - func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) { minor, ret := device.GetMinorNumber() if ret != nvml.SUCCESS { @@ -244,12 +240,10 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting CUDA driver version: %w", err) } - pciInfo, ret := l.nvmllib.DeviceGetPciInfo(device) - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting PCI info for device %d: %w", index, err) + pciAddress, err := device.GetPCIBusID() + if err != nil { + return nil, err } - pciAddress := 
getPciAddressFromNvmlPciInfo(pciInfo) - var migProfiles []*MigProfileInfo for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i) diff --git a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl index 7cf4ea01..8fff1579 100644 --- a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl +++ b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl @@ -127,3 +127,21 @@ Filter a list by a set of valid values {{- end }} {{- $result -}} {{- end -}} + +{{- define "k8s-dra-driver.vfiopciDeviceClassVolumes" -}} +- name: sysfs + hostPath: + path: /sys +- name: dev-vfio + hostPath: + path: /dev/vfio +{{- end -}} + +{{- define "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" -}} +- name: sysfs + mountPath: /sys + readOnly: false +- name: dev-vfio + mountPath: /dev/vfio + readOnly: false +{{- end -}} \ No newline at end of file diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml index e79e70b1..161d78a0 100644 --- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml +++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml @@ -103,12 +103,9 @@ spec: - name: driver-root mountPath: /driver-root readOnly: true - - name: sysfs - mountPath: /sys - readOnly: false - - name: dev-vfio - mountPath: /dev/vfio - readOnly: false + {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} + {{- include "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" . | nindent 8 }} + {{- end }} volumes: - name: plugins-registry hostPath: @@ -122,12 +119,9 @@ spec: - name: driver-root hostPath: path: {{ .Values.nvidiaDriverRoot }} - - name: sysfs - hostPath: - path: /sys - - name: dev-vfio - hostPath: - path: /dev/vfio + {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} + {{- include "k8s-dra-driver.vfiopciDeviceClassVolumes" . 
| nindent 6}} + {{- end }} {{- with .Values.kubeletPlugin.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/deployments/helm/k8s-dra-driver/templates/validation.yaml b/deployments/helm/k8s-dra-driver/templates/validation.yaml index ce2dbe68..f93e7dd3 100644 --- a/deployments/helm/k8s-dra-driver/templates/validation.yaml +++ b/deployments/helm/k8s-dra-driver/templates/validation.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -{{- $validDeviceClasses := list "gpu" "mig" "imex" }} +{{- $validDeviceClasses := list "gpu" "mig" "imex" "vfiopci" }} {{- if not (kindIs "slice" .Values.deviceClasses) }} {{- $error := "" }} From 69cff40f1dfebdc9f9b225bf18bbc02698e0ec6e Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Thu, 5 Dec 2024 11:22:13 -0800 Subject: [PATCH 8/9] use nsenter to run kernel tasks Signed-off-by: Varun Ramachandra Sekar --- cmd/nvidia-dra-plugin/vfio-device.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cmd/nvidia-dra-plugin/vfio-device.go b/cmd/nvidia-dra-plugin/vfio-device.go index fb5076dd..d4c8fc8e 100644 --- a/cmd/nvidia-dra-plugin/vfio-device.go +++ b/cmd/nvidia-dra-plugin/vfio-device.go @@ -17,6 +17,7 @@ package main import ( + "fmt" "os" "os/exec" "path/filepath" @@ -26,6 +27,7 @@ import ( ) const ( + hostNamespaceMount = "/proc/1/ns/mnt" vfioPciModule = "vfio_pci" vfioPciDriver = "vfio-pci" nvidiaDriver = "nvidia" @@ -80,8 +82,7 @@ func (vm *VfioPciManager) isVfioPCIModuleLoaded() bool { } func (vm *VfioPciManager) loadVfioPciModule() error { - cmd := exec.Command("modprobe", vm.vfioPciModule) //nolint:gosec - _, err := cmd.CombinedOutput() + _, err := execCommandInHostNamespace("modprobe", []string{vm.vfioPciModule}) //nolint:gosec if err != nil { return err } @@ -149,8 +150,7 @@ func changeDriver(pciAddress, driver string) error { } func unbindFromDriver(pciAddress, driverResetRetries string) 
error { - cmd := exec.Command(unbindFromDriverScript, pciAddress, driverResetRetries) //nolint:gosec - _, err := cmd.CombinedOutput() + _, err := execCommandInHostNamespace(unbindFromDriverScript, []string{pciAddress, driverResetRetries}) //nolint:gosec if err != nil { return err } @@ -158,8 +158,7 @@ func unbindFromDriver(pciAddress, driverResetRetries string) error { } func bindToDriver(pciAddress, driver string) error { - cmd := exec.Command(bindToDriverScript, pciAddress, driver) //nolint:gosec - _, err := cmd.CombinedOutput() + _, err := execCommandInHostNamespace(bindToDriverScript, []string{pciAddress, driver}) //nolint:gosec if err != nil { return err } @@ -190,3 +189,9 @@ func (vm *VfioPciManager) GetCDIContainerEdits(info *GpuInfo) *cdiapi.ContainerE }, } } + +func execCommandInHostNamespace(cmd string, args []string) ([]byte, error) { + nsenterArgs := []string{fmt.Sprintf("--mount=%s", hostNamespaceMount), "--", cmd} + nsenterArgs = append(nsenterArgs, args...) + return exec.Command("nsenter", nsenterArgs...).CombinedOutput() +} From aa3c2bffb542d32354dd4779f697bcc8beca09e8 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Thu, 5 Dec 2024 11:25:54 -0800 Subject: [PATCH 9/9] add missing dockerfile dependencies Signed-off-by: Varun Ramachandra Sekar --- deployments/container/Dockerfile.ubi8 | 4 +++- deployments/container/Dockerfile.ubuntu | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deployments/container/Dockerfile.ubi8 b/deployments/container/Dockerfile.ubi8 index ab52a352..ab3b2715 100644 --- a/deployments/container/Dockerfile.ubi8 +++ b/deployments/container/Dockerfile.ubi8 @@ -19,7 +19,7 @@ ARG BASE_DIST=ubi8 FROM --platform=${TARGETARCH} nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build RUN yum install -y \ - wget make git gcc \ + wget make git gcc kmod \ && \ rm -rf /var/cache/yum/* @@ -63,6 +63,8 @@ COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugi COPY --from=build /build/templates 
/templates COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh +COPY --from=build /usr/bin/nsenter /usr/bin/nsenter +COPY --from=build /usr/sbin/modprobe /usr/sbin/modprobe # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES diff --git a/deployments/container/Dockerfile.ubuntu b/deployments/container/Dockerfile.ubuntu index b815bcd1..3559e5c0 100644 --- a/deployments/container/Dockerfile.ubuntu +++ b/deployments/container/Dockerfile.ubuntu @@ -19,7 +19,7 @@ ARG BASE_DIST=ubuntu20.04 FROM --platform=${BUILDOS}/amd64 nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build RUN apt-get update && \ - apt-get install -y wget make git gcc-aarch64-linux-gnu gcc \ + apt-get install -y wget make git gcc-aarch64-linux-gnu gcc kmod \ && \ rm -rf /var/lib/apt/lists/* @@ -68,6 +68,8 @@ COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugi COPY --from=build /build/templates /templates COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh +COPY --from=build /usr/bin/nsenter /usr/bin/nsenter +COPY --from=build /usr/sbin/modprobe /usr/sbin/modprobe # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES