From d64afb1f31d2b17375691c09abcbb3a0ca420d70 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Mon, 2 Dec 2024 12:58:42 -0800
Subject: [PATCH 1/9] vfio-pci device config API

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 .../resource/gpu/v1alpha1/driverconfig.go     | 60 +++++++++++++++++++
 .../resource/gpu/v1alpha1/gpuconfig.go        | 37 ++++++++++--
 .../gpu/v1alpha1/zz_generated.deepcopy.go     | 21 +++++++
 3 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go

diff --git a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go
new file mode 100644
index 00000000..27349726
--- /dev/null
+++ b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package v1alpha1
+
+import "fmt"
+
+// GpuDriver encodes the gpu driver as a string.
+type GpuDriver string
+
+const (
+	NvidiaDriver  GpuDriver = "nvidia"
+	VfioPciDriver GpuDriver = "vfio-pci"
+)
+
+// GpuDriverConfig holds the set of parameters for configuring a GPU with a driver.
+type GpuDriverConfig struct {
+	Driver GpuDriver `json:"driver"`
+}
+
+// DefaultGpuDriverConfig provides the default configuration of a GPU with a driver.
+func DefaultGpuDriverConfig() *GpuDriverConfig {
+	return &GpuDriverConfig{
+		Driver: NvidiaDriver,
+	}
+}
+
+// Normalize updates a GpuDriverConfig config with implied default values based on other settings.
+func (c *GpuDriverConfig) Normalize() error {
+	if c.Driver == "" {
+		c.Driver = NvidiaDriver
+	}
+	return nil
+}
+
+// Validate returns an error unless Driver is one of NvidiaDriver or VfioPciDriver.
+func (c *GpuDriverConfig) Validate() error {
+	switch c.Driver {
+	case NvidiaDriver, VfioPciDriver:
+		// Both supported drivers need no further checks.
+		return nil
+	default:
+		// Name the rejected value and the accepted ones so the error is
+		// actionable; %q also makes an empty driver string visible.
+		return fmt.Errorf("invalid driver %q specified in gpu driver configuration (supported: %q, %q)", c.Driver, NvidiaDriver, VfioPciDriver)
+	}
+}
diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
index d14699fd..bf6b5765 100644
--- a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
+++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
@@ -29,7 +29,8 @@ import (
 // GpuConfig holds the set of parameters for configuring a GPU.
 type GpuConfig struct {
 	metav1.TypeMeta `json:",inline"`
-	Sharing         *GpuSharing `json:"sharing,omitempty"`
+	Sharing         *GpuSharing      `json:"sharing,omitempty"`
+	DriverConfig    *GpuDriverConfig `json:"driverConfig,omitempty"`
 }
 
 // DefaultGpuConfig provides the default GPU configuration.
@@ -45,11 +46,26 @@ func DefaultGpuConfig() *GpuConfig {
 				Interval: ptr.To(DefaultTimeSlice),
 			},
 		},
+		DriverConfig: &GpuDriverConfig{
+			Driver: NvidiaDriver,
+		},
 	}
 }
 
 // Normalize updates a GpuConfig config with implied default values based on other settings.
 func (c *GpuConfig) Normalize() error {
+	if c.DriverConfig == nil {
+		c.DriverConfig = DefaultGpuDriverConfig()
+	} else {
+		if err := c.DriverConfig.Normalize(); err != nil {
+			return err
+		}
+	}
+	// If driver is not Nvidia, don't proceed with normalizing sharing configuration.
+	if c.DriverConfig.Driver != NvidiaDriver {
+		return nil
+	}
+
 	if c.Sharing == nil {
 		c.Sharing = &GpuSharing{
 			Strategy: TimeSlicingStrategy,
@@ -68,8 +84,21 @@ func (c *GpuConfig) Normalize() error {
 
 // Validate ensures that GpuConfig has a valid set of values.
 func (c *GpuConfig) Validate() error {
+	// A nil DriverConfig means Normalize was skipped; report it instead of panicking.
+	if c.DriverConfig == nil {
+		return fmt.Errorf("no driver configuration set")
+	}
+	if err := c.DriverConfig.Validate(); err != nil {
+		return err
+	}
+	if c.DriverConfig.Driver != NvidiaDriver {
+		if c.Sharing != nil {
+			return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver")
+		}
+		return nil
+	}
 	if c.Sharing == nil {
 		return fmt.Errorf("no sharing strategy set")
 	}
 	return c.Sharing.Validate()
 }
diff --git a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go
index 86a9f407..7cf34e00 100644
--- a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go
+++ b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go
@@ -1,4 +1,5 @@
 //go:build !ignore_autogenerated
+// +build !ignore_autogenerated
 
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
@@ -33,6 +34,11 @@ func (in *GpuConfig) DeepCopyInto(out *GpuConfig) {
 		*out = new(GpuSharing)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.DriverConfig != nil {
+		in, out := &in.DriverConfig, &out.DriverConfig
+		*out = new(GpuDriverConfig)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuConfig.
@@ -53,6 +59,21 @@ func (in *GpuConfig) DeepCopyObject() runtime.Object {
 	return nil
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GpuDriverConfig) DeepCopyInto(out *GpuDriverConfig) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuDriverConfig.
+func (in *GpuDriverConfig) DeepCopy() *GpuDriverConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(GpuDriverConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *GpuSharing) DeepCopyInto(out *GpuSharing) {
 	*out = *in

From 2559d025c0d7e87f825c5f4d47fcac8a543ff3e3 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Mon, 2 Dec 2024 14:05:23 -0800
Subject: [PATCH 2/9] vfio-pci gpu configuration

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 cmd/nvidia-dra-plugin/allocatable.go  |  12 +-
 cmd/nvidia-dra-plugin/device_state.go | 131 +++++++++++++-----
 cmd/nvidia-dra-plugin/deviceinfo.go   |   4 +
 cmd/nvidia-dra-plugin/main.go         |   3 +
 cmd/nvidia-dra-plugin/mutex.go        |  43 ++++++
 cmd/nvidia-dra-plugin/nvlib.go        |  12 +-
 cmd/nvidia-dra-plugin/prepared.go     |  20 +++
 cmd/nvidia-dra-plugin/vfio-device.go  | 192 ++++++++++++++++++++++++++
 scripts/bind_to_driver.sh             |  38 +++++
 scripts/unbind_from_driver.sh         |  53 +++++++
 10 files changed, 470 insertions(+), 38 deletions(-)
 create mode 100644 cmd/nvidia-dra-plugin/mutex.go
 create mode 100644 cmd/nvidia-dra-plugin/vfio-device.go
 create mode 100644 scripts/bind_to_driver.sh
 create mode 100644 scripts/unbind_from_driver.sh

diff --git a/cmd/nvidia-dra-plugin/allocatable.go b/cmd/nvidia-dra-plugin/allocatable.go
index 3350716c..b9bdbb46 100644
--- a/cmd/nvidia-dra-plugin/allocatable.go
+++ b/cmd/nvidia-dra-plugin/allocatable.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,3 +106,13 @@ func (d AllocatableDevices) UUIDs() []string {
 	slices.Sort(uuids)
 	return uuids
 }
+
+func (d AllocatableDevices) PciAddresses() []string {
+	var pciAddresses []string
+	for _, device := range d {
+		if device.Type() == GpuDeviceType {
+			pciAddresses = append(pciAddresses, device.Gpu.PciAddress)
+		}
+	}
+	return pciAddresses
+}
diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go
index 0ecd3423..6f851c49 100644
--- a/cmd/nvidia-dra-plugin/device_state.go
+++ b/cmd/nvidia-dra-plugin/device_state.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,17 +38,19 @@ type OpaqueDeviceConfig struct {
 }
 
 type DeviceConfigState struct {
-	MpsControlDaemonID string `json:"mpsControlDaemonID"`
+	MpsControlDaemonID string               `json:"mpsControlDaemonID"`
+	GpuConfig          *configapi.GpuConfig `json:"deviceConfig,omitempty"`
 	containerEdits     *cdiapi.ContainerEdits
 }
 
 type DeviceState struct {
 	sync.Mutex
-	cdi         *CDIHandler
-	tsManager   *TimeSlicingManager
-	mpsManager  *MpsManager
-	allocatable AllocatableDevices
-	config      *Config
+	cdi            *CDIHandler
+	tsManager      *TimeSlicingManager
+	mpsManager     *MpsManager
+	vfioPciManager *VfioPciManager
+	allocatable    AllocatableDevices
+	config         *Config
 
 	nvdevlib          *deviceLib
 	checkpointManager checkpointmanager.CheckpointManager
@@ -87,6 +89,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
 	tsManager := NewTimeSlicingManager(nvdevlib)
 	mpsManager := NewMpsManager(config, nvdevlib, MpsRoot, hostDriverRoot, MpsControlDaemonTemplatePath)
 
+	vfioPciManager := NewVfioPciManager()
+
 	if err := cdi.CreateStandardDeviceSpecFile(allocatable); err != nil {
 		return nil, fmt.Errorf("unable to create base CDI spec file: %v", err)
 	}
@@ -100,12 +104,16 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
 		cdi:               cdi,
 		tsManager:         tsManager,
 		mpsManager:        mpsManager,
+		vfioPciManager:    vfioPciManager,
 		allocatable:       allocatable,
 		config:            config,
 		nvdevlib:          nvdevlib,
 		checkpointManager: checkpointManager,
 	}
 
+	// Initialize the vfio-pci driver manager.
+	vfioPciManager.Init()
+
 	checkpoints, err := state.checkpointManager.ListCheckpoints()
 	if err != nil {
 		return nil, fmt.Errorf("unable to list checkpoints: %v", err)
@@ -349,35 +357,70 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
 
 func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error {
 	for _, group := range devices {
-		// Stop any MPS control daemons started for each group of prepared devices.
-		mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(claimUID, group)
-		if err := mpsControlDaemon.Stop(ctx); err != nil {
-			return fmt.Errorf("error stopping MPS control daemon: %w", err)
+		var err error
+		if group.ConfigState.GpuConfig != nil {
+			err = s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus())
 		}
-
-		// Go back to default time-slicing for all full GPUs.
-		tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
-		if err := s.tsManager.SetTimeSlice(group.Devices.Gpus(), tsc); err != nil {
-			return fmt.Errorf("error setting timeslice for devices: %w", err)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+func (s *DeviceState) unprepareGpus(ctx context.Context, config *configapi.GpuConfig, devices PreparedDeviceList) error {
+	if config.DriverConfig.Driver == configapi.VfioPciDriver {
+		for _, device := range devices {
+			if err := s.vfioPciManager.Unconfigure(device.Gpu.Info); err != nil {
+				return fmt.Errorf("error unconfiguring vfio-pci device: %w", err)
+			}
 		}
 	}
+	// Go back to default time-slicing for all full GPUs.
+	tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
+	if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil {
+		return fmt.Errorf("error setting timeslice for devices: %w", err)
+	}
 	return nil
 }
 
 func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interface, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
+	var err error
+	var configState DeviceConfigState
+
 	switch castConfig := config.(type) {
 	case *configapi.GpuConfig:
-		return s.applySharingConfig(ctx, castConfig.Sharing, claim, results)
+		configState.GpuConfig = castConfig
+		err = s.applyGpuConfig(ctx, castConfig, claim, results, &configState)
 	case *configapi.MigDeviceConfig:
-		return s.applySharingConfig(ctx, castConfig.Sharing, claim, results)
+		err = s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState)
 	case *configapi.ImexChannelConfig:
-		return s.applyImexChannelConfig(ctx, castConfig, claim, results)
+		err = s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState)
 	default:
-		return nil, fmt.Errorf("unknown config type: %T", castConfig)
+		err = fmt.Errorf("unknown config type: %T", castConfig)
+	}
+	if err != nil {
+		return nil, err
+	}
+	return &configState, nil
+}
+
+func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
+	if config.Sharing != nil {
+		err := s.applySharingConfig(ctx, config.Sharing, claim, results, configState)
+		if err != nil {
+			return err
+		}
+	}
+	if config.DriverConfig != nil {
+		err := s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState)
+		if err != nil {
+			return err
+		}
 	}
+	return nil
 }
 
-func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
+func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
 	// Get the list of claim requests this config is being applied over.
 	var requests []string
 	for _, r := range results {
@@ -390,19 +433,16 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
 		allocatableDevices[r.Device] = s.allocatable[r.Device]
 	}
 
-	// Declare a device group state object to populate.
-	var configState DeviceConfigState
-
 	// Apply time-slicing settings (if available).
 	if config.IsTimeSlicing() {
 		tsc, err := config.GetTimeSlicingConfig()
 		if err != nil {
-			return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
+			return fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
 		}
 		if tsc != nil {
 			err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
 			if err != nil {
-				return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
+				return fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
 			}
 		}
 	}
@@ -411,36 +451,55 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
 	if config.IsMps() {
 		mpsc, err := config.GetMpsConfig()
 		if err != nil {
-			return nil, fmt.Errorf("error getting MPS configuration: %w", err)
+			return fmt.Errorf("error getting MPS configuration: %w", err)
 		}
 		mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices)
 		if err := mpsControlDaemon.Start(ctx, mpsc); err != nil {
-			return nil, fmt.Errorf("error starting MPS control daemon: %w", err)
+			return fmt.Errorf("error starting MPS control daemon: %w", err)
 		}
 		if err := mpsControlDaemon.AssertReady(ctx); err != nil {
-			return nil, fmt.Errorf("MPS control daemon is not yet ready: %w", err)
+			return fmt.Errorf("MPS control daemon is not yet ready: %w", err)
 		}
 		configState.MpsControlDaemonID = mpsControlDaemon.GetID()
 		configState.containerEdits = mpsControlDaemon.GetCDIContainerEdits()
 	}
 
-	return &configState, nil
+	return nil
 }
 
-func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
-	// Declare a device group state object to populate.
-	var configState DeviceConfigState
-
+func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
 	// Create any necessary IMEX channels and gather their CDI container edits.
 	for _, r := range results {
 		imexChannel := s.allocatable[r.Device].ImexChannel
 		if err := s.nvdevlib.createImexChannelDevice(imexChannel.Channel); err != nil {
-			return nil, fmt.Errorf("error creating IMEX channel device: %w", err)
+			return fmt.Errorf("error creating IMEX channel device: %w", err)
 		}
 		configState.containerEdits = configState.containerEdits.Append(s.cdi.GetImexChannelContainerEdits(imexChannel))
 	}
 
-	return &configState, nil
+	return nil
+}
+
+func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
+	// Get the list of allocatable devices this config is being applied over.
+	allocatableDevices := make(AllocatableDevices)
+	for _, r := range results {
+		allocatableDevices[r.Device] = s.allocatable[r.Device]
+	}
+
+	if config.Driver == configapi.VfioPciDriver {
+		// Apply vfio-pci driver settings.
+		for _, r := range results {
+			info := allocatableDevices[r.Device]
+			err := s.vfioPciManager.Configure(info.Gpu)
+			if err != nil {
+				return err
+			}
+			configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu))
+		}
+	}
+
+	return nil
 }
 
 // GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver.
diff --git a/cmd/nvidia-dra-plugin/deviceinfo.go b/cmd/nvidia-dra-plugin/deviceinfo.go
index cc899c44..8098cc07 100644
--- a/cmd/nvidia-dra-plugin/deviceinfo.go
+++ b/cmd/nvidia-dra-plugin/deviceinfo.go
@@ -40,6 +40,7 @@ type GpuInfo struct {
 	driverVersion         string
 	cudaDriverVersion     string
 	migProfiles           []*MigProfileInfo
+	PciAddress            string `json:"pciAddress"`
 }
 
 type MigDeviceInfo struct {
@@ -130,6 +131,9 @@ func (d *GpuInfo) GetDevice() resourceapi.Device {
 				"cudaDriverVersion": {
 					VersionValue: ptr.To(semver.MustParse(d.cudaDriverVersion).String()),
 				},
+				"pciAddress": {
+					StringValue: &d.PciAddress,
+				},
 			},
 			Capacity: map[resourceapi.QualifiedName]resource.Quantity{
 				"memory": *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI),
diff --git a/cmd/nvidia-dra-plugin/main.go b/cmd/nvidia-dra-plugin/main.go
index ab5bd585..889be930 100644
--- a/cmd/nvidia-dra-plugin/main.go
+++ b/cmd/nvidia-dra-plugin/main.go
@@ -52,6 +52,9 @@ type Flags struct {
 	hostDriverRoot      string
 	nvidiaCTKPath       string
 	deviceClasses       sets.Set[string]
+	pciDevicesRoot      string
+	sysModulesRoot      string
+	vfioDevicesRoot     string
 }
 
 type Config struct {
diff --git a/cmd/nvidia-dra-plugin/mutex.go b/cmd/nvidia-dra-plugin/mutex.go
new file mode 100644
index 00000000..e98fed15
--- /dev/null
+++ b/cmd/nvidia-dra-plugin/mutex.go
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"sync"
+)
+
+type PerGPUMutex struct {
+	sync.Mutex
+	submutex map[string]*sync.Mutex
+}
+
+var perGpuLock *PerGPUMutex
+
+func init() {
+	perGpuLock = &PerGPUMutex{
+		submutex: make(map[string]*sync.Mutex),
+	}
+}
+
+func (pgm *PerGPUMutex) Get(gpu string) *sync.Mutex {
+	pgm.Mutex.Lock()
+	defer pgm.Mutex.Unlock()
+	if pgm.submutex[gpu] == nil {
+		pgm.submutex[gpu] = &sync.Mutex{}
+	}
+	return pgm.submutex[gpu]
+}
diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go
index 421e7a50..45634431 100644
--- a/cmd/nvidia-dra-plugin/nvlib.go
+++ b/cmd/nvidia-dra-plugin/nvlib.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -199,6 +199,10 @@ func (l deviceLib) enumerateImexChannels(config *Config) (AllocatableDevices, er
 	return devices, nil
 }
 
+func getPciAddressFromNvmlPciInfo(info nvml.PciInfo) string {
+	return fmt.Sprintf("%04x:%02x:%02x.0", info.Domain, info.Bus, info.Device)
+}
+
 func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) {
 	minor, ret := device.GetMinorNumber()
 	if ret != nvml.SUCCESS {
@@ -240,6 +244,11 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
 	if ret != nvml.SUCCESS {
 		return nil, fmt.Errorf("error getting CUDA driver version: %w", err)
 	}
+	pciInfo, ret := l.nvmllib.DeviceGetPciInfo(device)
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting PCI info for device %d: %v", index, ret)
+	}
+	pciAddress := getPciAddressFromNvmlPciInfo(pciInfo)
 
 	var migProfiles []*MigProfileInfo
 	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
@@ -307,6 +316,7 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
 		driverVersion:         driverVersion,
 		cudaDriverVersion:     fmt.Sprintf("%v.%v", cudaDriverVersion/1000, (cudaDriverVersion%1000)/10),
 		migProfiles:           migProfiles,
+		PciAddress:            pciAddress,
 	}
 
 	return gpuInfo, nil
diff --git a/cmd/nvidia-dra-plugin/prepared.go b/cmd/nvidia-dra-plugin/prepared.go
index edb369fe..f939e77b 100644
--- a/cmd/nvidia-dra-plugin/prepared.go
+++ b/cmd/nvidia-dra-plugin/prepared.go
@@ -203,3 +203,23 @@ func (d PreparedDevices) MigDeviceUUIDs() []string {
 	slices.Sort(uuids)
 	return uuids
 }
+
+func (l PreparedDeviceList) PciAddresses() []string {
+	var pciAddresses []string
+	for _, device := range l.Gpus() {
+		pciAddresses = append(pciAddresses, device.Gpu.Info.PciAddress)
+	}
+	return pciAddresses
+}
+
+func (g *PreparedDeviceGroup) PciAddresses() []string {
+	return g.Devices.Gpus().PciAddresses()
+}
+
+func (d PreparedDevices) PciAddresses() []string {
+	var pciAddresses []string
+	for _, group := range d {
+		pciAddresses = append(pciAddresses, group.PciAddresses()...)
+	}
+	return pciAddresses
+}
diff --git a/cmd/nvidia-dra-plugin/vfio-device.go b/cmd/nvidia-dra-plugin/vfio-device.go
new file mode 100644
index 00000000..fb5076dd
--- /dev/null
+++ b/cmd/nvidia-dra-plugin/vfio-device.go
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"os"
+	"os/exec"
+	"path/filepath"
+
+	cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
+	cdispec "tags.cncf.io/container-device-interface/specs-go"
+)
+
+const (
+	vfioPciModule          = "vfio_pci"
+	vfioPciDriver          = "vfio-pci"
+	nvidiaDriver           = "nvidia"
+	unbindFromDriverScript = "/usr/bin/unbind_from_driver.sh"
+	bindToDriverScript     = "/usr/bin/bind_to_driver.sh"
+	driverResetRetries     = "5"
+)
+
+type VfioPciManager struct {
+	pciDevicesRoot  string
+	vfioDevicesRoot string
+	sysModulesRoot  string
+	driver          string
+	vfioPciModule   string
+}
+
+func NewVfioPciManager() *VfioPciManager {
+	return &VfioPciManager{
+		pciDevicesRoot:  "/sys/bus/pci/devices",
+		vfioDevicesRoot: "/dev/vfio",
+		sysModulesRoot:  "/sys/module",
+		driver:          vfioPciDriver,
+		vfioPciModule:   vfioPciModule,
+	}
+}
+
+// Init ensures the vfio-pci module is loaded on the host.
+func (vm *VfioPciManager) Init() error {
+	if !vm.isVfioPCIModuleLoaded() {
+		err := vm.loadVfioPciModule()
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (vm *VfioPciManager) isVfioPCIModuleLoaded() bool {
+	modules, err := os.ReadDir(vm.sysModulesRoot)
+	if err != nil {
+		return false
+	}
+
+	for _, module := range modules {
+		if module.Name() == vm.vfioPciModule {
+			return true
+		}
+	}
+
+	return false
+
+}
+
+func (vm *VfioPciManager) loadVfioPciModule() error {
+	cmd := exec.Command("modprobe", vm.vfioPciModule) //nolint:gosec
+	_, err := cmd.CombinedOutput()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Configure binds the GPU to the vfio-pci driver.
+func (vm *VfioPciManager) Configure(info *GpuInfo) error {
+	perGpuLock.Get(info.PciAddress).Lock()
+	defer perGpuLock.Get(info.PciAddress).Unlock()
+
+	driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress)
+	if err != nil {
+		return err
+	}
+	if driver == vm.driver {
+		return nil
+	}
+	err = changeDriver(info.PciAddress, vm.driver)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Unconfigure binds the GPU to the nvidia driver.
+func (vm *VfioPciManager) Unconfigure(info *GpuInfo) error {
+	perGpuLock.Get(info.PciAddress).Lock()
+	defer perGpuLock.Get(info.PciAddress).Unlock()
+
+	driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress)
+	if err != nil {
+		return err
+	}
+	if driver == nvidiaDriver {
+		return nil
+	}
+	err = changeDriver(info.PciAddress, nvidiaDriver)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func getDriver(pciDevicesRoot, pciAddress string) (string, error) {
+	driverPath, err := os.Readlink(filepath.Join(pciDevicesRoot, pciAddress, "driver"))
+	if err != nil {
+		return "", err
+	}
+	_, driver := filepath.Split(driverPath)
+	return driver, nil
+}
+
+// changeDriver rebinds the PCI device at pciAddress to driver by calling
+// unbindFromDriver followed by bindToDriver.
+// It returns the first error encountered.
+func changeDriver(pciAddress, driver string) error {
+	// unbindFromDriver's second parameter is the retry budget, not the target
+	// driver; the driver name only matters to the bind step below.
+	if err := unbindFromDriver(pciAddress, driverResetRetries); err != nil {
+		return err
+	}
+	return bindToDriver(pciAddress, driver)
+}
+
+func unbindFromDriver(pciAddress, driverResetRetries string) error {
+	cmd := exec.Command(unbindFromDriverScript, pciAddress, driverResetRetries) //nolint:gosec
+	_, err := cmd.CombinedOutput()
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func bindToDriver(pciAddress, driver string) error {
+	cmd := exec.Command(bindToDriverScript, pciAddress, driver) //nolint:gosec
+	_, err := cmd.CombinedOutput()
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func (vm *VfioPciManager) getIommuGroupForVfioPciDevice(pciAddress string) string {
+	iommuGroup, err := os.Readlink(filepath.Join(vm.pciDevicesRoot, pciAddress, "iommu_group"))
+	if err != nil {
+		return ""
+	}
+	_, file := filepath.Split(iommuGroup)
+	return file
+
+}
+
+// GetCDIContainerEdits returns the CDI spec for a container to have access to the GPU while bound on vfio-pci driver.
+func (vm *VfioPciManager) GetCDIContainerEdits(info *GpuInfo) *cdiapi.ContainerEdits {
+	iommuGroup := vm.getIommuGroupForVfioPciDevice(info.PciAddress)
+	vfioDevicePath := filepath.Join(vm.vfioDevicesRoot, iommuGroup)
+	return &cdiapi.ContainerEdits{
+		ContainerEdits: &cdispec.ContainerEdits{
+			DeviceNodes: []*cdispec.DeviceNode{
+				{
+					Path: vfioDevicePath,
+				},
+			},
+		},
+	}
+}
diff --git a/scripts/bind_to_driver.sh b/scripts/bind_to_driver.sh
new file mode 100644
index 00000000..26840cd4
--- /dev/null
+++ b/scripts/bind_to_driver.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Usage: ./bind_to_driver.sh <ssss:bb:dd.f> <driver>
+# Bind the GPU specified by the PCI_ID=ssss:bb:dd.f to the given driver.
+
+bind_to_driver()
+{
+   local gpu=$1
+   local driver=$2
+   local drivers_path="/sys/bus/pci/drivers"
+   local driver_override_file="/sys/bus/pci/devices/$gpu/driver_override"
+   local bind_file="$drivers_path/$driver/bind"
+
+   if [ ! -e "$driver_override_file" ]; then
+      echo "'$driver_override_file' file does not exist" >&2
+      return 1
+   fi
+
+   echo "$driver" > "$driver_override_file"
+   if [ $? -ne 0 ]; then
+      echo "failed to write '$driver' to $driver_override_file" >&2
+      return 1
+   fi
+
+   if [ ! -e "$bind_file" ]; then
+      echo "'$bind_file' file does not exist" >&2
+      return 1
+   fi
+
+   echo "$gpu" > "$bind_file"
+   if [ $? -ne 0 ]; then
+      echo "failed to write '$gpu' to $bind_file" >&2
+      echo "" > "$driver_override_file"
+      return 1
+   fi
+}
+
+bind_to_driver "$1" "$2" || exit 1
\ No newline at end of file
diff --git a/scripts/unbind_from_driver.sh b/scripts/unbind_from_driver.sh
new file mode 100644
index 00000000..c7653679
--- /dev/null
+++ b/scripts/unbind_from_driver.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Usage: ./unbind_from_driver.sh <ssss:bb:dd.f>
+# Unbind the GPU specified by the PCI_ID=ssss:bb:dd.f from the driver it is bound to.
+# If that driver is nvidia, first try to acquire the GPU's unbindLock (up to 5 attempts) before unbinding.
+
+acquire_unbind_lock()
+{
+   local gpu=$1
+   local lock_retries=5
+   local unbind_lock_file="/proc/driver/nvidia/gpus/$gpu/unbindLock"
+   local unbind_lock=0
+   local attempt=1
+   # No unbindLock file for this GPU means there is nothing to acquire.
+   if [ ! -e "${unbind_lock_file}" ]; then
+      return 0
+   fi
+
+   while [[ $attempt -le ${lock_retries} ]]; do
+      echo "[retry $attempt/${lock_retries}] Attempting to acquire unbindLock for $gpu" >&1
+      # Write 1 to request the lock, then read the file back to see whether it took effect.
+      echo 1 > "${unbind_lock_file}"
+      read -r unbind_lock < "${unbind_lock_file}"
+      if [ "${unbind_lock:-0}" -eq 1 ]; then
+         echo "UnbindLock acquired for $gpu" >&1
+         return 0
+      fi
+      # Wait one second longer after each failed attempt before retrying.
+      sleep $attempt
+      attempt=$((attempt + 1))
+   done
+
+   echo "cannot obtain unbindLock for $gpu" >&2
+   return 1
+}
+
+unbind_from_driver()
+{
+   local gpu=$1
+   local existing_driver
+   local existing_driver_name
+   # A device without a driver symlink is not bound to anything; nothing to do.
+   [ -e "/sys/bus/pci/devices/$gpu/driver" ] || return 0
+   existing_driver=$(readlink -f "/sys/bus/pci/devices/$gpu/driver")
+   existing_driver_name=$(basename "${existing_driver}")
+   if [ "${existing_driver_name}" == "nvidia" ]; then
+      acquire_unbind_lock "$gpu" || return 1
+   fi
+   echo "$gpu" > "${existing_driver}/unbind" || return 1
+   return 0
+}
+
+unbind_from_driver "$1" || exit 1
\ No newline at end of file

From f92e72fff3545c4dfe36013f6621aa68d898de4c Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Mon, 2 Dec 2024 13:00:27 -0800
Subject: [PATCH 3/9] Dockerfile changes for vfio-pci device config

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 deployments/container/Dockerfile.ubi8   | 7 ++++---
 deployments/container/Dockerfile.ubuntu | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/deployments/container/Dockerfile.ubi8 b/deployments/container/Dockerfile.ubi8
index c365f123..ab52a352 100644
--- a/deployments/container/Dockerfile.ubi8
+++ b/deployments/container/Dockerfile.ubi8
@@ -59,9 +59,10 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes
 
 RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
 
-COPY --from=build /artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller
-COPY --from=build /artifacts/nvidia-dra-plugin     /usr/bin/nvidia-dra-plugin
-COPY --from=build /build/templates                 /templates
+COPY --from=build /artifacts/nvidia-dra-plugin         /usr/bin/nvidia-dra-plugin
+COPY --from=build /build/templates                     /templates
+COPY --from=build /build/scripts/bind_to_driver.sh     /usr/bin/bind_to_driver.sh
+COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh
 
 # Install / upgrade packages here that are required to resolve CVEs
 ARG CVE_UPDATES
diff --git a/deployments/container/Dockerfile.ubuntu b/deployments/container/Dockerfile.ubuntu
index ea5d58a3..b815bcd1 100644
--- a/deployments/container/Dockerfile.ubuntu
+++ b/deployments/container/Dockerfile.ubuntu
@@ -64,9 +64,10 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes
 
 RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
 
-COPY --from=build /artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller
-COPY --from=build /artifacts/nvidia-dra-plugin     /usr/bin/nvidia-dra-plugin
-COPY --from=build /build/templates                 /templates
+COPY --from=build /artifacts/nvidia-dra-plugin         /usr/bin/nvidia-dra-plugin
+COPY --from=build /build/templates                     /templates
+COPY --from=build /build/scripts/bind_to_driver.sh     /usr/bin/bind_to_driver.sh
+COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh
 
 # Install / upgrade packages here that are required to resolve CVEs
 ARG CVE_UPDATES

From 229cc20e00d06185cb6ecf900fcde1185c73a9d1 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Tue, 24 Sep 2024 18:42:22 -0700
Subject: [PATCH 4/9] vfio-pci gpu deviceclass

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 demo/clusters/kind/install-dra-driver.sh      |  2 +-
 .../templates/deviceclass-vfiopci.yaml        | 19 +++++++++++++++++++
 deployments/helm/k8s-dra-driver/values.yaml   |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml

diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh
index ece8cdf1..b48aafd6 100755
--- a/demo/clusters/kind/install-dra-driver.sh
+++ b/demo/clusters/kind/install-dra-driver.sh
@@ -24,7 +24,7 @@ source "${CURRENT_DIR}/scripts/common.sh"
 
 kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/gpu.present=true
 
-deviceClasses=${1:-"gpu,mig,imex"}
+deviceClasses=${1:-"gpu,mig,imex,vfiopci"}
 helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
     --set deviceClasses="{${deviceClasses}}" \
     ${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml
new file mode 100644
index 00000000..9fb1cb31
--- /dev/null
+++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml
@@ -0,0 +1,19 @@
+{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }}
+---
+apiVersion: resource.k8s.io/v1alpha3
+kind: DeviceClass
+metadata:
+  name: vfiopci.nvidia.com
+spec:
+  config:
+  - opaque:
+      driver: gpu.nvidia.com
+      parameters:
+        apiVersion: gpu.nvidia.com/v1alpha1
+        kind: GpuConfig
+        driverConfig:
+          driver: vfio-pci
+  selectors:
+  - cel:
+      expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'"
+{{- end }}
diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml
index 76ff38ca..986e7ad5 100644
--- a/deployments/helm/k8s-dra-driver/values.yaml
+++ b/deployments/helm/k8s-dra-driver/values.yaml
@@ -34,7 +34,7 @@ selectorLabelsOverride: {}
 
 allowDefaultNamespace: false
 
-deviceClasses: ["gpu", "mig", "imex"]
+deviceClasses: ["gpu", "mig", "imex", "vfiopci"]
 
 # Masking of the params file is typically done to allow nvkind to
 # selectively exclude certain GPUs from being visible to the

From 2bda933d90a1a40bd635f8f058467eaac4469527 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Tue, 15 Oct 2024 09:18:24 -0700
Subject: [PATCH 5/9] kind cluster changes for vfio-pci device config support

---
 demo/clusters/kind/scripts/kind-cluster-config.yaml  |  2 ++
 .../helm/k8s-dra-driver/templates/kubeletplugin.yaml | 12 ++++++++++++
 deployments/helm/k8s-dra-driver/values.yaml          |  4 ++++
 3 files changed, 18 insertions(+)

diff --git a/demo/clusters/kind/scripts/kind-cluster-config.yaml b/demo/clusters/kind/scripts/kind-cluster-config.yaml
index f1a34a1c..890b0b71 100644
--- a/demo/clusters/kind/scripts/kind-cluster-config.yaml
+++ b/demo/clusters/kind/scripts/kind-cluster-config.yaml
@@ -66,3 +66,5 @@ nodes:
   # on the kind nodes.
   - hostPath: /usr/bin/nvidia-ctk
     containerPath: /usr/bin/nvidia-ctk
+  - hostPath: /sys
+    containerPath: /sys
\ No newline at end of file
diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
index 0b9b09b0..e79e70b1 100644
--- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
+++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
@@ -103,6 +103,12 @@ spec:
         - name: driver-root
           mountPath: /driver-root
           readOnly: true
+        - name: sysfs
+          mountPath: /sys
+          readOnly: false
+        - name: dev-vfio
+          mountPath: /dev/vfio
+          readOnly: false
       volumes:
       - name: plugins-registry
         hostPath:
@@ -116,6 +122,12 @@ spec:
       - name: driver-root
         hostPath:
           path: {{ .Values.nvidiaDriverRoot }}
+      - name: sysfs
+        hostPath:
+          path: /sys
+      - name: dev-vfio
+        hostPath:
+          path: /dev/vfio
       {{- with .Values.kubeletPlugin.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml
index 986e7ad5..a5c7ee91 100644
--- a/deployments/helm/k8s-dra-driver/values.yaml
+++ b/deployments/helm/k8s-dra-driver/values.yaml
@@ -96,6 +96,10 @@ kubeletPlugin:
     plugin:
       securityContext:
         privileged: true
+        allowPrivilegeEscalation: true
+        runAsNonRoot: false
+        runAsUser: 0
+        runAsGroup: 0
       resources: {}
   affinity:
     nodeAffinity:

From 5ce0763f0d7e20e5d2a0b1e760157ee49183db85 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Thu, 17 Oct 2024 02:09:49 -0700
Subject: [PATCH 6/9] vfio-pci gpu claim example

---
 demo/specs/quickstart/gpu-test-vfiopci.yaml | 41 +++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 demo/specs/quickstart/gpu-test-vfiopci.yaml

diff --git a/demo/specs/quickstart/gpu-test-vfiopci.yaml b/demo/specs/quickstart/gpu-test-vfiopci.yaml
new file mode 100644
index 00000000..75f6cfbe
--- /dev/null
+++ b/demo/specs/quickstart/gpu-test-vfiopci.yaml
@@ -0,0 +1,41 @@
+# One pod, one container asking for 1 distinct GPU via the vfiopci.nvidia.com device class
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gpu-test-vfiopci
+
+---
+apiVersion: resource.k8s.io/v1alpha3
+kind: ResourceClaimTemplate
+metadata:
+  namespace: gpu-test-vfiopci
+  name: single-gpu
+spec:
+  spec:
+    devices:
+      requests:
+      - name: gpu
+        deviceClassName: vfiopci.nvidia.com
+
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  namespace: gpu-test-vfiopci
+  name: pod1
+  labels:
+    app: pod
+spec:
+  containers:
+  - name: ctr
+    image: ubuntu:22.04
+    command: ["bash", "-c"]
+    args: ["sleep 9999 & wait"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    resourceClaimTemplateName: single-gpu

From f1527355d8927ba91daaadd6c7632df23bd5f47e Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Wed, 4 Dec 2024 16:33:40 -0800
Subject: [PATCH 7/9] address review comments

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 .../resource/gpu/v1alpha1/driverconfig.go     |   2 +-
 .../resource/gpu/v1alpha1/gpuconfig.go        |  33 +++---
 cmd/nvidia-dra-plugin/allocatable.go          |   1 +
 cmd/nvidia-dra-plugin/device_state.go         | 107 +++++++++---------
 cmd/nvidia-dra-plugin/deviceinfo.go           |   2 +-
 cmd/nvidia-dra-plugin/main.go                 |   3 -
 cmd/nvidia-dra-plugin/nvlib.go                |  12 +-
 .../k8s-dra-driver/templates/_helpers.tpl     |  18 +++
 .../templates/kubeletplugin.yaml              |  18 +--
 .../k8s-dra-driver/templates/validation.yaml  |   2 +-
 10 files changed, 100 insertions(+), 98 deletions(-)

diff --git a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go
index 27349726..2dd59390 100644
--- a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go
+++ b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go
@@ -54,7 +54,7 @@ func (c *GpuDriverConfig) Validate() error {
 	case VfioPciDriver:
 		break
 	default:
-		return fmt.Errorf("invalid driver specified in gpu driver configuration")
+		return fmt.Errorf("invalid driver '%s' specified in gpu driver configuration", c.Driver)
 	}
 	return nil
 }
diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
index bf6b5765..d61db48c 100644
--- a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
+++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go
@@ -56,13 +56,14 @@ func DefaultGpuConfig() *GpuConfig {
 func (c *GpuConfig) Normalize() error {
 	if c.DriverConfig == nil {
 		c.DriverConfig = DefaultGpuDriverConfig()
-	} else {
-		if err := c.DriverConfig.Normalize(); err != nil {
-			return err
-		}
 	}
-	// If driver is not Nvidia, don't proceed with normalizing sharing configuration.
-	if c.DriverConfig.Driver != NvidiaDriver {
+
+	if err := c.DriverConfig.Normalize(); err != nil {
+		return err
+	}
+
+	// If sharing is not supported, don't proceed with normalizing its configuration.
+	if !c.SupportsSharing() {
 		return nil
 	}
 
@@ -84,21 +85,23 @@ func (c *GpuConfig) Normalize() error {
 
 // Validate ensures that GpuConfig has a valid set of values.
 func (c *GpuConfig) Validate() error {
-	if c.DriverConfig.Driver == NvidiaDriver {
+	if err := c.DriverConfig.Validate(); err != nil {
+		return err
+	}
+
+	if c.SupportsSharing() {
 		if c.Sharing == nil {
 			return fmt.Errorf("no sharing strategy set")
 		}
 		if err := c.Sharing.Validate(); err != nil {
 			return err
 		}
-	} else {
-		if c.Sharing != nil {
-			return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver")
-		}
-	}
-	if err := c.DriverConfig.Validate(); err != nil {
-		return err
+	} else if c.Sharing != nil {
+		return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver")
 	}
-
 	return nil
 }
+
+func (c *GpuConfig) SupportsSharing() bool {
+	return c.DriverConfig.Driver == NvidiaDriver
+}
diff --git a/cmd/nvidia-dra-plugin/allocatable.go b/cmd/nvidia-dra-plugin/allocatable.go
index b9bdbb46..68a54241 100644
--- a/cmd/nvidia-dra-plugin/allocatable.go
+++ b/cmd/nvidia-dra-plugin/allocatable.go
@@ -114,5 +114,6 @@ func (d AllocatableDevices) PciAddresses() []string {
 			pciAddresses = append(pciAddresses, device.Gpu.PciAddress)
 		}
 	}
+	slices.Sort(pciAddresses)
 	return pciAddresses
 }
diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go
index 6f851c49..9e7407cb 100644
--- a/cmd/nvidia-dra-plugin/device_state.go
+++ b/cmd/nvidia-dra-plugin/device_state.go
@@ -39,7 +39,7 @@ type OpaqueDeviceConfig struct {
 
 type DeviceConfigState struct {
 	MpsControlDaemonID string               `json:"mpsControlDaemonID"`
-	GpuConfig          *configapi.GpuConfig `json:"deviceConfig,omitempty"`
+	GpuConfig          *configapi.GpuConfig `json:"gpuConfig,omitempty"`
 	containerEdits     *cdiapi.ContainerEdits
 }
 
@@ -112,7 +112,9 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
 	}
 
 	// Initialize the vfio-pci driver manager.
-	vfioPciManager.Init()
+	if err := vfioPciManager.Init(); err != nil {
+		return nil, fmt.Errorf("unable to initialize vfio-pci manager: %v", err)
+	}
 
 	checkpoints, err := state.checkpointManager.ListCheckpoints()
 	if err != nil {
@@ -357,12 +359,21 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
 
 func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error {
 	for _, group := range devices {
-		var err error
 		if group.ConfigState.GpuConfig != nil {
-			err = s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus())
+			err := s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus())
+			if err != nil {
+				return err
+			}
 		}
-		if err != nil {
-			return err
+		// Stop any MPS control daemons started for each group of prepared devices.
+		mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(claimUID, group)
+		if err := mpsControlDaemon.Stop(ctx); err != nil {
+			return fmt.Errorf("error stopping MPS control daemon: %w", err)
+		}
+		// Go back to default time-slicing for all full GPUs.
+		tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
+		if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil {
+			return fmt.Errorf("error setting timeslice for devices: %w", err)
 		}
 	}
 	return nil
@@ -375,52 +386,40 @@ func (s *DeviceState) unprepareGpus(ctx context.Context, config *configapi.GpuCo
 			}
 		}
 	}
-	// Go back to default time-slicing for all full GPUs.
-	tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
-	if err := s.tsManager.SetTimeSlice(devices, tsc); err != nil {
-		return fmt.Errorf("error setting timeslice for devices: %w", err)
-	}
 	return nil
 }
 
 func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interface, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
-	var err error
 	var configState DeviceConfigState
-
 	switch castConfig := config.(type) {
 	case *configapi.GpuConfig:
 		configState.GpuConfig = castConfig
-		err = s.applyGpuConfig(ctx, castConfig, claim, results, &configState)
+		return s.applyGpuConfig(ctx, castConfig, claim, results, &configState)
 	case *configapi.MigDeviceConfig:
-		err = s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState)
+		return s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState)
 	case *configapi.ImexChannelConfig:
-		err = s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState)
+		return s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState)
 	default:
-		err = fmt.Errorf("unknown config type: %T", castConfig)
-	}
-	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("unknown config type: %T", castConfig)
 	}
-	return &configState, nil
 }
 
-func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
-	if config.Sharing != nil {
-		err := s.applySharingConfig(ctx, config.Sharing, claim, results, configState)
-		if err != nil {
-			return err
-		}
+func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) {
+	var err error
+	configState, err = s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState)
+	if err != nil {
+		return nil, err
 	}
-	if config.DriverConfig != nil {
-		err := s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState)
+	if config.SupportsSharing() {
+		configState, err = s.applySharingConfig(ctx, config.Sharing, claim, results, configState)
 		if err != nil {
-			return err
+			return nil, err
 		}
 	}
-	return nil
+	return configState, nil
 }
 
-func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
+func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) {
 	// Get the list of claim requests this config is being applied over.
 	var requests []string
 	for _, r := range results {
@@ -437,12 +436,12 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
 	if config.IsTimeSlicing() {
 		tsc, err := config.GetTimeSlicingConfig()
 		if err != nil {
-			return fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
+			return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
 		}
 		if tsc != nil {
 			err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
 			if err != nil {
-				return fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
+				return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
 			}
 		}
 	}
@@ -451,55 +450,51 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
 	if config.IsMps() {
 		mpsc, err := config.GetMpsConfig()
 		if err != nil {
-			return fmt.Errorf("error getting MPS configuration: %w", err)
+			return nil, fmt.Errorf("error getting MPS configuration: %w", err)
 		}
 		mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices)
 		if err := mpsControlDaemon.Start(ctx, mpsc); err != nil {
-			return fmt.Errorf("error starting MPS control daemon: %w", err)
+			return nil, fmt.Errorf("error starting MPS control daemon: %w", err)
 		}
 		if err := mpsControlDaemon.AssertReady(ctx); err != nil {
-			return fmt.Errorf("MPS control daemon is not yet ready: %w", err)
+			return nil, fmt.Errorf("MPS control daemon is not yet ready: %w", err)
 		}
 		configState.MpsControlDaemonID = mpsControlDaemon.GetID()
 		configState.containerEdits = mpsControlDaemon.GetCDIContainerEdits()
 	}
 
-	return nil
+	return configState, nil
 }
 
-func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
+func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) {
 	// Create any necessary IMEX channels and gather their CDI container edits.
 	for _, r := range results {
 		imexChannel := s.allocatable[r.Device].ImexChannel
 		if err := s.nvdevlib.createImexChannelDevice(imexChannel.Channel); err != nil {
-			return fmt.Errorf("error creating IMEX channel device: %w", err)
+			return nil, fmt.Errorf("error creating IMEX channel device: %w", err)
 		}
 		configState.containerEdits = configState.containerEdits.Append(s.cdi.GetImexChannelContainerEdits(imexChannel))
 	}
 
-	return nil
+	return configState, nil
 }
 
-func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) error {
-	// Get the list of allocatable devices this config is being applied over.
-	allocatableDevices := make(AllocatableDevices)
-	for _, r := range results {
-		allocatableDevices[r.Device] = s.allocatable[r.Device]
+func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) {
+	if config.Driver != configapi.VfioPciDriver {
+		return configState, nil
 	}
 
-	if config.Driver == configapi.VfioPciDriver {
-		// Apply vfio-pci driver settings.
-		for _, r := range results {
-			info := allocatableDevices[r.Device]
-			err := s.vfioPciManager.Configure(info.Gpu)
-			if err != nil {
-				return err
-			}
-			configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu))
+	// Apply vfio-pci driver settings.
+	for _, r := range results {
+		info := s.allocatable[r.Device]
+		err := s.vfioPciManager.Configure(info.Gpu)
+		if err != nil {
+			return nil, err
 		}
+		configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu))
 	}
 
-	return nil
+	return configState, nil
 }
 
 // GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver.
diff --git a/cmd/nvidia-dra-plugin/deviceinfo.go b/cmd/nvidia-dra-plugin/deviceinfo.go
index 8098cc07..194742ea 100644
--- a/cmd/nvidia-dra-plugin/deviceinfo.go
+++ b/cmd/nvidia-dra-plugin/deviceinfo.go
@@ -29,6 +29,7 @@ import (
 
 type GpuInfo struct {
 	UUID                  string `json:"uuid"`
+	PciAddress            string `json:"pciAddress"`
 	index                 int
 	minor                 int
 	migEnabled            bool
@@ -40,7 +41,6 @@ type GpuInfo struct {
 	driverVersion         string
 	cudaDriverVersion     string
 	migProfiles           []*MigProfileInfo
-	PciAddress            string `json:"pciAddress"`
 }
 
 type MigDeviceInfo struct {
diff --git a/cmd/nvidia-dra-plugin/main.go b/cmd/nvidia-dra-plugin/main.go
index 889be930..ab5bd585 100644
--- a/cmd/nvidia-dra-plugin/main.go
+++ b/cmd/nvidia-dra-plugin/main.go
@@ -52,9 +52,6 @@ type Flags struct {
 	hostDriverRoot      string
 	nvidiaCTKPath       string
 	deviceClasses       sets.Set[string]
-	pciDevicesRoot      string
-	sysModulesRoot      string
-	vfioDevicesRoot     string
 }
 
 type Config struct {
diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go
index 45634431..37639e92 100644
--- a/cmd/nvidia-dra-plugin/nvlib.go
+++ b/cmd/nvidia-dra-plugin/nvlib.go
@@ -199,10 +199,6 @@ func (l deviceLib) enumerateImexChannels(config *Config) (AllocatableDevices, er
 	return devices, nil
 }
 
-func getPciAddressFromNvmlPciInfo(info nvml.PciInfo) string {
-	return fmt.Sprintf("%04x:%02x:%02x.0", info.Domain, info.Bus, info.Device)
-}
-
 func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) {
 	minor, ret := device.GetMinorNumber()
 	if ret != nvml.SUCCESS {
@@ -244,12 +240,10 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
 	if ret != nvml.SUCCESS {
 		return nil, fmt.Errorf("error getting CUDA driver version: %w", err)
 	}
-	pciInfo, ret := l.nvmllib.DeviceGetPciInfo(device)
-	if ret != nvml.SUCCESS {
-		return nil, fmt.Errorf("error getting PCI info for device %d: %w", index, err)
+	pciAddress, err := device.GetPCIBusID()
+	if err != nil {
+		return nil, err
 	}
-	pciAddress := getPciAddressFromNvmlPciInfo(pciInfo)
-
 	var migProfiles []*MigProfileInfo
 	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
 		giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i)
diff --git a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl
index 7cf4ea01..8fff1579 100644
--- a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl
+++ b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl
@@ -127,3 +127,21 @@ Filter a list by a set of valid values
   {{- end }}
   {{- $result -}}
 {{- end -}}
+
+{{- define "k8s-dra-driver.vfiopciDeviceClassVolumes" -}}
+- name: sysfs
+  hostPath:
+    path: /sys
+- name: dev-vfio
+  hostPath:
+    path: /dev/vfio
+{{- end -}}
+
+{{- define "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" -}}
+- name: sysfs
+  mountPath: /sys
+  readOnly: false
+- name: dev-vfio
+  mountPath: /dev/vfio
+  readOnly: false
+{{- end -}}
\ No newline at end of file
diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
index e79e70b1..161d78a0 100644
--- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
+++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
@@ -103,12 +103,9 @@ spec:
         - name: driver-root
           mountPath: /driver-root
           readOnly: true
-        - name: sysfs
-          mountPath: /sys
-          readOnly: false
-        - name: dev-vfio
-          mountPath: /dev/vfio
-          readOnly: false
+        {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }}
+        {{- include "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" . | nindent 8 }}
+        {{- end }}
       volumes:
       - name: plugins-registry
         hostPath:
@@ -122,12 +119,9 @@ spec:
       - name: driver-root
         hostPath:
           path: {{ .Values.nvidiaDriverRoot }}
-      - name: sysfs
-        hostPath:
-          path: /sys
-      - name: dev-vfio
-        hostPath:
-          path: /dev/vfio
+      {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }}
+      {{- include "k8s-dra-driver.vfiopciDeviceClassVolumes" . | nindent 6}}
+      {{- end }}
       {{- with .Values.kubeletPlugin.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
diff --git a/deployments/helm/k8s-dra-driver/templates/validation.yaml b/deployments/helm/k8s-dra-driver/templates/validation.yaml
index ce2dbe68..f93e7dd3 100644
--- a/deployments/helm/k8s-dra-driver/templates/validation.yaml
+++ b/deployments/helm/k8s-dra-driver/templates/validation.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-{{- $validDeviceClasses := list "gpu" "mig" "imex" }}
+{{- $validDeviceClasses := list "gpu" "mig" "imex" "vfiopci" }}
 
 {{- if not (kindIs "slice" .Values.deviceClasses) }}
 {{- $error := "" }}

From 69cff40f1dfebdc9f9b225bf18bbc02698e0ec6e Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Thu, 5 Dec 2024 11:22:13 -0800
Subject: [PATCH 8/9] use nsenter to run kernel tasks

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 cmd/nvidia-dra-plugin/vfio-device.go | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/cmd/nvidia-dra-plugin/vfio-device.go b/cmd/nvidia-dra-plugin/vfio-device.go
index fb5076dd..d4c8fc8e 100644
--- a/cmd/nvidia-dra-plugin/vfio-device.go
+++ b/cmd/nvidia-dra-plugin/vfio-device.go
@@ -17,6 +17,7 @@
 package main
 
 import (
+	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -26,6 +27,7 @@ import (
 )
 
 const (
+	hostNamespaceMount     = "/proc/1/ns/mnt"
 	vfioPciModule          = "vfio_pci"
 	vfioPciDriver          = "vfio-pci"
 	nvidiaDriver           = "nvidia"
@@ -80,8 +82,7 @@ func (vm *VfioPciManager) isVfioPCIModuleLoaded() bool {
 }
 
 func (vm *VfioPciManager) loadVfioPciModule() error {
-	cmd := exec.Command("modprobe", vm.vfioPciModule) //nolint:gosec
-	_, err := cmd.CombinedOutput()
+	_, err := execCommandInHostNamespace("modprobe", []string{vm.vfioPciModule}) //nolint:gosec
 	if err != nil {
 		return err
 	}
@@ -149,8 +150,7 @@ func changeDriver(pciAddress, driver string) error {
 }
 
 func unbindFromDriver(pciAddress, driverResetRetries string) error {
-	cmd := exec.Command(unbindFromDriverScript, pciAddress, driverResetRetries) //nolint:gosec
-	_, err := cmd.CombinedOutput()
+	_, err := execCommandInHostNamespace(unbindFromDriverScript, []string{pciAddress, driverResetRetries}) //nolint:gosec
 	if err != nil {
 		return err
 	}
@@ -158,8 +158,7 @@ func unbindFromDriver(pciAddress, driverResetRetries string) error {
 }
 
 func bindToDriver(pciAddress, driver string) error {
-	cmd := exec.Command(bindToDriverScript, pciAddress, driver) //nolint:gosec
-	_, err := cmd.CombinedOutput()
+	_, err := execCommandInHostNamespace(bindToDriverScript, []string{pciAddress, driver}) //nolint:gosec
 	if err != nil {
 		return err
 	}
@@ -190,3 +189,9 @@ func (vm *VfioPciManager) GetCDIContainerEdits(info *GpuInfo) *cdiapi.ContainerE
 		},
 	}
 }
+
+// execCommandInHostNamespace runs cmd with args through nsenter, using hostNamespaceMount as the mount namespace, and returns the combined output.
+func execCommandInHostNamespace(cmd string, args []string) ([]byte, error) {
+	argv := append([]string{fmt.Sprintf("--mount=%s", hostNamespaceMount), "--", cmd}, args...)
+	return exec.Command("nsenter", argv...).CombinedOutput()
+}

From aa3c2bffb542d32354dd4779f697bcc8beca09e8 Mon Sep 17 00:00:00 2001
From: Varun Ramachandra Sekar <vsekar@nvidia.com>
Date: Thu, 5 Dec 2024 11:25:54 -0800
Subject: [PATCH 9/9] add missing dockerfile dependencies

Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
---
 deployments/container/Dockerfile.ubi8   | 4 +++-
 deployments/container/Dockerfile.ubuntu | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/deployments/container/Dockerfile.ubi8 b/deployments/container/Dockerfile.ubi8
index ab52a352..ab3b2715 100644
--- a/deployments/container/Dockerfile.ubi8
+++ b/deployments/container/Dockerfile.ubi8
@@ -19,7 +19,7 @@ ARG BASE_DIST=ubi8
 FROM --platform=${TARGETARCH} nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build
 
 RUN yum install -y \
-    wget make git gcc \
+    wget make git gcc kmod \
      && \
     rm -rf /var/cache/yum/*
 
@@ -63,6 +63,8 @@ COPY --from=build /artifacts/nvidia-dra-plugin         /usr/bin/nvidia-dra-plugi
 COPY --from=build /build/templates                     /templates
 COPY --from=build /build/scripts/bind_to_driver.sh     /usr/bin/bind_to_driver.sh
 COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh
+COPY --from=build /usr/bin/nsenter                     /usr/bin/nsenter
+COPY --from=build /usr/sbin/modprobe                   /usr/sbin/modprobe
 
 # Install / upgrade packages here that are required to resolve CVEs
 ARG CVE_UPDATES
diff --git a/deployments/container/Dockerfile.ubuntu b/deployments/container/Dockerfile.ubuntu
index b815bcd1..3559e5c0 100644
--- a/deployments/container/Dockerfile.ubuntu
+++ b/deployments/container/Dockerfile.ubuntu
@@ -19,7 +19,7 @@ ARG BASE_DIST=ubuntu20.04
 FROM --platform=${BUILDOS}/amd64 nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build
 
 RUN apt-get update && \
-    apt-get install -y wget make git gcc-aarch64-linux-gnu gcc \
+    apt-get install -y wget make git gcc-aarch64-linux-gnu gcc kmod \
     && \
     rm -rf /var/lib/apt/lists/*
 
@@ -68,6 +68,8 @@ COPY --from=build /artifacts/nvidia-dra-plugin         /usr/bin/nvidia-dra-plugi
 COPY --from=build /build/templates                     /templates
 COPY --from=build /build/scripts/bind_to_driver.sh     /usr/bin/bind_to_driver.sh
 COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh
+COPY --from=build /usr/bin/nsenter                     /usr/bin/nsenter
+COPY --from=build /usr/sbin/modprobe                   /usr/sbin/modprobe
 
 # Install / upgrade packages here that are required to resolve CVEs
 ARG CVE_UPDATES