Skip to content

Commit 539c3e3

Browse files
authored
Merge pull request #1827 from tkatila/gpu-plugin-cdi-support
GPU-plugin: add cdi support
2 parents d11f3e5 + 8dd5b4a commit 539c3e3

File tree

16 files changed

+528
-69
lines changed

16 files changed

+528
-69
lines changed

.golangci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ issues:
7373
- path: test/e2e/
7474
linters:
7575
- wsl
76+
- gocognit
77+
- gocyclo
7678
- path: cmd/gpu_fakedev/
7779
linters:
7880
- wsl

cmd/gpu_plugin/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Table of Contents
1616
* [Running GPU plugin as non-root](#running-gpu-plugin-as-non-root)
1717
* [Labels created by GPU plugin](#labels-created-by-gpu-plugin)
1818
* [SR-IOV use with the plugin](#sr-iov-use-with-the-plugin)
19+
* [CDI support](#cdi-support)
1920
* [KMD and UMD](#kmd-and-umd)
2021
* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
2122
* [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
@@ -218,6 +219,19 @@ GPU plugin does __not__ setup SR-IOV. It has to be configured by the cluster adm
218219
219220
GPU plugin does, however, support provisioning Virtual Functions (VFs) to containers for an SR-IOV enabled GPU. When the plugin detects a GPU with SR-IOV VFs configured, it will only provision the VFs and leave the PF device on the host.
220221
222+
### CDI support
223+
224+
GPU plugin supports [CDI](https://github.com/container-orchestrated-devices/container-device-interface) to provide device details to the container. CDI support does not yet provide any benefits compared to the traditional Kubernetes Device Plugin API. The CDI device specs will improve in the future with features that are not possible with the Device Plugin API.
225+
226+
To enable CDI support, the container runtime has to support it. The support varies depending on the version:
227+
* CRI-O supports CDI by default from v1.24.0 onwards.
228+
* containerd supports CDI from v1.7.0 onwards. The 2.0.0 release will enable it by default.
229+
* Docker supports CDI from v25 onwards.
230+
231+
Kubernetes has included CDI support since the 1.28 release. In 1.28, it needs to be enabled via the `DevicePluginCDIDevices` feature gate. From 1.29 onwards, the feature is enabled by default.
232+
233+
> *NOTE*: To use CDI outside of Kubernetes, for example with Docker or Podman, CDI specs can be generated with the [Intel CDI specs generator](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0).
234+
221235
### KMD and UMD
222236
223237
There are 3 different Kernel Mode Drivers (KMD) available: `i915 upstream`, `i915 backport` and `xe`:

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
3535
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
3636
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
37+
cdispec "tags.cncf.io/container-device-interface/specs-go"
3738
)
3839

3940
const (
@@ -202,13 +203,10 @@ func packedPolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string {
202203
return deviceIds
203204
}
204205

205-
// Returns a slice of by-path Mounts for a cardPath&Name.
206-
// by-path files are searched from the given bypathDir.
207-
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
208-
func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string) []pluginapi.Mount {
206+
func (dp *devicePlugin) pciAddressForCard(cardPath, cardName string) (string, error) {
209207
linkPath, err := os.Readlink(cardPath)
210208
if err != nil {
211-
return nil
209+
return "", err
212210
}
213211

214212
// Fetches the pci address for a drm card by reading the
@@ -220,9 +218,27 @@ func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string)
220218
if !dp.pciAddressReg.MatchString(pciAddress) {
221219
klog.Warningf("Invalid pci address for %s: %s", cardPath, pciAddress)
222220

223-
return nil
221+
return "", os.ErrInvalid
224222
}
225223

224+
return pciAddress, nil
225+
}
226+
227+
func pciDeviceIDForCard(cardPath string) (string, error) {
228+
idPath := filepath.Join(cardPath, "device", "device")
229+
230+
idBytes, err := os.ReadFile(idPath)
231+
if err != nil {
232+
return "", err
233+
}
234+
235+
return strings.Split(string(idBytes), "\n")[0], nil
236+
}
237+
238+
// Returns a slice of by-path Mounts for a pciAddress.
239+
// by-path files are searched from the given bypathDir.
240+
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
241+
func (dp *devicePlugin) bypathMountsForPci(pciAddress, bypathDir string) []pluginapi.Mount {
226242
files, err := os.ReadDir(bypathDir)
227243
if err != nil {
228244
klog.Warningf("Failed to read by-path directory: %+v", err)
@@ -481,6 +497,45 @@ func (dp *devicePlugin) createDeviceSpecsFromDrmFiles(cardPath string) []plugina
481497
return specs
482498
}
483499

500+
func (dp *devicePlugin) createMountsAndCDIDevices(cardPath, name string, devSpecs []pluginapi.DeviceSpec) ([]pluginapi.Mount, *cdispec.Spec) {
501+
mounts := []pluginapi.Mount{}
502+
503+
if dp.bypathFound {
504+
if pciAddr, pciErr := dp.pciAddressForCard(cardPath, name); pciErr == nil {
505+
mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir)
506+
}
507+
}
508+
509+
spec := &cdispec.Spec{
510+
Version: dpapi.CDIVersion,
511+
Kind: dpapi.CDIVendor + "/gpu",
512+
Devices: make([]cdispec.Device, 1),
513+
}
514+
515+
spec.Devices[0].Name = name
516+
517+
cedits := &spec.Devices[0].ContainerEdits
518+
519+
for _, dspec := range devSpecs {
520+
cedits.DeviceNodes = append(cedits.DeviceNodes, &cdispec.DeviceNode{
521+
HostPath: dspec.HostPath,
522+
Path: dspec.ContainerPath,
523+
Permissions: dspec.Permissions,
524+
})
525+
}
526+
527+
for _, mount := range mounts {
528+
cedits.Mounts = append(cedits.Mounts, &cdispec.Mount{
529+
HostPath: mount.HostPath,
530+
ContainerPath: mount.ContainerPath,
531+
Type: "none",
532+
Options: []string{"bind", "ro"},
533+
})
534+
}
535+
536+
return mounts, spec
537+
}
538+
484539
func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
485540
files, err := os.ReadDir(dp.sysfsDir)
486541
if err != nil {
@@ -509,12 +564,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
509564
continue
510565
}
511566

512-
mounts := []pluginapi.Mount{}
513-
if dp.bypathFound {
514-
mounts = dp.bypathMountsForPci(cardPath, name, dp.bypathDir)
515-
}
567+
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)
516568

517-
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, nil)
569+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)
518570

519571
for i := 0; i < dp.options.sharedDevNum; i++ {
520572
devID := fmt.Sprintf("%s-%d", name, i)

0 commit comments

Comments
 (0)