Skip to content

Commit 2d7b126

Browse files
committed
Merge branch 'CNT-4762/extend-runtime-cdi-device-names' into 'main'
Extend the 'runtime.nvidia.com/gpu' CDI device kind to support full-GPUs... See merge request nvidia/container-toolkit/container-toolkit!514
2 parents 08ef3e7 + 86d8639 commit 2d7b126

33 files changed

+1569
-6
lines changed

Diff for: CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# NVIDIA Container Toolkit Changelog
22

3+
## v1.15.0-rc.2
4+
* Extend the `runtime.nvidia.com/gpu` CDI kind to support full-GPUs and MIG devices specified by index or UUID.
5+
36
## v1.15.0-rc.1
47
* Skip update of ldcache in containers without ldconfig. The .so.SONAME symlinks are still created.
58
* Normalize ldconfig path on use. This automatically adjusts the ldconfig setting applied to ldconfig.real on systems where this exists.
@@ -10,6 +13,7 @@
1013
* Added support for `nvidia-ctk runtime configure --enable-cdi` for the `docker` runtime. Note that this requires Docker >= 25.
1114
* Fixed bug in `nvidia-ctk config` command when using `--set`. The types of applied config options are now applied correctly.
1215
* Add `--relative-to` option to `nvidia-ctk transform root` command. This controls whether the root transformation is applied to host or container paths.
16+
* Added automatic CDI spec generation when the `runtime.nvidia.com/gpu=all` device is requested by a container.
1317

1418
* [libnvidia-container] Fix device permission check when using cgroupv2 (fixes #227)
1519

Diff for: go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d
77
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f
88
github.com/fsnotify/fsnotify v1.5.4
9+
github.com/google/uuid v1.4.0
910
github.com/opencontainers/runtime-spec v1.1.0
1011
github.com/pelletier/go-toml v1.9.4
1112
github.com/sirupsen/logrus v1.9.0

Diff for: go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
1515
github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
1616
github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
1717
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
18+
github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4=
19+
github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
1820
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
1921
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
2022
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=

Diff for: internal/modifier/cdi.go

+30-6
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,12 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
5050
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
5151
}
5252
if len(automaticDevices) > 0 {
53-
return newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
53+
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
54+
if err == nil {
55+
return automaticModifier, nil
56+
}
57+
logger.Warningf("Failed to create the automatic CDI modifier: %w", err)
58+
logger.Debugf("Falling back to the standard CDI modifier")
5459
}
5560

5661
return cdi.New(
@@ -152,7 +157,8 @@ func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]s
152157
func filterAutomaticDevices(devices []string) []string {
153158
var automatic []string
154159
for _, device := range devices {
155-
if device == "runtime.nvidia.com/gpu=all" {
160+
vendor, class, _ := parser.ParseDevice(device)
161+
if vendor == "runtime.nvidia.com" && class == "gpu" {
156162
automatic = append(automatic, device)
157163
}
158164
}
@@ -176,9 +182,6 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
176182
return cdiModifier, nil
177183
}
178184

179-
// TODO: use the requested devices when generating the CDI spec once we add
180-
// automatic CDI generation for more than just the 'runtime.nvidia.com/gpu=all'
181-
// device
182185
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
183186
cdilib, err := nvcdi.New(
184187
nvcdi.WithLogger(logger),
@@ -191,5 +194,26 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
191194
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
192195
}
193196

194-
return cdilib.GetSpec()
197+
identifiers := []string{}
198+
for _, device := range devices {
199+
_, _, id := parser.ParseDevice(device)
200+
identifiers = append(identifiers, id)
201+
}
202+
203+
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
204+
if err != nil {
205+
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
206+
}
207+
208+
commonEdits, err := cdilib.GetCommonEdits()
209+
if err != nil {
210+
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
211+
}
212+
213+
return spec.New(
214+
spec.WithDeviceSpecs(deviceSpecs),
215+
spec.WithEdits(*commonEdits.ContainerEdits),
216+
spec.WithVendor("runtime.nvidia.com"),
217+
spec.WithClass("gpu"),
218+
)
195219
}

Diff for: pkg/nvcdi/api.go

+1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,5 @@ type Interface interface {
5151
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
5252
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
5353
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
54+
GetDeviceSpecsByID(...string) ([]specs.Device, error)
5455
}

Diff for: pkg/nvcdi/gds.go

+7
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,10 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
8181
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
8282
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
8383
}
84+
85+
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
86+
// the provided identifiers, where an identifier is an index or UUID of a valid
87+
// GPU device.
88+
func (l *gdslib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
89+
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
90+
}

Diff for: pkg/nvcdi/identifier.go

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package nvcdi
2+
3+
import (
4+
"strconv"
5+
"strings"
6+
7+
"github.com/google/uuid"
8+
)
9+
10+
// identifier is a string naming a device. Its methods classify the string as
// a full-GPU index, a MIG index, a GPU UUID or a MIG UUID.
type identifier string
11+
12+
// isGPUIndex checks if an identifier is a full GPU index
13+
func (i identifier) isGpuIndex() bool {
14+
if _, err := strconv.ParseUint(string(i), 10, 0); err != nil {
15+
return false
16+
}
17+
return true
18+
}
19+
20+
// isMigIndex checks if an identifier is a MIG index
21+
func (i identifier) isMigIndex() bool {
22+
split := strings.SplitN(string(i), ":", 2)
23+
if len(split) != 2 {
24+
return false
25+
}
26+
for _, s := range split {
27+
if _, err := strconv.ParseUint(s, 10, 0); err != nil {
28+
return false
29+
}
30+
}
31+
return true
32+
}
33+
34+
// isUUID checks if an identifier is a UUID
35+
func (i identifier) isUUID() bool {
36+
return i.isGpuUUID() || i.isMigUUID()
37+
}
38+
39+
// isGpuUUID checks if an identifier is a GPU UUID
40+
// A GPU UUID must be of the form GPU-b1028956-cfa2-0990-bf4a-5da9abb51763
41+
func (i identifier) isGpuUUID() bool {
42+
if !strings.HasPrefix(string(i), "GPU-") {
43+
return false
44+
}
45+
_, err := uuid.Parse(strings.TrimPrefix(string(i), "GPU-"))
46+
return err == nil
47+
}
48+
49+
// isMigUUID checks if an identifier is a MIG UUID
50+
// A MIG UUID can be of one of two forms:
51+
// - MIG-b1028956-cfa2-0990-bf4a-5da9abb51763
52+
// - MIG-GPU-b1028956-cfa2-0990-bf4a-5da9abb51763/3/0
53+
func (i identifier) isMigUUID() bool {
54+
if !strings.HasPrefix(string(i), "MIG-") {
55+
return false
56+
}
57+
suffix := strings.TrimPrefix(string(i), "MIG-")
58+
_, err := uuid.Parse(suffix)
59+
if err == nil {
60+
return true
61+
}
62+
split := strings.SplitN(suffix, "/", 3)
63+
if len(split) != 3 {
64+
return false
65+
}
66+
if !identifier(split[0]).isGpuUUID() {
67+
return false
68+
}
69+
for _, s := range split[1:] {
70+
_, err := strconv.ParseUint(s, 10, 0)
71+
if err != nil {
72+
return false
73+
}
74+
}
75+
return true
76+
}

Diff for: pkg/nvcdi/identifier_test.go

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package nvcdi
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
7+
"github.com/stretchr/testify/require"
8+
)
9+
10+
func TestIsGpuIndex(t *testing.T) {
11+
testCases := []struct {
12+
id string
13+
expected bool
14+
}{
15+
{"", false},
16+
{"0", true},
17+
{"1", true},
18+
{"not an integer", false},
19+
}
20+
for i, tc := range testCases {
21+
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
22+
actual := identifier(tc.id).isGpuIndex()
23+
require.Equal(t, tc.expected, actual)
24+
})
25+
}
26+
}
27+
28+
func TestIsMigIndex(t *testing.T) {
29+
testCases := []struct {
30+
id string
31+
expected bool
32+
}{
33+
{"", false},
34+
{"0", false},
35+
{"not an integer", false},
36+
{"0:0", true},
37+
{"0:0:0", false},
38+
{"0:0.0", false},
39+
{"0:foo", false},
40+
{"foo:0", false},
41+
}
42+
for i, tc := range testCases {
43+
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
44+
actual := identifier(tc.id).isMigIndex()
45+
require.Equal(t, tc.expected, actual)
46+
})
47+
}
48+
}
49+
50+
func TestIsGpuUUID(t *testing.T) {
51+
testCases := []struct {
52+
id string
53+
expected bool
54+
}{
55+
{"", false},
56+
{"0", false},
57+
{"not an integer", false},
58+
{"GPU-foo", false},
59+
{"GPU-ebd34bdf-1083-eaac-2aff-4b71a022f9bd", true},
60+
{"MIG-ebd34bdf-1083-eaac-2aff-4b71a022f9bd", false},
61+
{"ebd34bdf-1083-eaac-2aff-4b71a022f9bd", false},
62+
}
63+
for i, tc := range testCases {
64+
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
65+
actual := identifier(tc.id).isGpuUUID()
66+
require.Equal(t, tc.expected, actual)
67+
})
68+
}
69+
}
70+
71+
func TestIsMigUUID(t *testing.T) {
72+
testCases := []struct {
73+
id string
74+
expected bool
75+
}{
76+
{"", false},
77+
{"0", false},
78+
{"not an integer", false},
79+
{"MIG-foo", false},
80+
{"MIG-ebd34bdf-1083-eaac-2aff-4b71a022f9bd", true},
81+
{"GPU-ebd34bdf-1083-eaac-2aff-4b71a022f9bd", false},
82+
{"ebd34bdf-1083-eaac-2aff-4b71a022f9bd", false},
83+
}
84+
for i, tc := range testCases {
85+
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
86+
actual := identifier(tc.id).isMigUUID()
87+
require.Equal(t, tc.expected, actual)
88+
})
89+
}
90+
}

Diff for: pkg/nvcdi/lib-csv.go

+7
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,10 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
9494
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
9595
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
9696
}
97+
98+
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
99+
// the provided identifiers, where an identifier is an index or UUID of a valid
100+
// GPU device.
101+
func (l *csvlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
102+
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported for CSV files")
103+
}

0 commit comments

Comments
 (0)