Skip to content

Commit 4fbb11e

Browse files
authored
Merge pull request #1515 from elezar/forward-compat-tegra
Add enable-cuda-compat on Tegra-based systems
2 parents 923fa9b + c8056eb commit 4fbb11e

File tree

7 files changed

+117
-37
lines changed

7 files changed

+117
-37
lines changed

cmd/nvidia-cdi-hook/cudacompat/cudacompat.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import (
3131
)
3232

3333
const (
34-
cudaCompatPath = "/usr/local/cuda/compat"
34+
defaultCudaCompatPath = "/usr/local/cuda/compat"
3535
// cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename
3636
// in ld.so.conf.d that includes a reference to the CUDA compat path.
3737
// The 00-compat prefix is chosen to ensure that these libraries have a
@@ -44,8 +44,11 @@ type command struct {
4444
}
4545

4646
type options struct {
47-
hostDriverVersion string
48-
containerSpec string
47+
cudaCompatContainerRoot string
48+
hostDriverVersion string
49+
// containerSpec allows the path to the container spec to be specified for
50+
// testing.
51+
containerSpec string
4952
}
5053

5154
// NewCommand constructs a cuda-compat command with the specified logger
@@ -76,6 +79,12 @@ func (m command) build() *cli.Command {
7679
Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
7780
Destination: &cfg.hostDriverVersion,
7881
},
82+
&cli.StringFlag{
83+
Name: "cuda-compat-container-root",
84+
Usage: "Specify the folder in which CUDA compat libraries are located in the container",
85+
Value: defaultCudaCompatPath,
86+
Destination: &cfg.cudaCompatContainerRoot,
87+
},
7988
&cli.StringFlag{
8089
Name: "container-spec",
8190
Hidden: true,
@@ -108,7 +117,7 @@ func (m command) run(_ *cli.Command, cfg *options) error {
108117
return fmt.Errorf("failed to determined container root: %w", err)
109118
}
110119

111-
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
120+
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.cudaCompatContainerRoot, cfg.hostDriverVersion)
112121
if err != nil {
113122
return fmt.Errorf("failed to get container forward compat directory: %w", err)
114123
}
@@ -119,17 +128,17 @@ func (m command) run(_ *cli.Command, cfg *options) error {
119128
return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
120129
}
121130

122-
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
131+
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, cudaCompatRoot string, hostDriverVersion string) (string, error) {
123132
if hostDriverVersion == "" {
124133
m.logger.Debugf("Host driver version not specified")
125134
return "", nil
126135
}
127-
if !containerRoot.hasPath(cudaCompatPath) {
136+
if !containerRoot.hasPath(cudaCompatRoot) {
128137
m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
129138
return "", nil
130139
}
131140

132-
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
141+
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatRoot, "libcuda.so.*.*"))
133142
if err != nil {
134143
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
135144
return "", nil

cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ func TestCompatLibs(t *testing.T) {
131131
c := command{
132132
logger: logger,
133133
}
134-
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
134+
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), defaultCudaCompatPath, tc.hostDriverVersion)
135135
require.NoError(t, err)
136136
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
137137
})

internal/discover/compat_libs.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ import (
88

99
// NewCUDACompatHookDiscoverer creates a discoverer for an enable-cuda-compat hook.
1010
// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version.
11-
func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string) Discover {
11+
func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string, cudaCompatContainerRoot string) Discover {
1212
var args []string
1313
if version != "" && !strings.Contains(version, "*") {
1414
args = append(args, "--host-driver-version="+version)
1515
}
16+
if cudaCompatContainerRoot != "" {
17+
args = append(args, "--cuda-compat-container-root="+cudaCompatContainerRoot)
18+
}
1619

1720
return hookCreator.Create("enable-cuda-compat", args...)
1821
}

internal/modifier/gated.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, dr
107107
return nil, fmt.Errorf("failed to get driver version: %w", err)
108108
}
109109

110-
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version)
110+
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version, "")
111111
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
112112
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
113113
return compatLibHookDiscoverer, nil

internal/platform-support/tegra/tegra.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,6 @@ func New(opts ...Option) (discover.Discover, error) {
5959
return nil, fmt.Errorf("failed to create discoverer for mount specs: %v", err)
6060
}
6161

62-
ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, mountSpecDiscoverer, o.hookCreator, o.ldconfigPath)
63-
if err != nil {
64-
return nil, fmt.Errorf("failed to create ldcach update hook discoverer: %v", err)
65-
}
66-
6762
tegraSystemMounts := discover.NewMounts(
6863
o.logger,
6964
lookup.NewFileLocator(lookup.WithLogger(o.logger)),
@@ -75,9 +70,6 @@ func New(opts ...Option) (discover.Discover, error) {
7570

7671
d := discover.Merge(
7772
mountSpecDiscoverer,
78-
// The ldcacheUpdateHook is added after the mount spec discoverer to
79-
// ensure that the symlinks are included.
80-
ldcacheUpdateHook,
8173
tegraSystemMounts,
8274
)
8375

pkg/nvcdi/driver-nvml.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func (l *nvcdilib) NewDriverLibraryDiscoverer(version string, libcudaSoParentDir
101101
)
102102
discoverers = append(discoverers, driverDotSoSymlinksDiscoverer)
103103

104-
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version)
104+
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, "")
105105
discoverers = append(discoverers, cudaCompatLibHookDiscoverer)
106106

107107
updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.hookCreator, l.ldconfigPath)

pkg/nvcdi/lib-csv.go

Lines changed: 94 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"fmt"
2121
"slices"
2222
"strconv"
23+
"strings"
2324

2425
"tags.cncf.io/container-device-interface/pkg/cdi"
2526
"tags.cncf.io/container-device-interface/specs-go"
@@ -101,12 +102,12 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) {
101102
}
102103
e, err := edits.FromDiscoverer(deviceNodeDiscoverer)
103104
if err != nil {
104-
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
105+
return nil, fmt.Errorf("failed to create container edits for CSV files: %w", err)
105106
}
106107

107108
names, err := l.deviceNamers.GetDeviceNames(l.index, l)
108109
if err != nil {
109-
return nil, fmt.Errorf("failed to get device name: %v", err)
110+
return nil, fmt.Errorf("failed to get device name: %w", err)
110111
}
111112
var deviceSpecs []specs.Device
112113
for _, name := range names {
@@ -157,22 +158,7 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) {
157158
// GetCommonEdits generates CDI container edits that can be used for ANY devices.
158159
// These explicitly do not include any device nodes.
159160
func (l *csvlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
160-
mountSpecs := tegra.Transform(
161-
tegra.Transform(
162-
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
163-
tegra.WithoutDeviceNodes(),
164-
),
165-
tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
166-
)
167-
driverDiscoverer, err := tegra.New(
168-
tegra.WithLogger(l.logger),
169-
tegra.WithDriverRoot(l.driverRoot),
170-
tegra.WithDevRoot(l.devRoot),
171-
tegra.WithHookCreator(l.hookCreator),
172-
tegra.WithLdconfigPath(l.ldconfigPath),
173-
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
174-
tegra.WithMountSpecs(mountSpecs),
175-
)
161+
driverDiscoverer, err := l.driverDiscoverer()
176162
if err != nil {
177163
return nil, fmt.Errorf("failed to create driver discoverer from CSV files: %w", err)
178164
}
@@ -321,3 +307,93 @@ func isIntegratedGPU(d nvml.Device) (bool, error) {
321307
}
322308
return pciInfo.Device == 0, nil
323309
}
310+
311+
func (l *csvlib) driverDiscoverer() (discover.Discover, error) {
312+
mountSpecs := tegra.Transform(
313+
tegra.Transform(
314+
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
315+
tegra.WithoutDeviceNodes(),
316+
),
317+
tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
318+
)
319+
driverDiscoverer, err := tegra.New(
320+
tegra.WithLogger(l.logger),
321+
tegra.WithDriverRoot(l.driverRoot),
322+
tegra.WithDevRoot(l.devRoot),
323+
tegra.WithHookCreator(l.hookCreator),
324+
tegra.WithLdconfigPath(l.ldconfigPath),
325+
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
326+
tegra.WithMountSpecs(mountSpecs),
327+
)
328+
if err != nil {
329+
return nil, fmt.Errorf("failed to create discoverer from CSV files: %w", err)
330+
}
331+
332+
cudaCompatDiscoverer := l.cudaCompatDiscoverer()
333+
334+
ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(l.logger, driverDiscoverer, l.hookCreator, l.ldconfigPath)
335+
if err != nil {
336+
return nil, fmt.Errorf("failed to create ldcache update hook discoverer: %w", err)
337+
}
338+
339+
d := discover.Merge(
340+
driverDiscoverer,
341+
cudaCompatDiscoverer,
342+
// The ldcacheUpdateHook is added last to ensure that the created symlinks are included
343+
ldcacheUpdateHook,
344+
)
345+
return d, nil
346+
}
347+
348+
// cudaCompatDiscoverer returns a discoverer for the CUDA forward compat hook
349+
// on Tegra-based systems.
350+
// If the system has NVML available, NVML is used to determine the driver
351+
// version to be passed to the hook.
352+
// On Orin-based systems, the compat library root in the container is also set.
353+
func (l *csvlib) cudaCompatDiscoverer() discover.Discover {
354+
hasNvml, _ := l.infolib.HasNvml()
355+
if !hasNvml {
356+
return nil
357+
}
358+
359+
ret := l.nvmllib.Init()
360+
if ret != nvml.SUCCESS {
361+
l.logger.Warningf("Failed to initialize NVML: %v", ret)
362+
return nil
363+
}
364+
defer func() {
365+
_ = l.nvmllib.Shutdown()
366+
}()
367+
368+
version, ret := l.nvmllib.SystemGetDriverVersion()
369+
if ret != nvml.SUCCESS {
370+
l.logger.Warningf("Failed to get driver version: %v", ret)
371+
return nil
372+
}
373+
374+
var names []string
375+
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
376+
name, ret := d.GetName()
377+
if ret != nvml.SUCCESS {
378+
return fmt.Errorf("device %v: %v", i, ret)
379+
}
380+
names = append(names, name)
381+
return nil
382+
})
383+
if err != nil {
384+
l.logger.Warningf("Failed to get device names: %v", err)
385+
return nil
386+
}
387+
388+
var cudaCompatContainerRoot string
389+
for _, name := range names {
390+
// TODO: Should this be overridable through a feature flag / config option?
391+
if strings.Contains(name, "Orin (nvgpu)") {
392+
// TODO: This should probably be a constant or configurable.
393+
cudaCompatContainerRoot = "/usr/local/cuda/compat-orin"
394+
break
395+
}
396+
}
397+
398+
return discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, cudaCompatContainerRoot)
399+
}

0 commit comments

Comments
 (0)