Skip to content

Commit 60b67f2

Browse files
committed
Expand cudacompat hook to always inspect libcuda.so ELF header if available
This commit reverts 08bf583, which made it so that the libcuda.so ELF header was only ever inspected on certain Tegra systems, like Orin. We now leverage the libcuda.so ELF header in all cases (if available) to determine if the CUDA compat libraries bundled in the container should be used in favor of the host driver libraries. This allows us to support CUDA minor version compatibility. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
1 parent 7bbafde commit 60b67f2

6 files changed

Lines changed: 245 additions & 16 deletions

File tree

cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"encoding/json"
2525
"fmt"
2626
"os"
27+
"slices"
2728
"strings"
2829

2930
"golang.org/x/mod/semver"
@@ -118,14 +119,31 @@ func getCUDAFwdCompatibilitySection(lib *elf.File) *elf.Section {
118119

119120
// UseCompat checks whether the CUDA compat libraries with the specified elf
120121
// header should be used given the specified host versions.
121-
// This is done by comparing the host CUDA version with the CUDA version
122-
// specified in the ELF header.
123-
func (h *compatElfHeader) UseCompat(hostCUDAVersion string) bool {
122+
// If the host driver version is specified, we check if the driver version
123+
// is supported in the ELF header. If no host driver version is provided, we
124+
// fall back to checking the CUDA version specified in the ELF header.
125+
func (h *compatElfHeader) UseCompat(compatDriverVersion string, hostDriverVersion string, hostCUDAVersion string) bool {
124126
if h == nil {
125127
return false
126128
}
127129

128-
return h.CUDAVersion.UseCompat(hostCUDAVersion)
130+
if compatDriverVersion == "" || hostDriverVersion == "" {
131+
if hostCUDAVersion != "" {
132+
return h.CUDAVersion.UseCompat(hostCUDAVersion)
133+
}
134+
return false
135+
}
136+
137+
hostDriverMajor, err := extractMajorVersion(hostDriverVersion)
138+
if err != nil {
139+
return false
140+
}
141+
142+
if !slices.Contains(h.Driver, hostDriverMajor) {
143+
return false
144+
}
145+
146+
return semver.Compare(normalizeVersion(compatDriverVersion), normalizeVersion(hostDriverVersion)) > 0
129147
}
130148

131149
type cudaVersion string

cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go

Lines changed: 117 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ func TestGetCUDACompatElfHeader(t *testing.T) {
3838
expected *compatElfHeader
3939
}{
4040
{
41-
description: "wip",
42-
filename: "libcuda.so.575.57.08",
41+
description: "575.57.08",
42+
filename: "575.57.08/libcuda.so.575.57.08",
4343
expected: &compatElfHeader{
4444
Format: 1,
4545
CUDAVersion: "12.9",
@@ -48,8 +48,8 @@ func TestGetCUDACompatElfHeader(t *testing.T) {
4848
},
4949
},
5050
{
51-
description: "wip",
52-
filename: "libcuda.so.590.44.01",
51+
description: "590.44.01",
52+
filename: "590.44.01/libcuda.so.590.44.01",
5353
expected: &compatElfHeader{
5454
Format: 1,
5555
CUDAVersion: "13.1",
@@ -70,3 +70,116 @@ func TestGetCUDACompatElfHeader(t *testing.T) {
7070
})
7171
}
7272
}
73+
74+
func TestUseCompat(t *testing.T) {
75+
testCases := []struct {
76+
description string
77+
elfHeader *compatElfHeader
78+
compatDriverVersion string
79+
hostDriverVersion string
80+
hostCudaVersion string
81+
expected bool
82+
}{
83+
{
84+
description: "container cuda version greater than host cuda version",
85+
elfHeader: &compatElfHeader{
86+
Format: 1,
87+
CUDAVersion: "12.9",
88+
Driver: []int{535, 550, 560, 565, 570, 575},
89+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
90+
},
91+
hostCudaVersion: "12.8",
92+
expected: true,
93+
},
94+
{
95+
description: "container cuda version same as host cuda version",
96+
elfHeader: &compatElfHeader{
97+
Format: 1,
98+
CUDAVersion: "12.9",
99+
Driver: []int{535, 550, 560, 565, 570, 575},
100+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
101+
},
102+
hostCudaVersion: "12.9",
103+
expected: false,
104+
},
105+
{
106+
description: "container cuda version less than host cuda version",
107+
elfHeader: &compatElfHeader{
108+
Format: 1,
109+
CUDAVersion: "12.9",
110+
Driver: []int{535, 550, 560, 565, 570, 575},
111+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
112+
},
113+
hostCudaVersion: "12.10",
114+
expected: false,
115+
},
116+
{
117+
description: "host driver branch not supported in compat elf header",
118+
elfHeader: &compatElfHeader{
119+
Format: 1,
120+
CUDAVersion: "12.9",
121+
Driver: []int{535, 550, 560, 565, 570, 575},
122+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
123+
},
124+
compatDriverVersion: "575.57.08",
125+
hostDriverVersion: "590.44.01",
126+
expected: false,
127+
},
128+
{
129+
description: "host driver branch supported in compat elf header, host driver branch < compat driver branch",
130+
elfHeader: &compatElfHeader{
131+
Format: 1,
132+
CUDAVersion: "12.9",
133+
Driver: []int{535, 550, 560, 565, 570, 575},
134+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
135+
},
136+
compatDriverVersion: "575.57.08",
137+
hostDriverVersion: "570.211.01",
138+
expected: true,
139+
},
140+
{
141+
description: "host driver branch same as compat driver branch, compat driver > host driver",
142+
elfHeader: &compatElfHeader{
143+
Format: 1,
144+
CUDAVersion: "12.9",
145+
Driver: []int{535, 550, 560, 565, 570, 575},
146+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
147+
},
148+
compatDriverVersion: "575.57.08",
149+
hostDriverVersion: "575.10.10",
150+
expected: true,
151+
},
152+
{
153+
description: "host driver branch same as compat driver branch, compat driver = host driver",
154+
elfHeader: &compatElfHeader{
155+
Format: 1,
156+
CUDAVersion: "12.9",
157+
Driver: []int{535, 550, 560, 565, 570, 575},
158+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
159+
},
160+
compatDriverVersion: "575.57.08",
161+
hostDriverVersion: "575.57.08",
162+
expected: false,
163+
},
164+
{
165+
description: "host driver branch same as compat driver branch, compat driver < host driver",
166+
elfHeader: &compatElfHeader{
167+
Format: 1,
168+
CUDAVersion: "12.9",
169+
Driver: []int{535, 550, 560, 565, 570, 575},
170+
Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14},
171+
},
172+
compatDriverVersion: "575.57.08",
173+
hostDriverVersion: "575.99.99",
174+
expected: false,
175+
},
176+
}
177+
178+
for _, tc := range testCases {
179+
t.Run(tc.description, func(t *testing.T) {
180+
useCompat := tc.elfHeader.UseCompat(tc.compatDriverVersion, tc.hostDriverVersion, tc.hostCudaVersion)
181+
182+
require.EqualValues(t, tc.expected, useCompat)
183+
})
184+
}
185+
}

cmd/nvidia-cdi-hook/cudacompat/cudacompat.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -177,16 +177,19 @@ func (m command) getContainerForwardCompatDir(containerRoot containerRoot, o *op
177177
}
178178

179179
func (m command) useCompatLibraries(libcudaCompatPath string, hostDriverVersion string, hostCUDAVersion string) (bool, error) {
180+
// First check the ELF header of the libcuda.so included in the compat directory.
181+
// If this is present, we use the ELF header to determine whether the CUDA compat
182+
// libraries in the container should be used over the host driver libraries.
183+
compatDriverVersion := strings.TrimPrefix(filepath.Base(libcudaCompatPath), "libcuda.so.")
184+
cudaCompatHeader, _ := GetCUDACompatElfHeader(libcudaCompatPath)
185+
if cudaCompatHeader != nil {
186+
return cudaCompatHeader.UseCompat(compatDriverVersion, hostDriverVersion, hostCUDAVersion), nil
187+
}
188+
180189
// If the host CUDA version is specified, we need to inspect the ELF header
181190
// of the compat libraries in the container to determine whether these
182-
// should be used.
191+
// should be used. Return early if we cannot read the ELF header.
183192
if hostCUDAVersion != "" {
184-
cudaCompatHeader, _ := GetCUDACompatElfHeader(libcudaCompatPath)
185-
if cudaCompatHeader != nil {
186-
return cudaCompatHeader.UseCompat(hostCUDAVersion), nil
187-
}
188-
// If we were unable to read the CUDA header, we do not use the compat
189-
// libraries.
190193
return false, nil
191194
}
192195

@@ -196,12 +199,14 @@ func (m command) useCompatLibraries(libcudaCompatPath string, hostDriverVersion
196199
return false, nil
197200
}
198201

202+
// If we reach this point, it means we could not read the ELF header but
203+
// the host driver version is specified. We fall back to comparing the major
204+
// versions of the host driver and compat driver.
199205
driverMajor, err := extractMajorVersion(hostDriverVersion)
200206
if err != nil {
201207
return false, fmt.Errorf("failed to extract major version from %q: %v", hostDriverVersion, err)
202208
}
203209

204-
compatDriverVersion := strings.TrimPrefix(filepath.Base(libcudaCompatPath), "libcuda.so.")
205210
compatMajor, err := extractMajorVersion(compatDriverVersion)
206211
if err != nil {
207212
return false, fmt.Errorf("failed to extract major version from %q: %v", compatDriverVersion, err)

0 commit comments

Comments
 (0)