Skip to content

Commit 8dd5b4a

Browse files
committed
e2e: gpu: add tests for different deployments
Signed-off-by: Tuomas Katila <[email protected]>
1 parent 402fb8d commit 8dd5b4a

File tree

11 files changed

+151
-46
lines changed

11 files changed

+151
-46
lines changed

.golangci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ issues:
7373
- path: test/e2e/
7474
linters:
7575
- wsl
76+
- gocognit
77+
- gocyclo
7678
- path: cmd/gpu_fakedev/
7779
linters:
7880
- wsl

test/e2e/dlb/dlb.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func describe() {
8484
ginkgo.Context("When PF resources are available [Resource:pf]", func() {
8585
ginkgo.BeforeEach(func(ctx context.Context) {
8686
resource := v1.ResourceName("dlb.intel.com/pf")
87-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
87+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
8888
framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
8989
}
9090
})
@@ -101,7 +101,7 @@ func describe() {
101101
ginkgo.Context("When VF resources are available [Resource:vf]", func() {
102102
ginkgo.BeforeEach(func(ctx context.Context) {
103103
resource := v1.ResourceName("dlb.intel.com/vf")
104-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
104+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
105105
framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
106106
}
107107
})

test/e2e/dsa/dsa.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ func describe() {
9797
ginkgo.Context("When DSA resources are available [Resource:dedicated]", func() {
9898
ginkgo.BeforeEach(func(ctx context.Context) {
9999
ginkgo.By("checking if the resource is allocatable")
100-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second); err != nil {
100+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitForPositiveResource); err != nil {
101101
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
102102
}
103103
})

test/e2e/fpga/fpga.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func runDevicePlugin(ctx context.Context, fmw *framework.Framework, pluginKustom
129129

130130
ginkgo.By("checking if the resource is allocatable")
131131

132-
if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second); err != nil {
132+
if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
133133
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
134134
}
135135
}

test/e2e/gpu/gpu.go

Lines changed: 122 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -37,51 +37,112 @@ import (
3737

3838
const (
3939
kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
40+
monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
41+
rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
42+
nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"
4043
containerName = "testcontainer"
4144
tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
4245
tfPodName = "training-pod"
4346
)
4447

4548
func init() {
46-
ginkgo.Describe("GPU plugin [Device:gpu]", describe)
49+
// This needs to be Ordered because only one GPU plugin can function on the node at once.
50+
ginkgo.Describe("GPU plugin [Device:gpu]", describe, ginkgo.Ordered)
51+
}
52+
53+
func createPluginAndVerifyExistence(f *framework.Framework, ctx context.Context, kustomizationPath, baseResource string) {
54+
ginkgo.By("deploying GPU plugin")
55+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
56+
57+
ginkgo.By("waiting for GPU plugin's availability")
58+
podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
59+
labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
60+
if err != nil {
61+
e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
62+
e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
63+
framework.Failf("unable to wait for all pods to be running and ready: %v", err)
64+
}
65+
66+
ginkgo.By("checking GPU plugin's securityContext")
67+
if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
68+
framework.Failf("container filesystem info checks failed: %v", err)
69+
}
70+
71+
ginkgo.By("checking if the resource is allocatable")
72+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, v1.ResourceName(baseResource), 30*time.Second, utils.WaitForPositiveResource); err != nil {
73+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
74+
}
4775
}
4876

4977
func describe() {
5078
f := framework.NewDefaultFramework("gpuplugin")
5179
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
5280

53-
kustomizationPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
81+
vanillaPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
5482
if errFailedToLocateRepoFile != nil {
5583
framework.Failf("unable to locate %q: %v", kustomizationYaml, errFailedToLocateRepoFile)
5684
}
5785

58-
ginkgo.BeforeEach(func(ctx context.Context) {
59-
ginkgo.By("deploying GPU plugin")
60-
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
61-
62-
ginkgo.By("waiting for GPU plugin's availability")
63-
podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
64-
labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
65-
if err != nil {
66-
e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
67-
e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
68-
framework.Failf("unable to wait for all pods to be running and ready: %v", err)
69-
}
70-
71-
ginkgo.By("checking GPU plugin's securityContext")
72-
if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
73-
framework.Failf("container filesystem info checks failed: %v", err)
74-
}
75-
})
86+
monitoringPath, errFailedToLocateRepoFile := utils.LocateRepoFile(monitoringYaml)
87+
if errFailedToLocateRepoFile != nil {
88+
framework.Failf("unable to locate %q: %v", monitoringYaml, errFailedToLocateRepoFile)
89+
}
90+
91+
nfdRulesPath, errFailedToLocateRepoFile := utils.LocateRepoFile(nfdRulesYaml)
92+
if errFailedToLocateRepoFile != nil {
93+
framework.Failf("unable to locate %q: %v", nfdRulesYaml, errFailedToLocateRepoFile)
94+
}
95+
96+
resourceManagerPath, errFailedToLocateRepoFile := utils.LocateRepoFile(rmEnabledYaml)
97+
if errFailedToLocateRepoFile != nil {
98+
framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile)
99+
}
100+
101+
ginkgo.Context("When GPU plugin is deployed [Resource:i915]", func() {
102+
ginkgo.AfterEach(func(ctx context.Context) {
103+
framework.Logf("Removing gpu-plugin manually")
76104

77-
ginkgo.Context("When GPU resources are available [Resource:i915]", func() {
78-
ginkgo.BeforeEach(func(ctx context.Context) {
79-
ginkgo.By("checking if the resource is allocatable")
80-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
81-
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
105+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "delete", "-k", filepath.Dir(vanillaPath))
106+
107+
framework.Logf("Waiting for i915 resources to go to zero")
108+
109+
// Wait for resources to go to zero
110+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second, utils.WaitForZeroResource); err != nil {
111+
framework.Failf("unable to wait for nodes to have no resources: %v", err)
82112
}
83113
})
114+
84115
ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
116+
createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")
117+
118+
podListFunc := framework.ListObjects(f.ClientSet.CoreV1().Pods(f.Namespace.Name).List, metav1.ListOptions{})
119+
120+
pods, err := podListFunc(ctx)
121+
if err != nil {
122+
framework.Failf("Couldn't list pods: %+v", err)
123+
}
124+
125+
if len(pods.Items) != 1 {
126+
framework.Failf("Invalid amount of Pods listed %d", len(pods.Items))
127+
}
128+
129+
pluginPod := pods.Items[0]
130+
131+
ginkgo.By("checking if CDI path is included in volumes")
132+
found := false
133+
for _, v := range pluginPod.Spec.Volumes {
134+
if v.HostPath != nil && v.HostPath.Path == "/var/run/cdi" {
135+
framework.Logf("CDI volume found")
136+
found = true
137+
138+
break
139+
}
140+
}
141+
142+
if !found {
143+
framework.Fail("Couldn't find CDI volume in GPU plugin deployment")
144+
}
145+
85146
ginkgo.By("submitting a pod requesting GPU resources")
86147
podSpec := &v1.Pod{
87148
ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},
@@ -122,7 +183,41 @@ func describe() {
122183
framework.Logf("found card and renderD from the log")
123184
})
124185

186+
ginkgo.Context("When [Deployment:monitoring] deployment is applied [Resource:i915]", func() {
187+
ginkgo.It("check if monitoring resource is available", func(ctx context.Context) {
188+
createPluginAndVerifyExistence(f, ctx, monitoringPath, "gpu.intel.com/i915")
189+
190+
ginkgo.By("checking if the monitoring resource is allocatable")
191+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915_monitoring", 30*time.Second, utils.WaitForPositiveResource); err != nil {
192+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
193+
}
194+
})
195+
})
196+
197+
ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() {
198+
ginkgo.It("check if i915 resources is available", func(ctx context.Context) {
199+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath))
200+
201+
createPluginAndVerifyExistence(f, ctx, resourceManagerPath, "gpu.intel.com/i915")
202+
203+
// To speed up extended resource detection, let's restart NFD worker
204+
e2ekubectl.RunKubectlOrDie("node-feature-discovery", "rollout", "restart", "daemonset", "nfd-worker")
205+
206+
ginkgo.By("checking if the millicores resource is allocatable")
207+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/millicores", 30*time.Second, utils.WaitForPositiveResource); err != nil {
208+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
209+
}
210+
211+
ginkgo.By("checking if the tiles resource is allocatable")
212+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/tiles", 30*time.Second, utils.WaitForPositiveResource); err != nil {
213+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
214+
}
215+
})
216+
})
217+
125218
ginkgo.It("run a small workload on the GPU [App:tensorflow]", func(ctx context.Context) {
219+
createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")
220+
126221
kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
127222
if err != nil {
128223
framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
@@ -146,13 +241,9 @@ func describe() {
146241
})
147242

148243
ginkgo.Context("When GPU resources are available [Resource:xe]", func() {
149-
ginkgo.BeforeEach(func(ctx context.Context) {
150-
ginkgo.By("checking if the resource is allocatable")
151-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/xe", 30*time.Second); err != nil {
152-
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
153-
}
154-
})
155244
ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
245+
createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/xe")
246+
156247
ginkgo.By("submitting a pod requesting GPU resources")
157248
podSpec := &v1.Pod{
158249
ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},

test/e2e/iaa/iaa.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ func describe() {
9797
ginkgo.Context("When IAA resources are available [Resource:dedicated]", func() {
9898
ginkgo.BeforeEach(func(ctx context.Context) {
9999
ginkgo.By("checking if the resource is allocatable")
100-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second); err != nil {
100+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitForPositiveResource); err != nil {
101101
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
102102
}
103103
})

test/e2e/operator/operator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func testPluginWithOperator(deviceName string, resourceNames []v1.ResourceName,
8989
}
9090

9191
for _, resourceName := range resourceNames {
92-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout); err != nil {
92+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout, utils.WaitForPositiveResource); err != nil {
9393
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
9494
}
9595
}

test/e2e/qat/qatplugin_dpdk.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func describeQatDpdkPlugin() {
9898
}
9999

100100
ginkgo.By("checking if the resource is allocatable")
101-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second); err != nil {
101+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second, utils.WaitForPositiveResource); err != nil {
102102
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
103103
}
104104
})

test/e2e/qat/qatplugin_kernel.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ func describeQatKernelPlugin() {
8282
ginkgo.Context("When QAT resources are available [Resource:cy1_dc0]", func() {
8383
ginkgo.BeforeEach(func(ctx context.Context) {
8484
ginkgo.By("checking if the resource is allocatable")
85-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second); err != nil {
85+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second, utils.WaitForPositiveResource); err != nil {
8686
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
8787
}
8888
})

test/e2e/sgx/sgx.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ func describe() {
8282
ginkgo.Context("When SGX resources are available", func() {
8383
ginkgo.BeforeEach(func(ctx context.Context) {
8484
ginkgo.By("checking if the resource is allocatable")
85-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second); err != nil {
85+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second, utils.WaitForPositiveResource); err != nil {
8686
framework.Failf("unable to wait for nodes to have positive allocatable epc resource: %v", err)
8787
}
88-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second); err != nil {
88+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second, utils.WaitForPositiveResource); err != nil {
8989
framework.Failf("unable to wait for nodes to have positive allocatable enclave resource: %v", err)
9090
}
91-
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second); err != nil {
91+
if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second, utils.WaitForPositiveResource); err != nil {
9292
framework.Failf("unable to wait for nodes to have positive allocatable provision resource: %v", err)
9393
}
9494
})

0 commit comments

Comments (0)