@@ -37,51 +37,112 @@ import (
37
37
38
38
// Repository-relative locations of the kustomization files used by this suite
// (each is resolved through utils.LocateRepoFile before use), plus fixed names
// used by the test workloads.
const (
	// Base GPU plugin deployment.
	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
	// GPU plugin overlay used by the [Deployment:monitoring] spec.
	monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
	// GPU plugin overlay used by the [Deployment:resourceManager] spec.
	rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
	// NFD node feature rules applied before the resourceManager overlay.
	nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"

	containerName = "testcontainer"

	// Tensorflow workload deployment and the name of its pod.
	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
	tfPodName           = "training-pod"
)
44
47
45
48
func init () {
46
- ginkgo .Describe ("GPU plugin [Device:gpu]" , describe )
49
+ // This needs to be Ordered because only one GPU plugin can function on the node at once.
50
+ ginkgo .Describe ("GPU plugin [Device:gpu]" , describe , ginkgo .Ordered )
51
+ }
52
+
53
+ func createPluginAndVerifyExistence (f * framework.Framework , ctx context.Context , kustomizationPath , baseResource string ) {
54
+ ginkgo .By ("deploying GPU plugin" )
55
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (kustomizationPath ))
56
+
57
+ ginkgo .By ("waiting for GPU plugin's availability" )
58
+ podList , err := e2epod .WaitForPodsWithLabelRunningReady (ctx , f .ClientSet , f .Namespace .Name ,
59
+ labels.Set {"app" : "intel-gpu-plugin" }.AsSelector (), 1 /* one replica */ , 100 * time .Second )
60
+ if err != nil {
61
+ e2edebug .DumpAllNamespaceInfo (ctx , f .ClientSet , f .Namespace .Name )
62
+ e2ekubectl .LogFailedContainers (ctx , f .ClientSet , f .Namespace .Name , framework .Logf )
63
+ framework .Failf ("unable to wait for all pods to be running and ready: %v" , err )
64
+ }
65
+
66
+ ginkgo .By ("checking GPU plugin's securityContext" )
67
+ if err = utils .TestPodsFileSystemInfo (podList .Items ); err != nil {
68
+ framework .Failf ("container filesystem info checks failed: %v" , err )
69
+ }
70
+
71
+ ginkgo .By ("checking if the resource is allocatable" )
72
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , v1 .ResourceName (baseResource ), 30 * time .Second , utils .WaitForPositiveResource ); err != nil {
73
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
74
+ }
47
75
}
48
76
49
77
func describe () {
50
78
f := framework .NewDefaultFramework ("gpuplugin" )
51
79
f .NamespacePodSecurityEnforceLevel = admissionapi .LevelPrivileged
52
80
53
- kustomizationPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
81
+ vanillaPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
54
82
if errFailedToLocateRepoFile != nil {
55
83
framework .Failf ("unable to locate %q: %v" , kustomizationYaml , errFailedToLocateRepoFile )
56
84
}
57
85
58
- ginkgo . BeforeEach ( func ( ctx context. Context ) {
59
- ginkgo . By ( "deploying GPU plugin" )
60
- e2ekubectl . RunKubectlOrDie ( f . Namespace . Name , "apply" , "-k" , filepath . Dir ( kustomizationPath ) )
61
-
62
- ginkgo . By ( "waiting for GPU plugin's availability" )
63
- podList , err := e2epod . WaitForPodsWithLabelRunningReady ( ctx , f . ClientSet , f . Namespace . Name ,
64
- labels. Set { "app" : "intel-gpu-plugin" }. AsSelector (), 1 /* one replica */ , 100 * time . Second )
65
- if err != nil {
66
- e2edebug . DumpAllNamespaceInfo ( ctx , f . ClientSet , f . Namespace . Name )
67
- e2ekubectl . LogFailedContainers ( ctx , f . ClientSet , f . Namespace . Name , framework . Logf )
68
- framework . Failf ( "unable to wait for all pods to be running and ready: %v" , err )
69
- }
70
-
71
- ginkgo . By ( "checking GPU plugin's securityContext" )
72
- if err = utils . TestPodsFileSystemInfo ( podList . Items ); err != nil {
73
- framework . Failf ( "container filesystem info checks failed: %v " , err )
74
- }
75
- } )
86
+ monitoringPath , errFailedToLocateRepoFile := utils . LocateRepoFile ( monitoringYaml )
87
+ if errFailedToLocateRepoFile != nil {
88
+ framework . Failf ( "unable to locate %q: %v" , monitoringYaml , errFailedToLocateRepoFile )
89
+ }
90
+
91
+ nfdRulesPath , errFailedToLocateRepoFile := utils . LocateRepoFile ( nfdRulesYaml )
92
+ if errFailedToLocateRepoFile != nil {
93
+ framework . Failf ( "unable to locate %q: %v" , nfdRulesYaml , errFailedToLocateRepoFile )
94
+ }
95
+
96
+ resourceManagerPath , errFailedToLocateRepoFile := utils . LocateRepoFile ( rmEnabledYaml )
97
+ if errFailedToLocateRepoFile != nil {
98
+ framework . Failf ( "unable to locate %q: %v" , rmEnabledYaml , errFailedToLocateRepoFile )
99
+ }
100
+
101
+ ginkgo . Context ( "When GPU plugin is deployed [Resource:i915] " , func () {
102
+ ginkgo . AfterEach ( func ( ctx context. Context ) {
103
+ framework . Logf ( "Removing gpu-plugin manually" )
76
104
77
- ginkgo .Context ("When GPU resources are available [Resource:i915]" , func () {
78
- ginkgo .BeforeEach (func (ctx context.Context ) {
79
- ginkgo .By ("checking if the resource is allocatable" )
80
- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second ); err != nil {
81
- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
105
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "delete" , "-k" , filepath .Dir (vanillaPath ))
106
+
107
+ framework .Logf ("Waiting for i915 resources to go to zero" )
108
+
109
+ // Wait for resources to go to zero
110
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second , utils .WaitForZeroResource ); err != nil {
111
+ framework .Failf ("unable to wait for nodes to have no resources: %v" , err )
82
112
}
83
113
})
114
+
84
115
ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
116
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
117
+
118
+ podListFunc := framework .ListObjects (f .ClientSet .CoreV1 ().Pods (f .Namespace .Name ).List , metav1.ListOptions {})
119
+
120
+ pods , err := podListFunc (ctx )
121
+ if err != nil {
122
+ framework .Failf ("Couldn't list pods: %+v" , err )
123
+ }
124
+
125
+ if len (pods .Items ) != 1 {
126
+ framework .Failf ("Invalid amount of Pods listed %d" , len (pods .Items ))
127
+ }
128
+
129
+ pluginPod := pods .Items [0 ]
130
+
131
+ ginkgo .By ("checking if CDI path is included in volumes" )
132
+ found := false
133
+ for _ , v := range pluginPod .Spec .Volumes {
134
+ if v .HostPath != nil && v .HostPath .Path == "/var/run/cdi" {
135
+ framework .Logf ("CDI volume found" )
136
+ found = true
137
+
138
+ break
139
+ }
140
+ }
141
+
142
+ if ! found {
143
+ framework .Fail ("Couldn't find CDI volume in GPU plugin deployment" )
144
+ }
145
+
85
146
ginkgo .By ("submitting a pod requesting GPU resources" )
86
147
podSpec := & v1.Pod {
87
148
ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
@@ -122,7 +183,41 @@ func describe() {
122
183
framework .Logf ("found card and renderD from the log" )
123
184
})
124
185
186
+ ginkgo .Context ("When [Deployment:monitoring] deployment is applied [Resource:i915]" , func () {
187
+ ginkgo .It ("check if monitoring resource is available" , func (ctx context.Context ) {
188
+ createPluginAndVerifyExistence (f , ctx , monitoringPath , "gpu.intel.com/i915" )
189
+
190
+ ginkgo .By ("checking if the monitoring resource is allocatable" )
191
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915_monitoring" , 30 * time .Second , utils .WaitForPositiveResource ); err != nil {
192
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
193
+ }
194
+ })
195
+ })
196
+
197
+ ginkgo .Context ("When [Deployment:resourceManager] deployment is applied [Resource:i915]" , func () {
198
+ ginkgo .It ("check if i915 resources is available" , func (ctx context.Context ) {
199
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (nfdRulesPath ))
200
+
201
+ createPluginAndVerifyExistence (f , ctx , resourceManagerPath , "gpu.intel.com/i915" )
202
+
203
+ // To speed up extended resource detection, let's restart NFD worker
204
+ e2ekubectl .RunKubectlOrDie ("node-feature-discovery" , "rollout" , "restart" , "daemonset" , "nfd-worker" )
205
+
206
+ ginkgo .By ("checking if the millicores resource is allocatable" )
207
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/millicores" , 30 * time .Second , utils .WaitForPositiveResource ); err != nil {
208
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
209
+ }
210
+
211
+ ginkgo .By ("checking if the tiles resource is allocatable" )
212
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/tiles" , 30 * time .Second , utils .WaitForPositiveResource ); err != nil {
213
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
214
+ }
215
+ })
216
+ })
217
+
125
218
ginkgo .It ("run a small workload on the GPU [App:tensorflow]" , func (ctx context.Context ) {
219
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
220
+
126
221
kustomYaml , err := utils .LocateRepoFile (tfKustomizationYaml )
127
222
if err != nil {
128
223
framework .Failf ("unable to locate %q: %v" , tfKustomizationYaml , err )
@@ -146,13 +241,9 @@ func describe() {
146
241
})
147
242
148
243
ginkgo .Context ("When GPU resources are available [Resource:xe]" , func () {
149
- ginkgo .BeforeEach (func (ctx context.Context ) {
150
- ginkgo .By ("checking if the resource is allocatable" )
151
- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/xe" , 30 * time .Second ); err != nil {
152
- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
153
- }
154
- })
155
244
ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
245
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/xe" )
246
+
156
247
ginkgo .By ("submitting a pod requesting GPU resources" )
157
248
podSpec := & v1.Pod {
158
249
ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
0 commit comments