@@ -194,31 +194,16 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
194
194
195
195
fmt .Printf ("The profile id is %v with memory size %v \n " , giProfileInfo .Id , giProfileInfo .MemorySizeMB )
196
196
197
- var instasliceList inferencev1.InstasliceList
198
- if err := r .List (ctx , & instasliceList , & client.ListOptions {}); err != nil {
199
- fmt .Printf ("Error listing Instaslice %v" , err )
200
- }
201
- for _ , instaslice := range instasliceList .Items {
202
- // Path to the file containing the node name
203
- nodeName := os .Getenv ("NODE_NAME" )
204
- if instaslice .Name == nodeName {
205
- for _ , v := range instaslice .Spec .Allocations {
206
- if v .Processed == "no" {
207
- placement .Size = v .Size
208
- placement .Start = v .Start
209
- }
210
- }
211
- }
212
- }
213
- gi , retCodeForGiWithPlacement := device .CreateGpuInstanceWithPlacement (& giProfileInfo , & placement )
197
+ // Resolve the placement (size/start) of this node's unprocessed allocation before creating the GPU instance
198
+ updatedPlacement := r .getAllocationsToprepare (ctx , placement )
199
+ gi , retCodeForGiWithPlacement := device .CreateGpuInstanceWithPlacement (& giProfileInfo , & updatedPlacement )
214
200
if retCodeForGiWithPlacement != nvml .SUCCESS {
215
201
fmt .Printf ("error creating GPU instance for '%v': %v \n " , & giProfileInfo , retCodeForGiWithPlacement )
216
202
}
217
203
giInfo , retForGiInfor := gi .GetInfo ()
218
204
if retForGiInfor != nvml .SUCCESS {
219
205
fmt .Printf ("error getting GPU instance info for '%v': %v \n " , & giProfileInfo , retForGiInfor )
220
206
}
221
- giId = giInfo .Id
222
207
//TODO: figure out the compute slice scenario, I think Kubernetes does not support this use case yet
223
208
ciProfileInfo , retCodeForCiProfile := gi .GetComputeInstanceProfileInfo (Ciprofileid , CiEngProfileid )
224
209
if retCodeForCiProfile != nvml .SUCCESS {
@@ -229,49 +214,7 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
229
214
fmt .Printf ("error creating Compute instance for '%v': %v \n " , ci , retCodeForComputeInstance )
230
215
}
231
216
//get created mig details
232
- h := & deviceHandler {}
233
- h .nvml = nvml .New ()
234
- h .nvdevice = nvdevice .New (nvdevice .WithNvml (h .nvml ))
235
-
236
- ret1 := h .nvml .Init ()
237
- if ret1 != nvml .SUCCESS {
238
- fmt .Printf ("Unable to initialize NVML: %v" , nvml .ErrorString (ret ))
239
- }
240
- nvlibParentDevice , err := h .nvdevice .NewDevice (device )
241
- if err != nil {
242
- fmt .Printf ("unable to get nvlib GPU parent device for MIG UUID '%v': %v" , uuid , ret )
243
- }
244
- migs , err := nvlibParentDevice .GetMigDevices ()
245
- if err != nil {
246
- fmt .Printf ("unable to get MIG devices on GPU '%v': %v" , uuid , err )
247
- }
248
- for _ , mig := range migs {
249
- obtainedProfileName , _ := mig .GetProfile ()
250
- fmt .Printf ("obtained profile is %v\n " , obtainedProfileName )
251
- giID , ret := mig .GetGpuInstanceId ()
252
- if ret != nvml .SUCCESS {
253
- fmt .Printf ("error getting GPU instance ID for MIG device: %v" , ret )
254
- }
255
- gpuInstance , err1 := device .GetGpuInstanceById (giID )
256
- if err1 != nvml .SUCCESS {
257
- fmt .Printf ("Unable to get GPU instance %v\n " , err1 )
258
- }
259
- gpuInstanceInfo , err2 := gpuInstance .GetInfo ()
260
- if err2 != nvml .SUCCESS {
261
- fmt .Printf ("Unable to get GPU instance info %v\n " , err2 )
262
- }
263
- fmt .Printf ("The instance info size %v and start %v\n " , gpuInstanceInfo .Placement .Size , gpuInstanceInfo .Placement .Start )
264
-
265
- if profileName == obtainedProfileName .String () {
266
- realizedMig , _ := mig .GetUUID ()
267
- migUUID = realizedMig
268
- migCid , _ := mig .GetComputeInstanceId ()
269
- ci , _ := gpuInstance .GetComputeInstanceById (migCid )
270
- ciMigInfo , _ := ci .GetInfo ()
271
- ciId = ciMigInfo .Id
272
-
273
- }
274
- }
217
+ giId , migUUID , ciId = r .getCreatedSliceDetails (giId , giInfo , ret , device , uuid , profileName , migUUID , ciId )
275
218
//create slice only on one GPU, both CI and GI creation are succeeded.
276
219
if retCodeForCiProfile == retCodeForGi {
277
220
break
@@ -325,6 +268,74 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
325
268
return ctrl.Result {}, nil
326
269
}
327
270
271
+ func (r * InstaSliceDaemonsetReconciler ) getAllocationsToprepare (ctx context.Context , placement nvml.GpuInstancePlacement ) nvml.GpuInstancePlacement {
272
+ var instasliceList inferencev1.InstasliceList
273
+ if err := r .List (ctx , & instasliceList , & client.ListOptions {}); err != nil {
274
+ fmt .Printf ("Error listing Instaslice %v" , err )
275
+ }
276
+ for _ , instaslice := range instasliceList .Items {
277
+
278
+ nodeName := os .Getenv ("NODE_NAME" )
279
+ if instaslice .Name == nodeName {
280
+ for _ , v := range instaslice .Spec .Allocations {
281
+ if v .Processed == "no" {
282
+ placement .Size = v .Size
283
+ placement .Start = v .Start
284
+ }
285
+ }
286
+ }
287
+ }
288
+ return placement
289
+ }
290
+
291
+ func (* InstaSliceDaemonsetReconciler ) getCreatedSliceDetails (giId uint32 , giInfo nvml.GpuInstanceInfo , ret nvml.Return , device nvml.Device , uuid string , profileName string , migUUID string , ciId uint32 ) (uint32 , string , uint32 ) {
292
+ giId = giInfo .Id
293
+ h := & deviceHandler {}
294
+ h .nvml = nvml .New ()
295
+ h .nvdevice = nvdevice .New (nvdevice .WithNvml (h .nvml ))
296
+
297
+ ret1 := h .nvml .Init ()
298
+ if ret1 != nvml .SUCCESS {
299
+ fmt .Printf ("Unable to initialize NVML: %v" , nvml .ErrorString (ret ))
300
+ }
301
+ nvlibParentDevice , err := h .nvdevice .NewDevice (device )
302
+ if err != nil {
303
+ fmt .Printf ("unable to get nvlib GPU parent device for MIG UUID '%v': %v" , uuid , ret )
304
+ }
305
+ migs , err := nvlibParentDevice .GetMigDevices ()
306
+ if err != nil {
307
+ fmt .Printf ("unable to get MIG devices on GPU '%v': %v" , uuid , err )
308
+ }
309
+ for _ , mig := range migs {
310
+ obtainedProfileName , _ := mig .GetProfile ()
311
+ fmt .Printf ("obtained profile is %v\n " , obtainedProfileName )
312
+ giID , ret := mig .GetGpuInstanceId ()
313
+ if ret != nvml .SUCCESS {
314
+ fmt .Printf ("error getting GPU instance ID for MIG device: %v" , ret )
315
+ }
316
+ gpuInstance , err1 := device .GetGpuInstanceById (giID )
317
+ if err1 != nvml .SUCCESS {
318
+ fmt .Printf ("Unable to get GPU instance %v\n " , err1 )
319
+ }
320
+ gpuInstanceInfo , err2 := gpuInstance .GetInfo ()
321
+ if err2 != nvml .SUCCESS {
322
+ fmt .Printf ("Unable to get GPU instance info %v\n " , err2 )
323
+ }
324
+ fmt .Printf ("The instance info size %v and start %v\n " , gpuInstanceInfo .Placement .Size , gpuInstanceInfo .Placement .Start )
325
+
326
+ if profileName == obtainedProfileName .String () {
327
+ realizedMig , _ := mig .GetUUID ()
328
+ migUUID = realizedMig
329
+ migCid , _ := mig .GetComputeInstanceId ()
330
+ ci , _ := gpuInstance .GetComputeInstanceById (migCid )
331
+ ciMigInfo , _ := ci .GetInfo ()
332
+ ciId = ciMigInfo .Id
333
+
334
+ }
335
+ }
336
+ return giId , migUUID , ciId
337
+ }
338
+
328
339
func (r * InstaSliceDaemonsetReconciler ) getAllocation (ctx context.Context , instasliceList inferencev1.InstasliceList , deviceForMig string , profileName string , Giprofileid int , Ciprofileid int , CiEngProfileid int ) (string , string , int , int , int ) {
329
340
if err := r .List (ctx , & instasliceList , & client.ListOptions {}); err != nil {
330
341
fmt .Printf ("Error listing Instaslice %v" , err )
0 commit comments