@@ -343,7 +343,6 @@ func (r *InstaSliceDaemonsetReconciler) getAllocation(ctx context.Context, insta
343
343
if v .Processed == "no" {
344
344
deviceForMig = k
345
345
profileName = v .Profile
346
- fmt .Printf ("obtained profile is %v\n " , profileName )
347
346
Giprofileid = v .Giprofileid
348
347
Ciprofileid = v .CIProfileID
349
348
CiEngProfileid = v .CIEngProfileID
@@ -519,13 +518,16 @@ func (r *InstaSliceDaemonsetReconciler) delayUngating() {
519
518
520
519
// discoverMigEnabledGpuWithSlices discovers MIG devices as the plugin comes up. It is run exactly once.
521
520
func (r * InstaSliceDaemonsetReconciler ) discoverMigEnabledGpuWithSlices () ([]string , error ) {
522
- //TODO: merge two for loops
523
521
instaslice , _ , gpuModelMap , failed , returnValue , errorDiscoveringProfiles := r .discoverAvailableProfilesOnGpus ()
524
522
if failed {
525
523
return returnValue , errorDiscoveringProfiles
526
524
}
527
525
528
- r .discoverDanglingSlices (instaslice )
526
+ err := r .discoverDanglingSlices (instaslice )
527
+
528
+ if err != nil {
529
+ return nil , err
530
+ }
529
531
530
532
// The node name is read from the NODE_NAME environment variable
531
533
nodeName := os .Getenv ("NODE_NAME" )
@@ -536,13 +538,13 @@ func (r *InstaSliceDaemonsetReconciler) discoverMigEnabledGpuWithSlices() ([]str
536
538
customCtx := context .TODO ()
537
539
errToCreate := r .Create (customCtx , instaslice )
538
540
if errToCreate != nil {
539
- fmt . Printf ( "Error creating object %v \n " , errToCreate )
541
+ return nil , errToCreate
540
542
}
541
543
542
544
// Object exists, update its status
543
545
instaslice .Status .Processed = "true"
544
546
if errForStatus := r .Status ().Update (customCtx , instaslice ); errForStatus != nil {
545
- fmt . Printf ( "Error adding status %v \n " , errForStatus )
547
+ return nil , errForStatus
546
548
}
547
549
548
550
return discoveredGpusOnHost , nil
@@ -552,19 +554,19 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
552
554
instaslice := & inferencev1.Instaslice {}
553
555
ret := nvml .Init ()
554
556
if ret != nvml .SUCCESS {
555
- fmt . Printf ( "Unable to initialize NVML: %v \n " , nvml . ErrorString ( ret ))
557
+ return nil , ret , nil , false , nil , ret
556
558
}
557
559
558
560
count , ret := nvml .DeviceGetCount ()
559
561
if ret != nvml .SUCCESS {
560
- fmt . Printf ( "Unable to get device count: %v \n " , nvml . ErrorString ( ret ))
562
+ return nil , ret , nil , false , nil , ret
561
563
}
562
564
gpuModelMap := make (map [string ]string )
563
565
discoverProfilePerNode := true
564
566
for i := 0 ; i < count ; i ++ {
565
567
device , ret := nvml .DeviceGetHandleByIndex (i )
566
568
if ret != nvml .SUCCESS {
567
- fmt . Printf ( "Unable to get device at index %d: %v \n " , i , nvml . ErrorString ( ret ))
569
+ return nil , ret , nil , false , nil , ret
568
570
}
569
571
570
572
uuid , _ := device .GetUUID ()
@@ -582,12 +584,12 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
582
584
continue
583
585
}
584
586
if ret != nvml .SUCCESS {
585
- fmt . Printf ( "error retrieving GpuInstanceProfileInfo for profile %d on GPU %v" , i , uuid )
587
+ return nil , ret , nil , false , nil , ret
586
588
}
587
589
588
590
memory , ret := device .GetMemoryInfo ()
589
591
if ret != nvml .SUCCESS {
590
- fmt . Printf ( "error getting memory info for device %v: %v" , uuid , ret )
592
+ return nil , ret , nil , false , nil , ret
591
593
}
592
594
593
595
profile := NewMigProfile (i , i , nvml .COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED , giProfileInfo .SliceCount , giProfileInfo .SliceCount , giProfileInfo .MemorySizeMB , memory .Total )
@@ -600,7 +602,7 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
600
602
continue
601
603
}
602
604
if ret != nvml .SUCCESS {
603
- return nil , 0 , nil , true , nil , fmt . Errorf ( "error retrieving GpuInstancePossiblePlacements for profile %d on GPU %v" , i , uuid )
605
+ return nil , 0 , nil , true , nil , ret
604
606
}
605
607
placementsForProfile := []inferencev1.Placement {}
606
608
for _ , p := range giPossiblePlacements {
@@ -626,72 +628,72 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
626
628
return instaslice , ret , gpuModelMap , false , nil , nil
627
629
}
628
630
629
- func (* InstaSliceDaemonsetReconciler ) discoverDanglingSlices (instaslice * inferencev1.Instaslice ) {
631
+ func (* InstaSliceDaemonsetReconciler ) discoverDanglingSlices (instaslice * inferencev1.Instaslice ) error {
630
632
h := & deviceHandler {}
631
633
h .nvml = nvml .New ()
632
634
h .nvdevice = nvdevice .New (nvdevice .WithNvml (h .nvml ))
633
635
634
- ret1 := h .nvml .Init ()
635
- if ret1 != nvml .SUCCESS {
636
- fmt . Printf ( "Unable to initialize NVML: %v" , nvml . ErrorString ( ret1 ))
636
+ errInitNvml := h .nvml .Init ()
637
+ if errInitNvml != nvml .SUCCESS {
638
+ return errInitNvml
637
639
}
638
640
639
- availableGpusOnNode , ret1 := h .nvml .DeviceGetCount ()
640
- if ret1 != nvml .SUCCESS {
641
- fmt . Printf ( "Unable to get device count: %v" , nvml . ErrorString ( ret1 ))
641
+ availableGpusOnNode , errObtainingDeviceCount := h .nvml .DeviceGetCount ()
642
+ if errObtainingDeviceCount != nvml .SUCCESS {
643
+ return errObtainingDeviceCount
642
644
}
643
645
644
646
for i := 0 ; i < availableGpusOnNode ; i ++ {
645
- device , ret := h .nvml .DeviceGetHandleByIndex (i )
646
- if ret != nvml .SUCCESS {
647
- fmt . Printf ( "Unable to get device at index %d: %v \n " , i , nvml . ErrorString ( ret ))
647
+ device , errObtainingDeviceHandle := h .nvml .DeviceGetHandleByIndex (i )
648
+ if errObtainingDeviceHandle != nvml .SUCCESS {
649
+ return errObtainingDeviceHandle
648
650
}
649
651
650
- uuid , ret := device .GetUUID ()
651
- if ret != nvml .SUCCESS {
652
- fmt . Printf ( "Unable to get uuid of device at index %d: %v \n " , i , nvml . ErrorString ( ret ))
652
+ uuid , errObtainingDeviceUUID := device .GetUUID ()
653
+ if errObtainingDeviceUUID != nvml .SUCCESS {
654
+ return errObtainingDeviceUUID
653
655
}
654
656
655
- nvlibParentDevice , err := h .nvdevice .NewDevice (device )
656
- if err != nil {
657
- fmt . Printf ( "unable to get nvlib GPU parent device for MIG UUID '%v': %v" , uuid , ret )
657
+ nvlibParentDevice , errObtainingParentDevice := h .nvdevice .NewDevice (device )
658
+ if errObtainingParentDevice != nil {
659
+ return errObtainingParentDevice
658
660
}
659
- migs , err := nvlibParentDevice .GetMigDevices ()
660
- if err != nil {
661
- fmt . Printf ( "unable to get MIG devices on GPU '%v': %v" , uuid , err )
661
+ migs , errRetrievingMigDevices := nvlibParentDevice .GetMigDevices ()
662
+ if errRetrievingMigDevices != nil {
663
+ return errRetrievingMigDevices
662
664
}
663
665
664
666
for _ , mig := range migs {
665
667
migUUID , _ := mig .GetUUID ()
666
668
profile , errForProfile := mig .GetProfile ()
667
669
if errForProfile != nil {
668
- fmt . Printf ( "error getting profile in mig loop: %v" , errForProfile )
670
+ return errForProfile
669
671
}
670
672
671
- giID , ret := mig .GetGpuInstanceId ()
672
- if ret != nvml .SUCCESS {
673
- fmt . Printf ( "error getting GPU instance ID for MIG device: %v" , ret )
673
+ giID , errForMigGid := mig .GetGpuInstanceId ()
674
+ if errForMigGid != nvml .SUCCESS {
675
+ return errForMigGid
674
676
}
675
- gpuInstance , err1 := device .GetGpuInstanceById (giID )
676
- if err1 != nvml .SUCCESS {
677
- fmt . Printf ( "err1 %v \n " , err1 )
677
+ gpuInstance , errRetrievingDeviceGid := device .GetGpuInstanceById (giID )
678
+ if errRetrievingDeviceGid != nvml .SUCCESS {
679
+ return errRetrievingDeviceGid
678
680
}
679
- gpuInstanceInfo , err2 := gpuInstance .GetInfo ()
680
- if err2 != nvml .SUCCESS {
681
- fmt . Printf ( "err2 %v \n " , err2 )
681
+ gpuInstanceInfo , errObtainingInfo := gpuInstance .GetInfo ()
682
+ if errObtainingInfo != nvml .SUCCESS {
683
+ return errObtainingInfo
682
684
}
683
685
684
686
ciID , ret := mig .GetComputeInstanceId ()
685
687
if ret != nvml .SUCCESS {
686
- fmt . Printf ( "error getting Compute instance ID for MIG device: %v" , ret )
688
+ return ret
687
689
}
688
690
ci , ret := gpuInstance .GetComputeInstanceById (ciID )
689
691
if ret != nvml .SUCCESS {
690
- fmt . Printf ( "error getting Compute instance for '%v': %v" , ciID , ret )
692
+ return ret
691
693
}
692
694
ciInfo , ret := ci .GetInfo ()
693
695
if ret != nvml .SUCCESS {
694
- fmt . Printf ( "error getting Compute instance info for '%v': %v" , ciID , ret )
696
+ return ret
695
697
}
696
698
prepared := inferencev1.PreparedDetails {
697
699
Profile : profile .GetInfo ().String (),
@@ -707,6 +709,7 @@ func (*InstaSliceDaemonsetReconciler) discoverDanglingSlices(instaslice *inferen
707
709
instaslice .Spec .Prepared [migUUID ] = prepared
708
710
}
709
711
}
712
+ return nil
710
713
}
711
714
712
715
// NewMigProfile constructs a new MigProfile struct using info from the giProfiles and ciProfiles used to create it.
0 commit comments