Skip to content

Commit 9a8de6a

Browse files
committed
return errors
1 parent 02efce3 commit 9a8de6a

File tree

1 file changed

+46
-43
lines changed

1 file changed

+46
-43
lines changed

internal/controller/daemonset_controller.go

Lines changed: 46 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,6 @@ func (r *InstaSliceDaemonsetReconciler) getAllocation(ctx context.Context, insta
343343
if v.Processed == "no" {
344344
deviceForMig = k
345345
profileName = v.Profile
346-
fmt.Printf("obtained profile is %v\n", profileName)
347346
Giprofileid = v.Giprofileid
348347
Ciprofileid = v.CIProfileID
349348
CiEngProfileid = v.CIEngProfileID
@@ -519,13 +518,16 @@ func (r *InstaSliceDaemonsetReconciler) delayUngating() {
519518

520519
// discoverMigEnabledGpuWithSlices discovers MIG devices as the plugin comes up. It is run exactly once.
521520
func (r *InstaSliceDaemonsetReconciler) discoverMigEnabledGpuWithSlices() ([]string, error) {
522-
//TODO: merge two for loops
523521
instaslice, _, gpuModelMap, failed, returnValue, errorDiscoveringProfiles := r.discoverAvailableProfilesOnGpus()
524522
if failed {
525523
return returnValue, errorDiscoveringProfiles
526524
}
527525

528-
r.discoverDanglingSlices(instaslice)
526+
err := r.discoverDanglingSlices(instaslice)
527+
528+
if err != nil {
529+
return nil, err
530+
}
529531

530532
// Name of the node, read from the NODE_NAME environment variable
531533
nodeName := os.Getenv("NODE_NAME")
@@ -536,13 +538,13 @@ func (r *InstaSliceDaemonsetReconciler) discoverMigEnabledGpuWithSlices() ([]str
536538
customCtx := context.TODO()
537539
errToCreate := r.Create(customCtx, instaslice)
538540
if errToCreate != nil {
539-
fmt.Printf("Error creating object %v\n", errToCreate)
541+
return nil, errToCreate
540542
}
541543

542544
// Object exists, update its status
543545
instaslice.Status.Processed = "true"
544546
if errForStatus := r.Status().Update(customCtx, instaslice); errForStatus != nil {
545-
fmt.Printf("Error adding status %v\n", errForStatus)
547+
return nil, errForStatus
546548
}
547549

548550
return discoveredGpusOnHost, nil
@@ -552,19 +554,19 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
552554
instaslice := &inferencev1.Instaslice{}
553555
ret := nvml.Init()
554556
if ret != nvml.SUCCESS {
555-
fmt.Printf("Unable to initialize NVML: %v \n", nvml.ErrorString(ret))
557+
return nil, ret, nil, false, nil, ret
556558
}
557559

558560
count, ret := nvml.DeviceGetCount()
559561
if ret != nvml.SUCCESS {
560-
fmt.Printf("Unable to get device count: %v \n", nvml.ErrorString(ret))
562+
return nil, ret, nil, false, nil, ret
561563
}
562564
gpuModelMap := make(map[string]string)
563565
discoverProfilePerNode := true
564566
for i := 0; i < count; i++ {
565567
device, ret := nvml.DeviceGetHandleByIndex(i)
566568
if ret != nvml.SUCCESS {
567-
fmt.Printf("Unable to get device at index %d: %v \n", i, nvml.ErrorString(ret))
569+
return nil, ret, nil, false, nil, ret
568570
}
569571

570572
uuid, _ := device.GetUUID()
@@ -582,12 +584,12 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
582584
continue
583585
}
584586
if ret != nvml.SUCCESS {
585-
fmt.Printf("error retrieving GpuInstanceProfileInfo for profile %d on GPU %v", i, uuid)
587+
return nil, ret, nil, false, nil, ret
586588
}
587589

588590
memory, ret := device.GetMemoryInfo()
589591
if ret != nvml.SUCCESS {
590-
fmt.Printf("error getting memory info for device %v: %v", uuid, ret)
592+
return nil, ret, nil, false, nil, ret
591593
}
592594

593595
profile := NewMigProfile(i, i, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, giProfileInfo.SliceCount, giProfileInfo.SliceCount, giProfileInfo.MemorySizeMB, memory.Total)
@@ -600,7 +602,7 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
600602
continue
601603
}
602604
if ret != nvml.SUCCESS {
603-
return nil, 0, nil, true, nil, fmt.Errorf("error retrieving GpuInstancePossiblePlacements for profile %d on GPU %v", i, uuid)
605+
return nil, 0, nil, true, nil, ret
604606
}
605607
placementsForProfile := []inferencev1.Placement{}
606608
for _, p := range giPossiblePlacements {
@@ -626,72 +628,72 @@ func (*InstaSliceDaemonsetReconciler) discoverAvailableProfilesOnGpus() (*infere
626628
return instaslice, ret, gpuModelMap, false, nil, nil
627629
}
628630

629-
func (*InstaSliceDaemonsetReconciler) discoverDanglingSlices(instaslice *inferencev1.Instaslice) {
631+
func (*InstaSliceDaemonsetReconciler) discoverDanglingSlices(instaslice *inferencev1.Instaslice) error {
630632
h := &deviceHandler{}
631633
h.nvml = nvml.New()
632634
h.nvdevice = nvdevice.New(nvdevice.WithNvml(h.nvml))
633635

634-
ret1 := h.nvml.Init()
635-
if ret1 != nvml.SUCCESS {
636-
fmt.Printf("Unable to initialize NVML: %v", nvml.ErrorString(ret1))
636+
errInitNvml := h.nvml.Init()
637+
if errInitNvml != nvml.SUCCESS {
638+
return errInitNvml
637639
}
638640

639-
availableGpusOnNode, ret1 := h.nvml.DeviceGetCount()
640-
if ret1 != nvml.SUCCESS {
641-
fmt.Printf("Unable to get device count: %v", nvml.ErrorString(ret1))
641+
availableGpusOnNode, errObtainingDeviceCount := h.nvml.DeviceGetCount()
642+
if errObtainingDeviceCount != nvml.SUCCESS {
643+
return errObtainingDeviceCount
642644
}
643645

644646
for i := 0; i < availableGpusOnNode; i++ {
645-
device, ret := h.nvml.DeviceGetHandleByIndex(i)
646-
if ret != nvml.SUCCESS {
647-
fmt.Printf("Unable to get device at index %d: %v \n", i, nvml.ErrorString(ret))
647+
device, errObtainingDeviceHandle := h.nvml.DeviceGetHandleByIndex(i)
648+
if errObtainingDeviceHandle != nvml.SUCCESS {
649+
return errObtainingDeviceHandle
648650
}
649651

650-
uuid, ret := device.GetUUID()
651-
if ret != nvml.SUCCESS {
652-
fmt.Printf("Unable to get uuid of device at index %d: %v \n", i, nvml.ErrorString(ret))
652+
uuid, errObtainingDeviceUUID := device.GetUUID()
653+
if errObtainingDeviceUUID != nvml.SUCCESS {
654+
return errObtainingDeviceUUID
653655
}
654656

655-
nvlibParentDevice, err := h.nvdevice.NewDevice(device)
656-
if err != nil {
657-
fmt.Printf("unable to get nvlib GPU parent device for MIG UUID '%v': %v", uuid, ret)
657+
nvlibParentDevice, errObtainingParentDevice := h.nvdevice.NewDevice(device)
658+
if errObtainingParentDevice != nil {
659+
return errObtainingParentDevice
658660
}
659-
migs, err := nvlibParentDevice.GetMigDevices()
660-
if err != nil {
661-
fmt.Printf("unable to get MIG devices on GPU '%v': %v", uuid, err)
661+
migs, errRetrievingMigDevices := nvlibParentDevice.GetMigDevices()
662+
if errRetrievingMigDevices != nil {
663+
return errRetrievingMigDevices
662664
}
663665

664666
for _, mig := range migs {
665667
migUUID, _ := mig.GetUUID()
666668
profile, errForProfile := mig.GetProfile()
667669
if errForProfile != nil {
668-
fmt.Printf("error getting profile in mig loop: %v", errForProfile)
670+
return errForProfile
669671
}
670672

671-
giID, ret := mig.GetGpuInstanceId()
672-
if ret != nvml.SUCCESS {
673-
fmt.Printf("error getting GPU instance ID for MIG device: %v", ret)
673+
giID, errForMigGid := mig.GetGpuInstanceId()
674+
if errForMigGid != nvml.SUCCESS {
675+
return errForMigGid
674676
}
675-
gpuInstance, err1 := device.GetGpuInstanceById(giID)
676-
if err1 != nvml.SUCCESS {
677-
fmt.Printf("err1 %v\n", err1)
677+
gpuInstance, errRetrievingDeviceGid := device.GetGpuInstanceById(giID)
678+
if errRetrievingDeviceGid != nvml.SUCCESS {
679+
return errRetrievingDeviceGid
678680
}
679-
gpuInstanceInfo, err2 := gpuInstance.GetInfo()
680-
if err2 != nvml.SUCCESS {
681-
fmt.Printf("err2 %v\n", err2)
681+
gpuInstanceInfo, errObtainingInfo := gpuInstance.GetInfo()
682+
if errObtainingInfo != nvml.SUCCESS {
683+
return errObtainingInfo
682684
}
683685

684686
ciID, ret := mig.GetComputeInstanceId()
685687
if ret != nvml.SUCCESS {
686-
fmt.Printf("error getting Compute instance ID for MIG device: %v", ret)
688+
return ret
687689
}
688690
ci, ret := gpuInstance.GetComputeInstanceById(ciID)
689691
if ret != nvml.SUCCESS {
690-
fmt.Printf("error getting Compute instance for '%v': %v", ciID, ret)
692+
return ret
691693
}
692694
ciInfo, ret := ci.GetInfo()
693695
if ret != nvml.SUCCESS {
694-
fmt.Printf("error getting Compute instance info for '%v': %v", ciID, ret)
696+
return ret
695697
}
696698
prepared := inferencev1.PreparedDetails{
697699
Profile: profile.GetInfo().String(),
@@ -707,6 +709,7 @@ func (*InstaSliceDaemonsetReconciler) discoverDanglingSlices(instaslice *inferen
707709
instaslice.Spec.Prepared[migUUID] = prepared
708710
}
709711
}
712+
return nil
710713
}
711714

712715
// NewMigProfile constructs a new MigProfile struct using info from the giProfiles and ciProfiles used to create it.

0 commit comments

Comments
 (0)