Skip to content

Commit a191465

Browse files
committed
code refactor for profile
1 parent f361036 commit a191465

File tree

2 files changed

+156
-61
lines changed

2 files changed

+156
-61
lines changed

internal/controller/daemonset_controller.go

Lines changed: 72 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -194,31 +194,16 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
194194

195195
fmt.Printf("The profile id is %v with memory size %v \n", giProfileInfo.Id, giProfileInfo.MemorySizeMB)
196196

197-
var instasliceList inferencev1.InstasliceList
198-
if err := r.List(ctx, &instasliceList, &client.ListOptions{}); err != nil {
199-
fmt.Printf("Error listing Instaslice %v", err)
200-
}
201-
for _, instaslice := range instasliceList.Items {
202-
// Path to the file containing the node name
203-
nodeName := os.Getenv("NODE_NAME")
204-
if instaslice.Name == nodeName {
205-
for _, v := range instaslice.Spec.Allocations {
206-
if v.Processed == "no" {
207-
placement.Size = v.Size
208-
placement.Start = v.Start
209-
}
210-
}
211-
}
212-
}
213-
gi, retCodeForGiWithPlacement := device.CreateGpuInstanceWithPlacement(&giProfileInfo, &placement)
197+
// Take the placement (size/start) from this node's pending, unprocessed allocation
198+
updatedPlacement := r.getAllocationsToprepare(ctx, placement)
199+
gi, retCodeForGiWithPlacement := device.CreateGpuInstanceWithPlacement(&giProfileInfo, &updatedPlacement)
214200
if retCodeForGiWithPlacement != nvml.SUCCESS {
215201
fmt.Printf("error creating GPU instance for '%v': %v \n ", &giProfileInfo, retCodeForGiWithPlacement)
216202
}
217203
giInfo, retForGiInfor := gi.GetInfo()
218204
if retForGiInfor != nvml.SUCCESS {
219205
fmt.Printf("error getting GPU instance info for '%v': %v \n", &giProfileInfo, retForGiInfor)
220206
}
221-
giId = giInfo.Id
222207
//TODO: figure out the compute slice scenario, I think Kubernetes does not support this use case yet
223208
ciProfileInfo, retCodeForCiProfile := gi.GetComputeInstanceProfileInfo(Ciprofileid, CiEngProfileid)
224209
if retCodeForCiProfile != nvml.SUCCESS {
@@ -229,49 +214,7 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
229214
fmt.Printf("error creating Compute instance for '%v': %v \n", ci, retCodeForComputeInstance)
230215
}
231216
//get created mig details
232-
h := &deviceHandler{}
233-
h.nvml = nvml.New()
234-
h.nvdevice = nvdevice.New(nvdevice.WithNvml(h.nvml))
235-
236-
ret1 := h.nvml.Init()
237-
if ret1 != nvml.SUCCESS {
238-
fmt.Printf("Unable to initialize NVML: %v", nvml.ErrorString(ret))
239-
}
240-
nvlibParentDevice, err := h.nvdevice.NewDevice(device)
241-
if err != nil {
242-
fmt.Printf("unable to get nvlib GPU parent device for MIG UUID '%v': %v", uuid, ret)
243-
}
244-
migs, err := nvlibParentDevice.GetMigDevices()
245-
if err != nil {
246-
fmt.Printf("unable to get MIG devices on GPU '%v': %v", uuid, err)
247-
}
248-
for _, mig := range migs {
249-
obtainedProfileName, _ := mig.GetProfile()
250-
fmt.Printf("obtained profile is %v\n", obtainedProfileName)
251-
giID, ret := mig.GetGpuInstanceId()
252-
if ret != nvml.SUCCESS {
253-
fmt.Printf("error getting GPU instance ID for MIG device: %v", ret)
254-
}
255-
gpuInstance, err1 := device.GetGpuInstanceById(giID)
256-
if err1 != nvml.SUCCESS {
257-
fmt.Printf("Unable to get GPU instance %v\n", err1)
258-
}
259-
gpuInstanceInfo, err2 := gpuInstance.GetInfo()
260-
if err2 != nvml.SUCCESS {
261-
fmt.Printf("Unable to get GPU instance info %v\n", err2)
262-
}
263-
fmt.Printf("The instance info size %v and start %v\n", gpuInstanceInfo.Placement.Size, gpuInstanceInfo.Placement.Start)
264-
265-
if profileName == obtainedProfileName.String() {
266-
realizedMig, _ := mig.GetUUID()
267-
migUUID = realizedMig
268-
migCid, _ := mig.GetComputeInstanceId()
269-
ci, _ := gpuInstance.GetComputeInstanceById(migCid)
270-
ciMigInfo, _ := ci.GetInfo()
271-
ciId = ciMigInfo.Id
272-
273-
}
274-
}
217+
giId, migUUID, ciId = r.getCreatedSliceDetails(giId, giInfo, ret, device, uuid, profileName, migUUID, ciId)
275218
//create slice only on one GPU, both CI and GI creation are succeeded.
276219
if retCodeForCiProfile == retCodeForGi {
277220
break
@@ -325,6 +268,74 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
325268
return ctrl.Result{}, nil
326269
}
327270

271+
func (r *InstaSliceDaemonsetReconciler) getAllocationsToprepare(ctx context.Context, placement nvml.GpuInstancePlacement) nvml.GpuInstancePlacement {
272+
var instasliceList inferencev1.InstasliceList
273+
if err := r.List(ctx, &instasliceList, &client.ListOptions{}); err != nil {
274+
fmt.Printf("Error listing Instaslice %v", err)
275+
}
276+
for _, instaslice := range instasliceList.Items {
277+
278+
nodeName := os.Getenv("NODE_NAME")
279+
if instaslice.Name == nodeName {
280+
for _, v := range instaslice.Spec.Allocations {
281+
if v.Processed == "no" {
282+
placement.Size = v.Size
283+
placement.Start = v.Start
284+
}
285+
}
286+
}
287+
}
288+
return placement
289+
}
290+
291+
func (*InstaSliceDaemonsetReconciler) getCreatedSliceDetails(giId uint32, giInfo nvml.GpuInstanceInfo, ret nvml.Return, device nvml.Device, uuid string, profileName string, migUUID string, ciId uint32) (uint32, string, uint32) {
292+
giId = giInfo.Id
293+
h := &deviceHandler{}
294+
h.nvml = nvml.New()
295+
h.nvdevice = nvdevice.New(nvdevice.WithNvml(h.nvml))
296+
297+
ret1 := h.nvml.Init()
298+
if ret1 != nvml.SUCCESS {
299+
fmt.Printf("Unable to initialize NVML: %v", nvml.ErrorString(ret))
300+
}
301+
nvlibParentDevice, err := h.nvdevice.NewDevice(device)
302+
if err != nil {
303+
fmt.Printf("unable to get nvlib GPU parent device for MIG UUID '%v': %v", uuid, ret)
304+
}
305+
migs, err := nvlibParentDevice.GetMigDevices()
306+
if err != nil {
307+
fmt.Printf("unable to get MIG devices on GPU '%v': %v", uuid, err)
308+
}
309+
for _, mig := range migs {
310+
obtainedProfileName, _ := mig.GetProfile()
311+
fmt.Printf("obtained profile is %v\n", obtainedProfileName)
312+
giID, ret := mig.GetGpuInstanceId()
313+
if ret != nvml.SUCCESS {
314+
fmt.Printf("error getting GPU instance ID for MIG device: %v", ret)
315+
}
316+
gpuInstance, err1 := device.GetGpuInstanceById(giID)
317+
if err1 != nvml.SUCCESS {
318+
fmt.Printf("Unable to get GPU instance %v\n", err1)
319+
}
320+
gpuInstanceInfo, err2 := gpuInstance.GetInfo()
321+
if err2 != nvml.SUCCESS {
322+
fmt.Printf("Unable to get GPU instance info %v\n", err2)
323+
}
324+
fmt.Printf("The instance info size %v and start %v\n", gpuInstanceInfo.Placement.Size, gpuInstanceInfo.Placement.Start)
325+
326+
if profileName == obtainedProfileName.String() {
327+
realizedMig, _ := mig.GetUUID()
328+
migUUID = realizedMig
329+
migCid, _ := mig.GetComputeInstanceId()
330+
ci, _ := gpuInstance.GetComputeInstanceById(migCid)
331+
ciMigInfo, _ := ci.GetInfo()
332+
ciId = ciMigInfo.Id
333+
334+
}
335+
}
336+
return giId, migUUID, ciId
337+
}
338+
328339
func (r *InstaSliceDaemonsetReconciler) getAllocation(ctx context.Context, instasliceList inferencev1.InstasliceList, deviceForMig string, profileName string, Giprofileid int, Ciprofileid int, CiEngProfileid int) (string, string, int, int, int) {
329340
if err := r.List(ctx, &instasliceList, &client.ListOptions{}); err != nil {
330341
fmt.Printf("Error listing Instaslice %v", err)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controller
18+
19+
import (
20+
"context"
21+
22+
. "github.com/onsi/ginkgo/v2"
23+
. "github.com/onsi/gomega"
24+
"k8s.io/apimachinery/pkg/api/errors"
25+
"k8s.io/apimachinery/pkg/types"
26+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
27+
28+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
30+
inferencev1 "codeflare.dev/instaslice/api/v1"
31+
)
32+
33+
var _ = Describe("Instaslice Daemonset", func() {
34+
Context("When reconciling a resource", func() {
35+
const resourceName = "test-resource"
36+
37+
ctx := context.Background()
38+
39+
typeNamespacedName := types.NamespacedName{
40+
Name: resourceName,
41+
Namespace: "default", // TODO(user):Modify as needed
42+
}
43+
instaslice := &inferencev1.Instaslice{}
44+
45+
BeforeEach(func() {
46+
By("creating the custom resource for the Kind Instaslice")
47+
err := k8sClient.Get(ctx, typeNamespacedName, instaslice)
48+
if err != nil && errors.IsNotFound(err) {
49+
resource := &inferencev1.Instaslice{
50+
ObjectMeta: metav1.ObjectMeta{
51+
Name: resourceName,
52+
Namespace: "default",
53+
},
54+
// TODO(user): Specify other spec details if needed.
55+
}
56+
Expect(k8sClient.Create(ctx, resource)).To(Succeed())
57+
}
58+
})
59+
60+
AfterEach(func() {
61+
// TODO(user): Cleanup logic after each test, like removing the resource instance.
62+
resource := &inferencev1.Instaslice{}
63+
err := k8sClient.Get(ctx, typeNamespacedName, resource)
64+
Expect(err).NotTo(HaveOccurred())
65+
66+
By("Cleanup the specific resource instance Instaslice")
67+
Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
68+
})
69+
It("should successfully reconcile the resource", func() {
70+
By("Reconciling the created resource")
71+
controllerReconciler := &InstasliceReconciler{
72+
Client: k8sClient,
73+
Scheme: k8sClient.Scheme(),
74+
}
75+
76+
_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
77+
NamespacedName: typeNamespacedName,
78+
})
79+
Expect(err).NotTo(HaveOccurred())
80+
// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
81+
// Example: If you expect a certain status condition after reconciliation, verify it here.
82+
})
83+
})
84+
})

0 commit comments

Comments
 (0)