Skip to content

Commit f500555

Browse files
Create kueue resources as part of test execution
1 parent 59a1d23 commit f500555

File tree

6 files changed

+77
-64
lines changed

6 files changed

+77
-64
lines changed

Diff for: go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ require (
1111
github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790
1212
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
1313
github.com/project-codeflare/appwrapper v0.25.0
14-
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e
14+
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0
1515
github.com/ray-project/kuberay/ray-operator v1.1.1
1616
go.uber.org/zap v1.27.0
1717
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8

Diff for: go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
249249
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
250250
github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4 h1:XYjq50WpGxagELHurCXyiirvdM9OzxTnCMcQC9gebnQ=
251251
github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4/go.mod h1:zDALq3/gn+eiczpD7TBZWWbAVuwcCGDFuN/77oh+CDw=
252-
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e h1:juFd1dQyioeMxbVE6F0YD25ozm/jiqJE+MpDhu8p22k=
253-
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
252+
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0 h1:5gfJUhF2GRZIXCUK/aUYTo79Ipo4Ngg9HO8Jgj8zThM=
253+
github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
254254
github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg=
255255
github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
256256
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=

Diff for: test/e2e/mnist_pytorch_appwrapper_test.go

+12-5
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,30 @@ import (
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2929
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3030
"k8s.io/apimachinery/pkg/runtime"
31+
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
3132
)
3233

3334
func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
34-
runMnistPyTorchAppWrapper(t, "cpu")
35+
runMnistPyTorchAppWrapper(t, "cpu", 0)
3536
}
3637

3738
func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
38-
runMnistPyTorchAppWrapper(t, "gpu")
39+
runMnistPyTorchAppWrapper(t, "gpu", 1)
3940
}
4041

4142
// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
42-
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) {
43+
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
4344
test := With(t)
4445

45-
// Create a namespace and localqueue in that namespace
46+
// Create a namespace
4647
namespace := test.NewTestNamespace()
47-
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")
48+
49+
// Create Kueue resources
50+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
51+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
52+
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
53+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
54+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
4855

4956
// Test configuration
5057
config := &corev1.ConfigMap{

Diff for: test/e2e/mnist_rayjob_raycluster_test.go

+62-17
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3434
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3535
"k8s.io/apimachinery/pkg/runtime"
36+
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
3637
)
3738

3839
// Trains the MNIST dataset as a RayJob, executed by a Ray cluster
@@ -49,9 +50,15 @@ func TestMnistRayJobRayClusterGpu(t *testing.T) {
4950
func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) {
5051
test := With(t)
5152

52-
// Create a namespace and localqueue in that namespace
53+
// Create a namespace
5354
namespace := test.NewTestNamespace()
54-
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")
55+
56+
// Create Kueue resources
57+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
58+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
59+
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
60+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
61+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
5562

5663
// Create MNIST training script
5764
mnist := constructMNISTConfigMap(test, namespace)
@@ -61,7 +68,6 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int
6168

6269
// Create RayCluster; it is no longer labelled with a queue explicitly and presumably relies on the LocalQueue created above with AsDefaultQueue
6370
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
64-
AssignToLocalQueue(rayCluster, localQueue)
6571
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
6672
test.Expect(err).NotTo(HaveOccurred())
6773
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
@@ -78,8 +84,8 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int
7884

7985
rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)
8086

81-
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
82-
rayClient := NewRayClusterClient(rayDashboardURL)
87+
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL)
88+
rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken)
8389

8490
test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
8591
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
@@ -111,9 +117,15 @@ func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) {
111117
func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
112118
test := With(t)
113119

114-
// Create a namespace and localqueue in that namespace
120+
// Create a namespace
115121
namespace := test.NewTestNamespace()
116-
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")
122+
123+
// Create Kueue resources
124+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
125+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
126+
clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus)
127+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
128+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
117129

118130
// Create MNIST training script
119131
mnist := constructMNISTConfigMap(test, namespace)
@@ -167,8 +179,8 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number
167179

168180
rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)
169181

170-
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
171-
rayClient := NewRayClusterClient(rayDashboardURL)
182+
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL)
183+
rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken)
172184

173185
test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
174186
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
@@ -374,7 +386,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
374386
}
375387
}
376388

377-
func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
389+
func getRayDashboardURL(test Test, namespace, rayClusterName string) string {
378390
dashboardName := "ray-dashboard-" + rayClusterName
379391

380392
if IsOpenShift(test) {
@@ -396,10 +408,10 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
396408
return resp.StatusCode, nil
397409
}, TestTimeoutShort).Should(Not(Equal(503)))
398410

399-
return url.URL{
400-
Scheme: "https",
401-
Host: hostname,
402-
}
411+
dashboardUrl, _ := url.Parse("https://" + hostname)
412+
test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String())
413+
414+
return dashboardUrl.String()
403415
}
404416

405417
ingress := GetIngress(test, namespace, dashboardName)
@@ -408,8 +420,41 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {
408420
test.Eventually(Ingress(test, ingress.Namespace, ingress.Name), TestTimeoutShort).
409421
Should(WithTransform(LoadBalancerIngresses, HaveLen(1)))
410422

411-
return url.URL{
412-
Scheme: "http",
413-
Host: ingress.Spec.Rules[0].Host,
423+
hostname := ingress.Spec.Rules[0].Host
424+
dashboardUrl, _ := url.Parse("http://" + hostname)
425+
test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String())
426+
427+
return dashboardUrl.String()
428+
}
429+
430+
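// createClusterQueue creates a ClusterQueue that matches all namespaces and grants the given ResourceFlavor quotas of 8 CPUs, 12Gi of memory and numberOfGpus nvidia.com/gpu.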
431+
func createClusterQueue(test Test, resourceFlavor *v1beta1.ResourceFlavor, numberOfGpus int) *v1beta1.ClusterQueue {
432+
cqSpec := v1beta1.ClusterQueueSpec{
433+
NamespaceSelector: &metav1.LabelSelector{},
434+
ResourceGroups: []v1beta1.ResourceGroup{
435+
{
436+
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
437+
Flavors: []v1beta1.FlavorQuotas{
438+
{
439+
Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
440+
Resources: []v1beta1.ResourceQuota{
441+
{
442+
Name: corev1.ResourceCPU,
443+
NominalQuota: resource.MustParse("8"),
444+
},
445+
{
446+
Name: corev1.ResourceMemory,
447+
NominalQuota: resource.MustParse("12Gi"),
448+
},
449+
{
450+
Name: corev1.ResourceName("nvidia.com/gpu"),
451+
NominalQuota: resource.MustParse(fmt.Sprint(numberOfGpus)),
452+
},
453+
},
454+
},
455+
},
456+
},
457+
},
414458
}
459+
return CreateKueueClusterQueue(test, cqSpec)
415460
}

Diff for: test/e2e/setup.sh

-27
Original file line numberDiff line numberDiff line change
@@ -69,30 +69,3 @@ done
6969
echo ""
7070

7171
sleep 5
72-
echo Creating Kueue ResourceFlavor and ClusterQueue
73-
cat <<EOF | kubectl apply -f -
74-
apiVersion: kueue.x-k8s.io/v1beta1
75-
kind: ResourceFlavor
76-
metadata:
77-
name: "default-flavor"
78-
EOF
79-
80-
cat <<EOF | kubectl apply -f -
81-
apiVersion: kueue.x-k8s.io/v1beta1
82-
kind: ClusterQueue
83-
metadata:
84-
name: "e2e-cluster-queue"
85-
spec:
86-
namespaceSelector: {} # match all.
87-
resourceGroups:
88-
- coveredResources: ["cpu","memory", "nvidia.com/gpu"]
89-
flavors:
90-
- name: "default-flavor"
91-
resources:
92-
- name: "cpu"
93-
nominalQuota: 4
94-
- name: "memory"
95-
nominalQuota: "20G"
96-
- name: "nvidia.com/gpu"
97-
nominalQuota: "1"
98-
EOF

Diff for: test/e2e/support.go

-12
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@ import (
2121

2222
"github.com/onsi/gomega"
2323
"github.com/project-codeflare/codeflare-common/support"
24-
25-
"sigs.k8s.io/controller-runtime/pkg/client"
26-
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2724
)
2825

2926
//go:embed *.py *.txt *.sh
@@ -35,12 +32,3 @@ func ReadFile(t support.Test, fileName string) []byte {
3532
t.Expect(err).NotTo(gomega.HaveOccurred())
3633
return file
3734
}
38-
39-
func AssignToLocalQueue(object client.Object, localqueue *kueuev1beta1.LocalQueue) {
40-
labels := object.GetLabels()
41-
if labels == nil {
42-
labels = make(map[string]string)
43-
}
44-
labels["kueue.x-k8s.io/queue-name"] = localqueue.Name
45-
object.SetLabels(labels)
46-
}

0 commit comments

Comments
 (0)