@@ -28,6 +28,7 @@ import (
28
28
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
29
"k8s.io/apimachinery/pkg/runtime"
30
30
"k8s.io/apimachinery/pkg/types"
31
+ "k8s.io/utils/ptr"
31
32
"sigs.k8s.io/yaml"
32
33
)
33
34
@@ -179,3 +180,196 @@ func deployment(replicaCount int, milliCPU int64) workloadv1beta2.AppWrapperComp
179
180
Template : runtime.RawExtension {Raw : jsonBytes },
180
181
}
181
182
}
183
+
184
// rayClusterYAML is a fmt template for a ray.io/v1 RayCluster manifest with one
// head group and a single worker group. It contains five %v verbs that must be
// supplied in this order:
//
//  1. metadata.name
//  2. spec.workerGroupSpecs[0].maxReplicas
//  3. spec.workerGroupSpecs[0].minReplicas
//  4. spec.workerGroupSpecs[0].replicas
//  5. the cpu request of the worker group's container
//
// The text is YAML, so indentation inside the raw string is significant.
const rayClusterYAML = `
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: '1.0'
  name: %v
spec:
  autoscalerOptions:
    idleTimeoutSeconds: 60
    imagePullPolicy: Always
    resources:
      limits:
        cpu: 500m
        memory: 512Mi
      requests:
        cpu: 500m
        memory: 512Mi
    upscalingMode: Default
  enableInTreeAutoscaling: false
  headGroupSpec:
    rayStartParams:
      block: 'true'
      dashboard-host: 0.0.0.0
      num-gpus: '0'
    serviceType: ClusterIP
    template:
      spec:
        containers:
        - env:
          - name: MY_POD_IP
            valueFrom:
              fieldRef:
                fieldPath: status.podIP
          - name: RAY_USE_TLS
            value: '0'
          - name: RAY_TLS_SERVER_CERT
            value: /home/ray/workspace/tls/server.crt
          - name: RAY_TLS_SERVER_KEY
            value: /home/ray/workspace/tls/server.key
          - name: RAY_TLS_CA_CERT
            value: /home/ray/workspace/tls/ca.crt
          image: quay.io/project-codeflare/ray:latest-py39-cu118
          imagePullPolicy: Always
          lifecycle:
            preStop:
              exec:
                command:
                - /bin/sh
                - -c
                - ray stop
          name: ray-head
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          resources:
            limits:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
            requests:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
          volumeMounts:
          - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-cert
            subPath: odh-trusted-ca-bundle.crt
          - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-cert
            subPath: odh-trusted-ca-bundle.crt
          - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
            name: odh-ca-cert
            subPath: odh-ca-bundle.crt
          - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
            name: odh-ca-cert
            subPath: odh-ca-bundle.crt
        imagePullSecrets:
        - name: unit-test-pull-secret
        volumes:
        - configMap:
            items:
            - key: ca-bundle.crt
              path: odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-bundle
            optional: true
          name: odh-trusted-ca-cert
        - configMap:
            items:
            - key: odh-ca-bundle.crt
              path: odh-ca-bundle.crt
            name: odh-trusted-ca-bundle
            optional: true
          name: odh-ca-cert
  rayVersion: 2.7.0
  workerGroupSpecs:
  - groupName: small-group-unit-test-cluster-ray
    maxReplicas: %v
    minReplicas: %v
    rayStartParams:
      block: 'true'
      num-gpus: '7'
    replicas: %v
    template:
      metadata:
        annotations:
          key: value
        labels:
          key: value
      spec:
        containers:
        - env:
          - name: MY_POD_IP
            valueFrom:
              fieldRef:
                fieldPath: status.podIP
          - name: RAY_USE_TLS
            value: '0'
          - name: RAY_TLS_SERVER_CERT
            value: /home/ray/workspace/tls/server.crt
          - name: RAY_TLS_SERVER_KEY
            value: /home/ray/workspace/tls/server.key
          - name: RAY_TLS_CA_CERT
            value: /home/ray/workspace/tls/ca.crt
          image: quay.io/project-codeflare/ray:latest-py39-cu118
          lifecycle:
            preStop:
              exec:
                command:
                - /bin/sh
                - -c
                - ray stop
          name: machine-learning
          resources:
            requests:
              cpu: %v
              memory: 5G
              nvidia.com/gpu: 7
          volumeMounts:
          - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-cert
            subPath: odh-trusted-ca-bundle.crt
          - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-cert
            subPath: odh-trusted-ca-bundle.crt
          - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
            name: odh-ca-cert
            subPath: odh-ca-bundle.crt
          - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
            name: odh-ca-cert
            subPath: odh-ca-bundle.crt
        imagePullSecrets:
        - name: unit-test-pull-secret
        volumes:
        - configMap:
            items:
            - key: ca-bundle.crt
              path: odh-trusted-ca-bundle.crt
            name: odh-trusted-ca-bundle
            optional: true
          name: odh-trusted-ca-cert
        - configMap:
            items:
            - key: odh-ca-bundle.crt
              path: odh-ca-bundle.crt
            name: odh-trusted-ca-bundle
            optional: true
          name: odh-ca-cert
`
357
+
358
+ func rayCluster (workerCount int , milliCPU int64 ) workloadv1beta2.AppWrapperComponent {
359
+ workerCPU := resource .NewMilliQuantity (milliCPU , resource .DecimalSI )
360
+ yamlString := fmt .Sprintf (rayClusterYAML ,
361
+ randName ("raycluster" ),
362
+ workerCount , workerCount , workerCount ,
363
+ workerCPU )
364
+
365
+ jsonBytes , err := yaml .YAMLToJSON ([]byte (yamlString ))
366
+ Expect (err ).NotTo (HaveOccurred ())
367
+ replicas := int32 (workerCount )
368
+ return workloadv1beta2.AppWrapperComponent {
369
+ PodSets : []workloadv1beta2.AppWrapperPodSet {
370
+ {Replicas : ptr .To (int32 (1 )), Path : "template.spec.headGroupSpec.template" },
371
+ {Replicas : & replicas , Path : "template.spec.workerGroupSpecs[0].template" },
372
+ },
373
+ Template : runtime.RawExtension {Raw : jsonBytes },
374
+ }
375
+ }
0 commit comments