  "fmt"
  "os"
  "reflect"
+ "strconv"
  "strings"
  "time"
@@ -364,19 +365,32 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
    return nil, nil, err
  }

- if r.shouldPrepareNewRayCluster(rayServiceInstance, activeRayCluster) {
+ clusterAction := r.shouldPrepareNewRayCluster(rayServiceInstance, activeRayCluster)
+ if clusterAction == RolloutNew {
    // For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
    // Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
    enableZeroDowntime := true
    if s := os.Getenv(ENABLE_ZERO_DOWNTIME); strings.ToLower(s) == "false" {
      enableZeroDowntime = false
    }
    if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
-     r.markRestart(rayServiceInstance)
+     // Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
+     // actually create the pending RayCluster instance.
+     r.markRestartAndAddPendingClusterName(rayServiceInstance)
    } else {
      r.Log.Info("Zero-downtime upgrade is disabled (ENABLE_ZERO_DOWNTIME: false). Skip preparing a new RayCluster.")
    }
    return activeRayCluster, nil, nil
+ } else if clusterAction == Update {
+   // Update the active cluster.
+   r.Log.Info("Updating the active RayCluster instance.")
+   if activeRayCluster, err = r.constructRayClusterForRayService(rayServiceInstance, activeRayCluster.Name); err != nil {
+     return nil, nil, err
+   }
+   if err := r.updateRayClusterInstance(ctx, activeRayCluster); err != nil {
+     return nil, nil, err
+   }
+   return activeRayCluster, nil, nil
  }

  if pendingRayCluster, err = r.createRayClusterInstanceIfNeeded(ctx, rayServiceInstance, pendingRayCluster); err != nil {
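The gate above defaults to zero-downtime upgrades: only an explicit, case-insensitive "false" in ENABLE_ZERO_DOWNTIME disables them, so an unset variable or values like "0" leave the feature on. A standalone sketch of that semantics (illustrative only, not part of the patch):

package main

import (
	"fmt"
	"os"
	"strings"
)

// zeroDowntimeEnabled mirrors the check in reconcileRayCluster: only the
// literal string "false" (any casing) disables zero-downtime upgrades.
func zeroDowntimeEnabled() bool {
	return strings.ToLower(os.Getenv("ENABLE_ZERO_DOWNTIME")) != "false"
}

func main() {
	for _, v := range []string{"", "true", "False", "0"} {
		os.Setenv("ENABLE_ZERO_DOWNTIME", v)
		fmt.Printf("ENABLE_ZERO_DOWNTIME=%q -> enabled=%v\n", v, zeroDowntimeEnabled())
	}
}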
@@ -468,67 +482,158 @@ func (r *RayServiceReconciler) cleanUpServeConfigCache(rayServiceInstance *rayv1
  }
}

+ type ClusterAction int
+
+ const (
+   DoNothing  ClusterAction = iota // value 0
+   Update                          // value 1
+   RolloutNew                      // value 2
+ )
+
// shouldPrepareNewRayCluster checks if we need to generate a new pending cluster.
- func (r *RayServiceReconciler) shouldPrepareNewRayCluster(rayServiceInstance *rayv1.RayService, activeRayCluster *rayv1.RayCluster) bool {
+ func (r *RayServiceReconciler) shouldPrepareNewRayCluster(rayServiceInstance *rayv1.RayService, activeRayCluster *rayv1.RayCluster) ClusterAction {
  // Prepare new RayCluster if:
  // 1. No active cluster and no pending cluster
  // 2. No pending cluster, and the active RayCluster has changed.
  if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == "" {
    if activeRayCluster == nil {
      r.Log.Info("No active Ray cluster. RayService operator should prepare a new Ray cluster.")
-     return true
+     return RolloutNew
    }
-   activeClusterHash := activeRayCluster.ObjectMeta.Annotations[utils.RayServiceClusterHashKey]
-   goalClusterHash, err := generateRayClusterJsonHash(rayServiceInstance.Spec.RayClusterSpec)
+
+   // Case 1: If everything is identical except for the Replicas and WorkersToDelete of
+   // each WorkerGroup, then do nothing.
+   activeClusterHash := activeRayCluster.ObjectMeta.Annotations[utils.HashWithoutReplicasAndWorkersToDeleteKey]
+   goalClusterHash, err := generateHashWithoutReplicasAndWorkersToDelete(rayServiceInstance.Spec.RayClusterSpec)
+   errContextFailedToSerialize := "Failed to serialize new RayCluster config. " +
+     "Manual config updates will NOT be tracked accurately. " +
+     "Please manually tear down the cluster and apply a new config."
    if err != nil {
-     errContext := "Failed to serialize new RayCluster config. " +
-       "Manual config updates will NOT be tracked accurately. " +
-       "Please manually tear down the cluster and apply a new config."
-     r.Log.Error(err, errContext)
-     return true
+     r.Log.Error(err, errContextFailedToSerialize)
+     return DoNothing
    }

-   if activeClusterHash != goalClusterHash {
-     r.Log.Info("Active RayCluster config doesn't match goal config. " +
-       "RayService operator should prepare a new Ray cluster.\n" +
-       "* Active RayCluster config hash: " + activeClusterHash + "\n" +
-       "* Goal RayCluster config hash: " + goalClusterHash)
-   } else {
-     r.Log.Info("Active Ray cluster config matches goal config.")
+   if activeClusterHash == goalClusterHash {
+     r.Log.Info("Active Ray cluster config matches goal config. No need to update RayCluster.")
+     return DoNothing
    }

-   return activeClusterHash != goalClusterHash
+   // Case 2: Otherwise, if everything is identical except for the Replicas and WorkersToDelete of
+   // the existing workergroups, and one or more new workergroups are added at the end, then update the cluster.
+   activeClusterNumWorkerGroups, err := strconv.Atoi(activeRayCluster.ObjectMeta.Annotations[utils.NumWorkerGroupsKey])
+   if err != nil {
+     r.Log.Error(err, errContextFailedToSerialize)
+     return DoNothing
+   }
+   goalNumWorkerGroups := len(rayServiceInstance.Spec.RayClusterSpec.WorkerGroupSpecs)
+   r.Log.Info("number of worker groups", "activeClusterNumWorkerGroups", activeClusterNumWorkerGroups, "goalNumWorkerGroups", goalNumWorkerGroups)
+   if goalNumWorkerGroups > activeClusterNumWorkerGroups {
+
+     // Remove the new workergroup(s) from the end before calculating the hash.
+     goalClusterSpec := rayServiceInstance.Spec.RayClusterSpec.DeepCopy()
+     goalClusterSpec.WorkerGroupSpecs = goalClusterSpec.WorkerGroupSpecs[:activeClusterNumWorkerGroups]
+
+     // Generate the hash of the old worker group specs.
+     goalClusterHash, err = generateHashWithoutReplicasAndWorkersToDelete(*goalClusterSpec)
+     if err != nil {
+       r.Log.Error(err, errContextFailedToSerialize)
+       return DoNothing
+     }
+
+     if activeClusterHash == goalClusterHash {
+       r.Log.Info("Active RayCluster config matches goal config, except that one or more entries were appended to WorkerGroupSpecs. Updating RayCluster.")
+       return Update
+     }
+   }
+
+   // Case 3: Otherwise, rollout a new cluster.
+   r.Log.Info("Active RayCluster config doesn't match goal config. " +
+     "RayService operator should prepare a new Ray cluster.\n" +
+     "* Active RayCluster config hash: " + activeClusterHash + "\n" +
+     "* Goal RayCluster config hash: " + goalClusterHash)
+   return RolloutNew
  }

- return false
+ return DoNothing
}

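The function above decides among the three actions using two pieces of bookkeeping stored as annotations on the active RayCluster: the spec hash computed with Replicas and WorkersToDelete muted, and the number of worker groups. Distilled into a pure function (a sketch; decideAction and its parameter names are illustrative, not part of the patch):

// decideAction condenses the three cases: equal hashes -> DoNothing;
// groups appended and the truncated goal spec hashing equal -> Update;
// anything else -> RolloutNew.
func decideAction(activeHash, goalHash, truncatedGoalHash string, activeGroups, goalGroups int) ClusterAction {
	if activeHash == goalHash {
		return DoNothing // Case 1: only Replicas/WorkersToDelete changed.
	}
	if goalGroups > activeGroups && activeHash == truncatedGoalHash {
		return Update // Case 2: new worker groups appended at the end.
	}
	return RolloutNew // Case 3: any other spec change.
}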
// createRayClusterInstanceIfNeeded checks if we need to create a new RayCluster instance. If so, create one.
func (r *RayServiceReconciler) createRayClusterInstanceIfNeeded(ctx context.Context, rayServiceInstance *rayv1.RayService, pendingRayCluster *rayv1.RayCluster) (*rayv1.RayCluster, error) {
+ // Early return if no pending RayCluster needs to be created.
  if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == "" {
-   // No exist pending RayCluster and no need to create one.
    return nil, nil
  }

- // Create a new RayCluster if:
- // 1. No RayCluster pending.
- // 2. Config update for the pending cluster.
- equal, err := compareRayClusterJsonHash(pendingRayCluster.Spec, rayServiceInstance.Spec.RayClusterSpec)
- if err != nil {
-   r.Log.Error(err, "Fail to generate hash for RayClusterSpec")
-   return nil, err
+ var clusterAction ClusterAction
+ var err error
+
+ if pendingRayCluster == nil {
+   clusterAction = RolloutNew
+ } else {
+   clusterAction, err = getClusterAction(pendingRayCluster.Spec, rayServiceInstance.Spec.RayClusterSpec)
+   if err != nil {
+     r.Log.Error(err, "Fail to generate hash for RayClusterSpec")
+     return nil, err
+   }
  }

- if pendingRayCluster == nil || !equal {
+ switch clusterAction {
+ case RolloutNew:
+   r.Log.Info("Creating a new pending RayCluster instance.")
    pendingRayCluster, err = r.createRayClusterInstance(ctx, rayServiceInstance, rayServiceInstance.Status.PendingServiceStatus.RayClusterName)
-   if err != nil {
+ case Update:
+   r.Log.Info("Updating the pending RayCluster instance.")
+   if pendingRayCluster, err = r.constructRayClusterForRayService(rayServiceInstance, pendingRayCluster.Name); err != nil {
      return nil, err
    }
+   err = r.updateRayClusterInstance(ctx, pendingRayCluster)
+ }
+
+ if err != nil {
+   return nil, err
  }

  return pendingRayCluster, nil
}

+ // updateRayClusterInstance updates the RayCluster instance.
+ func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, rayClusterInstance *rayv1.RayCluster) error {
+   r.Log.V(1).Info("updateRayClusterInstance", "Name", rayClusterInstance.Name, "Namespace", rayClusterInstance.Namespace)
+   // Printing the whole RayCluster is too noisy. Only print the spec.
+   r.Log.V(1).Info("updateRayClusterInstance", "rayClusterInstance.Spec", rayClusterInstance.Spec)
+
+   // Fetch the current state of the RayCluster
+   currentRayCluster, err := r.getRayClusterByNamespacedName(ctx, client.ObjectKey{
+     Namespace: rayClusterInstance.Namespace,
+     Name:      rayClusterInstance.Name,
+   })
+   if err != nil {
+     r.Log.Error(err, "Failed to get the current state of RayCluster", "Namespace", rayClusterInstance.Namespace, "Name", rayClusterInstance.Name)
+     return err
+   }
+
+   if currentRayCluster == nil {
+     r.Log.Info("RayCluster not found, possibly deleted", "Namespace", rayClusterInstance.Namespace, "Name", rayClusterInstance.Name)
+     return nil
+   }
+
+   // Update the fetched RayCluster with new changes
+   currentRayCluster.Spec = rayClusterInstance.Spec
+
+   // Update the labels and annotations
+   currentRayCluster.Labels = rayClusterInstance.Labels
+   currentRayCluster.Annotations = rayClusterInstance.Annotations
+
+   // Update the RayCluster
+   if err = r.Update(ctx, currentRayCluster); err != nil {
+     r.Log.Error(err, "Fail to update RayCluster "+currentRayCluster.Name)
+     return err
+   }
+
+   r.Log.V(1).Info("updated RayCluster", "rayClusterInstance", currentRayCluster)
+   return nil
+ }
+

// createRayClusterInstance deletes the old RayCluster instance if exists. Only when no existing RayCluster, create a new RayCluster instance.
// One important part is that if this method deletes the old RayCluster, it will return instantly. It depends on the controller to call it again to generate the new RayCluster instance.
func (r *RayServiceReconciler) createRayClusterInstance(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstanceName string) (*rayv1.RayCluster, error) {
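updateRayClusterInstance is a plain read-modify-write: fetch the live object, overwrite Spec, Labels, and Annotations, and call Update. On a resourceVersion conflict the error propagates and the change lands on a later reconcile. For comparison, a common alternative retries the conflict in place with client-go's retry helper (a sketch under that assumption; updateRayClusterWithRetry is a hypothetical helper, not part of the patch):

// Sketch: assumes the same RayServiceReconciler receiver plus imports
// "k8s.io/client-go/util/retry" and "sigs.k8s.io/controller-runtime/pkg/client".
func (r *RayServiceReconciler) updateRayClusterWithRetry(ctx context.Context, desired *rayv1.RayCluster) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Re-read the live object on every attempt so the resourceVersion is fresh.
		current := &rayv1.RayCluster{}
		if err := r.Get(ctx, client.ObjectKeyFromObject(desired), current); err != nil {
			return err
		}
		current.Spec = desired.Spec
		current.Labels = desired.Labels
		current.Annotations = desired.Annotations
		return r.Update(ctx, current)
	})
}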
@@ -591,14 +696,15 @@ func (r *RayServiceReconciler) constructRayClusterForRayService(rayService *rayv
    rayClusterAnnotations[k] = v
  }
  rayClusterAnnotations[utils.EnableServeServiceKey] = utils.EnableServeServiceTrue
- rayClusterAnnotations[utils.RayServiceClusterHashKey], err = generateRayClusterJsonHash(rayService.Spec.RayClusterSpec)
+ errContext := "Failed to serialize RayCluster config. " +
+   "Manual config updates will NOT be tracked accurately. " +
+   "Please tear down the cluster and apply a new config."
+ rayClusterAnnotations[utils.HashWithoutReplicasAndWorkersToDeleteKey], err = generateHashWithoutReplicasAndWorkersToDelete(rayService.Spec.RayClusterSpec)
  if err != nil {
-   errContext := "Failed to serialize RayCluster config. " +
-     "Manual config updates will NOT be tracked accurately. " +
-     "Please tear down the cluster and apply a new config."
    r.Log.Error(err, errContext)
    return nil, err
  }
+ rayClusterAnnotations[utils.NumWorkerGroupsKey] = strconv.Itoa(len(rayService.Spec.RayClusterSpec.WorkerGroupSpecs))

  rayCluster := &rayv1.RayCluster{
    ObjectMeta: metav1.ObjectMeta{
@@ -862,7 +968,7 @@ func updateDashboardStatus(rayServiceClusterStatus *rayv1.RayServiceStatus, isHe
  }
}

- func (r *RayServiceReconciler) markRestart(rayServiceInstance *rayv1.RayService) {
+ func (r *RayServiceReconciler) markRestartAndAddPendingClusterName(rayServiceInstance *rayv1.RayService) {
  // Generate RayCluster name for pending cluster.
  r.Log.V(1).Info("Current cluster is unhealthy, prepare to restart.", "Status", rayServiceInstance.Status)
  rayServiceInstance.Status.ServiceStatus = rayv1.Restarting
@@ -1139,8 +1245,41 @@ func (r *RayServiceReconciler) labelHealthyServePods(ctx context.Context, rayClu
  return nil
}

- func generateRayClusterJsonHash(rayClusterSpec rayv1.RayClusterSpec) (string, error) {
-   // Mute all fields that will not trigger new RayCluster preparation. For example,
+ func getClusterAction(oldSpec rayv1.RayClusterSpec, newSpec rayv1.RayClusterSpec) (ClusterAction, error) {
+   // Return the appropriate action based on the difference in the old and new RayCluster specs.
+
+   // Case 1: If everything is identical except for the Replicas and WorkersToDelete of
+   // each WorkerGroup, then do nothing.
+   sameHash, err := compareRayClusterJsonHash(oldSpec, newSpec, generateHashWithoutReplicasAndWorkersToDelete)
+   if err != nil {
+     return DoNothing, err
+   }
+   if sameHash {
+     return DoNothing, nil
+   }
+
+   // Case 2: Otherwise, if everything is identical except for the Replicas and WorkersToDelete of
+   // the existing workergroups, and one or more new workergroups are added at the end, then update the cluster.
+   newSpecWithoutWorkerGroups := newSpec.DeepCopy()
+   if len(newSpec.WorkerGroupSpecs) > len(oldSpec.WorkerGroupSpecs) {
+     // Remove the new worker groups from the new spec.
+     newSpecWithoutWorkerGroups.WorkerGroupSpecs = newSpecWithoutWorkerGroups.WorkerGroupSpecs[:len(oldSpec.WorkerGroupSpecs)]
+
+     sameHash, err = compareRayClusterJsonHash(oldSpec, *newSpecWithoutWorkerGroups, generateHashWithoutReplicasAndWorkersToDelete)
+     if err != nil {
+       return DoNothing, err
+     }
+     if sameHash {
+       return Update, nil
+     }
+   }
+
+   // Case 3: Otherwise, rollout a new cluster.
+   return RolloutNew, nil
+ }
+
+ func generateHashWithoutReplicasAndWorkersToDelete(rayClusterSpec rayv1.RayClusterSpec) (string, error) {
+   // Mute certain fields that will not trigger new RayCluster preparation. For example,
  // Autoscaler will update `Replicas` and `WorkersToDelete` when scaling up/down.
  updatedRayClusterSpec := rayClusterSpec.DeepCopy()
  for i := 0; i < len(updatedRayClusterSpec.WorkerGroupSpecs); i++ {
@@ -1152,13 +1291,13 @@ func generateRayClusterJsonHash(rayClusterSpec rayv1.RayClusterSpec) (string, er
  return utils.GenerateJsonHash(updatedRayClusterSpec)
}

- func compareRayClusterJsonHash(spec1 rayv1.RayClusterSpec, spec2 rayv1.RayClusterSpec) (bool, error) {
-   hash1, err1 := generateRayClusterJsonHash(spec1)
+ func compareRayClusterJsonHash(spec1 rayv1.RayClusterSpec, spec2 rayv1.RayClusterSpec, hashFunc func(rayv1.RayClusterSpec) (string, error)) (bool, error) {
+   hash1, err1 := hashFunc(spec1)
  if err1 != nil {
    return false, err1
  }

- hash2, err2 := generateRayClusterJsonHash(spec2)
+ hash2, err2 := hashFunc(spec2)
  if err2 != nil {
    return false, err2
  }
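End to end, these helpers mean autoscaler writes to Replicas and WorkersToDelete never trigger action, while appending worker groups triggers an in-place Update and any other spec change triggers RolloutNew. A hypothetical sketch of that behavior, written as if it lived in the same package (int32Ptr and sketchGetClusterAction are illustrative names, not part of the patch):

func int32Ptr(i int32) *int32 { return &i }

func sketchGetClusterAction() {
	oldSpec := rayv1.RayClusterSpec{
		WorkerGroupSpecs: []rayv1.WorkerGroupSpec{{GroupName: "group-a", Replicas: int32Ptr(1)}},
	}

	// Only Replicas differs: the muted hashes match, so nothing happens.
	scaled := *oldSpec.DeepCopy()
	scaled.WorkerGroupSpecs[0].Replicas = int32Ptr(5) // autoscaler-style change
	action, _ := getClusterAction(oldSpec, scaled)
	fmt.Println(action == DoNothing) // true

	// A worker group appended at the end: update the cluster in place.
	grown := *oldSpec.DeepCopy()
	grown.WorkerGroupSpecs = append(grown.WorkerGroupSpecs, rayv1.WorkerGroupSpec{GroupName: "group-b"})
	action, _ = getClusterAction(oldSpec, grown)
	fmt.Println(action == Update) // true
}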