@@ -15,6 +15,7 @@ import (
15
15
"time"
16
16
17
17
"github.com/bombsimon/logrusr/v2"
18
+ workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
18
19
"github.com/spf13/cobra"
19
20
corev1 "k8s.io/api/core/v1"
20
21
"k8s.io/apimachinery/pkg/api/errors"
@@ -31,7 +32,9 @@ import (
31
32
"sigs.k8s.io/controller-runtime/pkg/cache"
32
33
"sigs.k8s.io/controller-runtime/pkg/client"
33
34
"sigs.k8s.io/controller-runtime/pkg/controller"
35
+ "sigs.k8s.io/controller-runtime/pkg/event"
34
36
"sigs.k8s.io/controller-runtime/pkg/healthz"
37
+ "sigs.k8s.io/controller-runtime/pkg/manager"
35
38
"sigs.k8s.io/controller-runtime/pkg/metrics"
36
39
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
37
40
"sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -78,16 +81,16 @@ var runCmd = &cobra.Command{
78
81
LeaderElectionID : "node-labeler.gitpod.io" ,
79
82
})
80
83
if err != nil {
81
- log .WithError (err ).Fatal ("unable to start node-labeber " )
84
+ log .WithError (err ).Fatal ("unable to start node-labeler " )
82
85
}
83
86
84
- client , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
87
+ kClient , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
85
88
if err != nil {
86
89
log .WithError (err ).Fatal ("unable to create client" )
87
90
}
88
91
89
92
r := & PodReconciler {
90
- client ,
93
+ kClient ,
91
94
}
92
95
93
96
componentPredicate , err := predicate .LabelSelectorPredicate (metav1.LabelSelector {
@@ -110,6 +113,36 @@ var runCmd = &cobra.Command{
110
113
log .WithError (err ).Fatal ("unable to bind controller watch event handler" )
111
114
}
112
115
116
+ if err := mgr .GetFieldIndexer ().IndexField (context .Background (), & workspacev1.Workspace {}, "status.runtime.nodeName" , func (o client.Object ) []string {
117
+ ws := o .(* workspacev1.Workspace )
118
+ if ws .Status .Runtime == nil {
119
+ return nil
120
+ }
121
+ return []string {ws .Status .Runtime .NodeName }
122
+ }); err != nil {
123
+ log .WithError (err ).Fatal ("unable to create workspace indexer" )
124
+ return
125
+ }
126
+
127
+ nsac , err := NewNodeScaledownAnnotationController (mgr .GetClient ())
128
+ if err != nil {
129
+ log .WithError (err ).Fatal ("unable to create node scaledown annotation controller" )
130
+ }
131
+ err = nsac .SetupWithManager (mgr )
132
+ if err != nil {
133
+ log .WithError (err ).Fatal ("unable to bind node scaledown annotation controller" )
134
+ }
135
+
136
+ err = mgr .Add (manager .RunnableFunc (func (ctx context.Context ) error {
137
+ <- ctx .Done ()
138
+ log .Info ("Received shutdown signal - stopping NodeScaledownAnnotationController" )
139
+ nsac .Stop ()
140
+ return nil
141
+ }))
142
+ if err != nil {
143
+ log .WithError (err ).Fatal ("couldn't properly clean up node scaledown annotation controller" )
144
+ }
145
+
113
146
metrics .Registry .MustRegister (NodeLabelerCounterVec )
114
147
metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
115
148
@@ -123,10 +156,10 @@ var runCmd = &cobra.Command{
123
156
log .WithError (err ).Fatal ("unable to set up ready check" )
124
157
}
125
158
126
- log .Info ("starting node-labeber " )
159
+ log .Info ("starting node-labeler " )
127
160
err = mgr .Start (ctrl .SetupSignalHandler ())
128
161
if err != nil {
129
- log .WithError (err ).Fatal ("problem running node-labeber " )
162
+ log .WithError (err ).Fatal ("problem running node-labeler " )
130
163
}
131
164
132
165
log .Info ("Received SIGINT - shutting down" )
@@ -135,6 +168,8 @@ var runCmd = &cobra.Command{
135
168
136
169
func init () {
137
170
utilruntime .Must (clientgoscheme .AddToScheme (scheme ))
171
+ utilruntime .Must (workspacev1 .AddToScheme (scheme ))
172
+
138
173
rootCmd .AddCommand (runCmd )
139
174
}
140
175
@@ -249,6 +284,207 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
249
284
return reconcile.Result {}, nil
250
285
}
251
286
287
+ type NodeScaledownAnnotationController struct {
288
+ client.Client
289
+ nodesToReconcile chan string
290
+ stopChan chan struct {}
291
+ }
292
+
293
+ func NewNodeScaledownAnnotationController (client client.Client ) (* NodeScaledownAnnotationController , error ) {
294
+ controller := & NodeScaledownAnnotationController {
295
+ Client : client ,
296
+ nodesToReconcile : make (chan string , 1000 ),
297
+ stopChan : make (chan struct {}),
298
+ }
299
+
300
+ return controller , nil
301
+ }
302
+
303
+ func (c * NodeScaledownAnnotationController ) SetupWithManager (mgr ctrl.Manager ) error {
304
+ go c .reconciliationWorker ()
305
+ go c .periodicReconciliation ()
306
+
307
+ return ctrl .NewControllerManagedBy (mgr ).
308
+ Named ("node-scaledown-annotation-controller" ).
309
+ For (& workspacev1.Workspace {}).
310
+ WithEventFilter (c .workspaceFilter ()).
311
+ Complete (c )
312
+ }
313
+
314
+ // periodicReconciliation periodically reconciles all nodes in the cluster
315
+ func (c * NodeScaledownAnnotationController ) periodicReconciliation () {
316
+ ticker := time .NewTicker (5 * time .Minute )
317
+ defer ticker .Stop ()
318
+
319
+ for {
320
+ select {
321
+ case <- ticker .C :
322
+ log .Info ("starting periodic full reconciliation" )
323
+ ctx := context .Background ()
324
+ if _ , err := c .reconcileAllNodes (ctx ); err != nil {
325
+ log .WithError (err ).Error ("periodic reconciliation failed" )
326
+ }
327
+ case <- c .stopChan :
328
+ log .Info ("stopping periodic full reconciliation" )
329
+ return
330
+ }
331
+ }
332
+ }
333
+
334
+ // reconciliationWorker consumes nodesToReconcile and reconciles each node
335
+ func (c * NodeScaledownAnnotationController ) reconciliationWorker () {
336
+ log .Info ("reconciliation worker started" )
337
+ for {
338
+ select {
339
+ case nodeName := <- c .nodesToReconcile :
340
+ ctx := context .Background ()
341
+ if err := c .reconcileNode (ctx , nodeName ); err != nil {
342
+ log .WithError (err ).WithField ("node" , nodeName ).Error ("failed to reconcile node from queue" )
343
+ }
344
+ case <- c .stopChan :
345
+ log .Info ("reconciliation worker stopping" )
346
+ return
347
+ }
348
+ }
349
+ }
350
+
351
+ func (c * NodeScaledownAnnotationController ) workspaceFilter () predicate.Predicate {
352
+ return predicate.Funcs {
353
+ CreateFunc : func (e event.CreateEvent ) bool {
354
+ ws := e .Object .(* workspacev1.Workspace )
355
+ if ws .Status .Runtime == nil {
356
+ log .WithField ("workspace" , ws .Name ).Info ("workspace not ready yet" )
357
+ return false
358
+ }
359
+
360
+ return ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != ""
361
+ },
362
+ UpdateFunc : func (e event.UpdateEvent ) bool {
363
+ wsOld := e .ObjectOld .(* workspacev1.Workspace )
364
+ ws := e .ObjectNew .(* workspacev1.Workspace )
365
+ // if we haven't seen runtime info before and now it's there, let's reconcile.
366
+ // similarly, if the node name changed, we need to reconcile the old node as well.
367
+ if (wsOld .Status .Runtime == nil && ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" ) || // we just got runtime info
368
+ (wsOld .Status .Runtime != nil && ws .Status .Runtime != nil && wsOld .Status .Runtime .NodeName != ws .Status .Runtime .NodeName ) { // node name changed
369
+ if wsOld .Status .Runtime != nil && wsOld .Status .Runtime .NodeName != "" {
370
+ c .queueNodeForReconciliation (wsOld .Status .Runtime .NodeName )
371
+ }
372
+ return true
373
+ }
374
+
375
+ return false
376
+ },
377
+ DeleteFunc : func (e event.DeleteEvent ) bool {
378
+ ws := e .Object .(* workspacev1.Workspace )
379
+ if ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" {
380
+ c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
381
+ return true
382
+ }
383
+ return false
384
+ },
385
+ }
386
+ }
387
+
388
+ func (c * NodeScaledownAnnotationController ) queueNodeForReconciliation (nodeName string ) {
389
+ select {
390
+ case c .nodesToReconcile <- nodeName :
391
+ log .WithField ("node" , nodeName ).Info ("queued node for reconciliation" )
392
+ default :
393
+ log .WithField ("node" , nodeName ).Warn ("reconciliation queue full" )
394
+ }
395
+ }
396
+
397
+ func (c * NodeScaledownAnnotationController ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
398
+ log .WithField ("request" , req .NamespacedName .String ()).Info ("WorkspaceCountController reconciling" )
399
+
400
+ var ws workspacev1.Workspace
401
+ if err := c .Get (ctx , req .NamespacedName , & ws ); err != nil {
402
+ if ! errors .IsNotFound (err ) {
403
+ log .WithError (err ).WithField ("workspace" , req .NamespacedName ).Error ("unable to fetch Workspace" )
404
+ return ctrl.Result {}, err
405
+ }
406
+ return ctrl.Result {}, nil
407
+ }
408
+
409
+ if ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" {
410
+ c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
411
+ }
412
+
413
+ log .WithField ("runtime" , ws .Status .Runtime ).Warn ("reconciling object with no Runtime/NodeName, which wasn't filtered out by workspaceFilter" )
414
+ return ctrl.Result {}, nil
415
+ }
416
+
417
+ // Cleanup method to be called when shutting down the controller
418
+ func (wc * NodeScaledownAnnotationController ) Stop () {
419
+ close (wc .stopChan )
420
+ }
421
+
422
+ func (c * NodeScaledownAnnotationController ) reconcileAllNodes (ctx context.Context ) (ctrl.Result , error ) {
423
+ var nodes corev1.NodeList
424
+ if err := c .List (ctx , & nodes ); err != nil {
425
+ log .WithError (err ).Error ("failed to list nodes" )
426
+ return ctrl.Result {}, err
427
+ }
428
+
429
+ for _ , node := range nodes .Items {
430
+ c .queueNodeForReconciliation (node .Name )
431
+ }
432
+
433
+ return ctrl.Result {}, nil
434
+ }
435
+
436
+ func (c * NodeScaledownAnnotationController ) reconcileNode (ctx context.Context , nodeName string ) error {
437
+ var workspaceList workspacev1.WorkspaceList
438
+ if err := c .List (ctx , & workspaceList , client.MatchingFields {
439
+ "status.runtime.nodeName" : nodeName ,
440
+ }); err != nil {
441
+ return fmt .Errorf ("failed to list workspaces: %w" , err )
442
+ }
443
+
444
+ log .WithField ("node" , nodeName ).WithField ("count" , len (workspaceList .Items )).Info ("acting on workspaces" )
445
+ count := len (workspaceList .Items )
446
+
447
+ return c .updateNodeAnnotation (ctx , nodeName , count )
448
+ }
449
+
450
+ func (c * NodeScaledownAnnotationController ) updateNodeAnnotation (ctx context.Context , nodeName string , count int ) error {
451
+ return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
452
+ ctx , cancel := context .WithTimeout (ctx , 5 * time .Second )
453
+ defer cancel ()
454
+
455
+ var node corev1.Node
456
+ err := c .Get (ctx , types.NamespacedName {Name : nodeName }, & node )
457
+ if err != nil {
458
+ return fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
459
+ }
460
+
461
+ shouldDisableScaleDown := count > 0
462
+ currentlyDisabled := false
463
+ if val , exists := node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ]; exists {
464
+ currentlyDisabled = val == "true"
465
+ }
466
+
467
+ // Only update if the state needs to change
468
+ if shouldDisableScaleDown != currentlyDisabled {
469
+ if node .Annotations == nil {
470
+ node .Annotations = make (map [string ]string )
471
+ }
472
+
473
+ if shouldDisableScaleDown {
474
+ node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ] = "true"
475
+ log .WithField ("nodeName" , nodeName ).Info ("disabling scale-down for node" )
476
+ } else {
477
+ delete (node .Annotations , "cluster-autoscaler.kubernetes.io/scale-down-disabled" )
478
+ log .WithField ("nodeName" , nodeName ).Info ("enabling scale-down for node" )
479
+ }
480
+
481
+ return c .Update (ctx , & node )
482
+ }
483
+
484
+ return nil
485
+ })
486
+ }
487
+
252
488
func updateLabel (label string , add bool , nodeName string , client client.Client ) error {
253
489
return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
254
490
ctx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
0 commit comments