Skip to content

Commit 4d3cca4

Browse files
[node-labeler] Introduce workspace count controller (#20509)
* [ws-daemon] Introduce pod count controller * remove unnecessary variable * move to `node-labeler` * act on ws crds * Fix runtime not filled in yet * Make tests pass! * Improve test file structure * Fix `node-labeler:lib` build * Remove unnecessary changes * Address some review comments (thanks, kyle!) * Try caching? * Queue deleted nodes and periodically reconcile it all * WCC cleanup function * Fix tests * Update name * Add metrics for controller * Add synchronization for node reconciliation to prevent race conditions * Address review comments * Remove superfluous log * Remove unneeded metrics and add cool log line * big yellow warning for a thing that should not happen
1 parent 76781bf commit 4d3cca4

File tree

8 files changed

+590
-13
lines changed

8 files changed

+590
-13
lines changed

Diff for: components/node-labeler/BUILD.yaml

+10
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,16 @@ packages:
55
- "**/*.go"
66
- "go.mod"
77
- "go.sum"
8+
- "crd/*.yaml"
89
deps:
910
- components/common-go:lib
11+
- components/ws-manager-api/go:lib
12+
- components/ws-manager-mk2:crd
1013
env:
1114
- CGO_ENABLED=0
1215
- GOOS=linux
16+
prep:
17+
- ["mv", "_deps/components-ws-manager-mk2--crd/workspace.gitpod.io_workspaces.yaml", "crd/workspace.gitpod.io_workspaces.yaml"]
1318
config:
1419
packaging: app
1520
buildCommand: ["go", "build", "-trimpath", "-ldflags", "-buildid= -w -s -X 'github.com/gitpod-io/gitpod/node-labeler/cmd.Version=commit-${__git_commit}'"]
@@ -34,5 +39,10 @@ packages:
3439
- "**/*.go"
3540
- "go.mod"
3641
- "go.sum"
42+
- "crd/*.yaml"
3743
deps:
3844
- components/common-go:lib
45+
- components/ws-manager-api/go:lib
46+
- components/ws-manager-mk2:crd
47+
prep:
48+
- ["mv", "_deps/components-ws-manager-mk2--crd/workspace.gitpod.io_workspaces.yaml", "crd/workspace.gitpod.io_workspaces.yaml"]

Diff for: components/node-labeler/cmd/root.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ var (
3232
// rootCmd represents the base command when called without any subcommands
3333
var rootCmd = &cobra.Command{
3434
Use: ServiceName,
35-
Short: "node-labeler is in charge of maintining the node labels that workspaces require to run in a node",
35+
Short: "node-labeler is in charge of maintaining the node labels that workspaces require to run in a node",
3636
PersistentPreRun: func(cmd *cobra.Command, args []string) {
3737
log.Init(ServiceName, Version, jsonLog, verbose)
3838
},

Diff for: components/node-labeler/cmd/run.go

+241-5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"time"
1616

1717
"github.com/bombsimon/logrusr/v2"
18+
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
1819
"github.com/spf13/cobra"
1920
corev1 "k8s.io/api/core/v1"
2021
"k8s.io/apimachinery/pkg/api/errors"
@@ -31,7 +32,9 @@ import (
3132
"sigs.k8s.io/controller-runtime/pkg/cache"
3233
"sigs.k8s.io/controller-runtime/pkg/client"
3334
"sigs.k8s.io/controller-runtime/pkg/controller"
35+
"sigs.k8s.io/controller-runtime/pkg/event"
3436
"sigs.k8s.io/controller-runtime/pkg/healthz"
37+
"sigs.k8s.io/controller-runtime/pkg/manager"
3538
"sigs.k8s.io/controller-runtime/pkg/metrics"
3639
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3740
"sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -78,16 +81,16 @@ var runCmd = &cobra.Command{
7881
LeaderElectionID: "node-labeler.gitpod.io",
7982
})
8083
if err != nil {
81-
log.WithError(err).Fatal("unable to start node-labeber")
84+
log.WithError(err).Fatal("unable to start node-labeler")
8285
}
8386

84-
client, err := client.New(ctrl.GetConfigOrDie(), client.Options{})
87+
kClient, err := client.New(ctrl.GetConfigOrDie(), client.Options{})
8588
if err != nil {
8689
log.WithError(err).Fatal("unable to create client")
8790
}
8891

8992
r := &PodReconciler{
90-
client,
93+
kClient,
9194
}
9295

9396
componentPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
@@ -110,6 +113,36 @@ var runCmd = &cobra.Command{
110113
log.WithError(err).Fatal("unable to bind controller watch event handler")
111114
}
112115

116+
if err := mgr.GetFieldIndexer().IndexField(context.Background(), &workspacev1.Workspace{}, "status.runtime.nodeName", func(o client.Object) []string {
117+
ws := o.(*workspacev1.Workspace)
118+
if ws.Status.Runtime == nil {
119+
return nil
120+
}
121+
return []string{ws.Status.Runtime.NodeName}
122+
}); err != nil {
123+
log.WithError(err).Fatal("unable to create workspace indexer")
124+
return
125+
}
126+
127+
nsac, err := NewNodeScaledownAnnotationController(mgr.GetClient())
128+
if err != nil {
129+
log.WithError(err).Fatal("unable to create node scaledown annotation controller")
130+
}
131+
err = nsac.SetupWithManager(mgr)
132+
if err != nil {
133+
log.WithError(err).Fatal("unable to bind node scaledown annotation controller")
134+
}
135+
136+
err = mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
137+
<-ctx.Done()
138+
log.Info("Received shutdown signal - stopping NodeScaledownAnnotationController")
139+
nsac.Stop()
140+
return nil
141+
}))
142+
if err != nil {
143+
log.WithError(err).Fatal("couldn't properly clean up node scaledown annotation controller")
144+
}
145+
113146
metrics.Registry.MustRegister(NodeLabelerCounterVec)
114147
metrics.Registry.MustRegister(NodeLabelerTimeHistVec)
115148

@@ -123,10 +156,10 @@ var runCmd = &cobra.Command{
123156
log.WithError(err).Fatal("unable to set up ready check")
124157
}
125158

126-
log.Info("starting node-labeber")
159+
log.Info("starting node-labeler")
127160
err = mgr.Start(ctrl.SetupSignalHandler())
128161
if err != nil {
129-
log.WithError(err).Fatal("problem running node-labeber")
162+
log.WithError(err).Fatal("problem running node-labeler")
130163
}
131164

132165
log.Info("Received SIGINT - shutting down")
@@ -135,6 +168,8 @@ var runCmd = &cobra.Command{
135168

136169
func init() {
137170
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
171+
utilruntime.Must(workspacev1.AddToScheme(scheme))
172+
138173
rootCmd.AddCommand(runCmd)
139174
}
140175

@@ -249,6 +284,207 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
249284
return reconcile.Result{}, nil
250285
}
251286

287+
type NodeScaledownAnnotationController struct {
288+
client.Client
289+
nodesToReconcile chan string
290+
stopChan chan struct{}
291+
}
292+
293+
func NewNodeScaledownAnnotationController(client client.Client) (*NodeScaledownAnnotationController, error) {
294+
controller := &NodeScaledownAnnotationController{
295+
Client: client,
296+
nodesToReconcile: make(chan string, 1000),
297+
stopChan: make(chan struct{}),
298+
}
299+
300+
return controller, nil
301+
}
302+
303+
func (c *NodeScaledownAnnotationController) SetupWithManager(mgr ctrl.Manager) error {
304+
go c.reconciliationWorker()
305+
go c.periodicReconciliation()
306+
307+
return ctrl.NewControllerManagedBy(mgr).
308+
Named("node-scaledown-annotation-controller").
309+
For(&workspacev1.Workspace{}).
310+
WithEventFilter(c.workspaceFilter()).
311+
Complete(c)
312+
}
313+
314+
// periodicReconciliation periodically reconciles all nodes in the cluster
315+
func (c *NodeScaledownAnnotationController) periodicReconciliation() {
316+
ticker := time.NewTicker(5 * time.Minute)
317+
defer ticker.Stop()
318+
319+
for {
320+
select {
321+
case <-ticker.C:
322+
log.Info("starting periodic full reconciliation")
323+
ctx := context.Background()
324+
if _, err := c.reconcileAllNodes(ctx); err != nil {
325+
log.WithError(err).Error("periodic reconciliation failed")
326+
}
327+
case <-c.stopChan:
328+
log.Info("stopping periodic full reconciliation")
329+
return
330+
}
331+
}
332+
}
333+
334+
// reconciliationWorker consumes nodesToReconcile and reconciles each node
335+
func (c *NodeScaledownAnnotationController) reconciliationWorker() {
336+
log.Info("reconciliation worker started")
337+
for {
338+
select {
339+
case nodeName := <-c.nodesToReconcile:
340+
ctx := context.Background()
341+
if err := c.reconcileNode(ctx, nodeName); err != nil {
342+
log.WithError(err).WithField("node", nodeName).Error("failed to reconcile node from queue")
343+
}
344+
case <-c.stopChan:
345+
log.Info("reconciliation worker stopping")
346+
return
347+
}
348+
}
349+
}
350+
351+
func (c *NodeScaledownAnnotationController) workspaceFilter() predicate.Predicate {
352+
return predicate.Funcs{
353+
CreateFunc: func(e event.CreateEvent) bool {
354+
ws := e.Object.(*workspacev1.Workspace)
355+
if ws.Status.Runtime == nil {
356+
log.WithField("workspace", ws.Name).Info("workspace not ready yet")
357+
return false
358+
}
359+
360+
return ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != ""
361+
},
362+
UpdateFunc: func(e event.UpdateEvent) bool {
363+
wsOld := e.ObjectOld.(*workspacev1.Workspace)
364+
ws := e.ObjectNew.(*workspacev1.Workspace)
365+
// if we haven't seen runtime info before and now it's there, let's reconcile.
366+
// similarly, if the node name changed, we need to reconcile the old node as well.
367+
if (wsOld.Status.Runtime == nil && ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "") || // we just got runtime info
368+
(wsOld.Status.Runtime != nil && ws.Status.Runtime != nil && wsOld.Status.Runtime.NodeName != ws.Status.Runtime.NodeName) { // node name changed
369+
if wsOld.Status.Runtime != nil && wsOld.Status.Runtime.NodeName != "" {
370+
c.queueNodeForReconciliation(wsOld.Status.Runtime.NodeName)
371+
}
372+
return true
373+
}
374+
375+
return false
376+
},
377+
DeleteFunc: func(e event.DeleteEvent) bool {
378+
ws := e.Object.(*workspacev1.Workspace)
379+
if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
380+
c.queueNodeForReconciliation(ws.Status.Runtime.NodeName)
381+
return true
382+
}
383+
return false
384+
},
385+
}
386+
}
387+
388+
func (c *NodeScaledownAnnotationController) queueNodeForReconciliation(nodeName string) {
389+
select {
390+
case c.nodesToReconcile <- nodeName:
391+
log.WithField("node", nodeName).Info("queued node for reconciliation")
392+
default:
393+
log.WithField("node", nodeName).Warn("reconciliation queue full")
394+
}
395+
}
396+
397+
func (c *NodeScaledownAnnotationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
398+
log.WithField("request", req.NamespacedName.String()).Info("WorkspaceCountController reconciling")
399+
400+
var ws workspacev1.Workspace
401+
if err := c.Get(ctx, req.NamespacedName, &ws); err != nil {
402+
if !errors.IsNotFound(err) {
403+
log.WithError(err).WithField("workspace", req.NamespacedName).Error("unable to fetch Workspace")
404+
return ctrl.Result{}, err
405+
}
406+
return ctrl.Result{}, nil
407+
}
408+
409+
if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
410+
c.queueNodeForReconciliation(ws.Status.Runtime.NodeName)
411+
}
412+
413+
log.WithField("runtime", ws.Status.Runtime).Warn("reconciling object with no Runtime/NodeName, which wasn't filtered out by workspaceFilter")
414+
return ctrl.Result{}, nil
415+
}
416+
417+
// Cleanup method to be called when shutting down the controller
418+
func (wc *NodeScaledownAnnotationController) Stop() {
419+
close(wc.stopChan)
420+
}
421+
422+
func (c *NodeScaledownAnnotationController) reconcileAllNodes(ctx context.Context) (ctrl.Result, error) {
423+
var nodes corev1.NodeList
424+
if err := c.List(ctx, &nodes); err != nil {
425+
log.WithError(err).Error("failed to list nodes")
426+
return ctrl.Result{}, err
427+
}
428+
429+
for _, node := range nodes.Items {
430+
c.queueNodeForReconciliation(node.Name)
431+
}
432+
433+
return ctrl.Result{}, nil
434+
}
435+
436+
func (c *NodeScaledownAnnotationController) reconcileNode(ctx context.Context, nodeName string) error {
437+
var workspaceList workspacev1.WorkspaceList
438+
if err := c.List(ctx, &workspaceList, client.MatchingFields{
439+
"status.runtime.nodeName": nodeName,
440+
}); err != nil {
441+
return fmt.Errorf("failed to list workspaces: %w", err)
442+
}
443+
444+
log.WithField("node", nodeName).WithField("count", len(workspaceList.Items)).Info("acting on workspaces")
445+
count := len(workspaceList.Items)
446+
447+
return c.updateNodeAnnotation(ctx, nodeName, count)
448+
}
449+
450+
func (c *NodeScaledownAnnotationController) updateNodeAnnotation(ctx context.Context, nodeName string, count int) error {
451+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
452+
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
453+
defer cancel()
454+
455+
var node corev1.Node
456+
err := c.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
457+
if err != nil {
458+
return fmt.Errorf("obtaining node %s: %w", nodeName, err)
459+
}
460+
461+
shouldDisableScaleDown := count > 0
462+
currentlyDisabled := false
463+
if val, exists := node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"]; exists {
464+
currentlyDisabled = val == "true"
465+
}
466+
467+
// Only update if the state needs to change
468+
if shouldDisableScaleDown != currentlyDisabled {
469+
if node.Annotations == nil {
470+
node.Annotations = make(map[string]string)
471+
}
472+
473+
if shouldDisableScaleDown {
474+
node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] = "true"
475+
log.WithField("nodeName", nodeName).Info("disabling scale-down for node")
476+
} else {
477+
delete(node.Annotations, "cluster-autoscaler.kubernetes.io/scale-down-disabled")
478+
log.WithField("nodeName", nodeName).Info("enabling scale-down for node")
479+
}
480+
481+
return c.Update(ctx, &node)
482+
}
483+
484+
return nil
485+
})
486+
}
487+
252488
func updateLabel(label string, add bool, nodeName string, client client.Client) error {
253489
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
254490
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)

0 commit comments

Comments
 (0)