From f3833573a233cec040b569e02fc2e197b8cef30c Mon Sep 17 00:00:00 2001 From: akutz Date: Tue, 18 Feb 2025 08:10:14 -0600 Subject: [PATCH] Do deployment rollout when capabilities change This patch changes the behavior of how the pod is exited when the capabilities have changed. Instead of all the replicas exiting at the same time, the exits are managed by causing the pod's deployment to rollout a new set of pods. This ensures that at least one pod is always online to respond to webhook requests. This patch also ensures the capabilities controllers are run only on the leader. --- .golangci.yml | 4 +- config/default/manager_pod_info_patch.yaml | 2 + config/default/manager_webhook_patch.yaml | 1 - .../local/vmoperator/local_env_var_patch.yaml | 2 + config/manager/manager.yaml | 2 - config/rbac/role.yaml | 7 + config/replacements/kustomization.yaml | 15 + .../wcp/vmoperator/manager_env_var_patch.yaml | 6 + .../configmap_capability_controller.go | 35 +- ...figmap_capability_controller_suite_test.go | 16 +- .../configmap_capability_controller_test.go | 124 +++++-- .../crd/crd_capability_controller.go | 25 +- .../crd_capability_controller_suite_test.go | 16 +- .../crd/crd_capability_controller_test.go | 134 ++++++-- controllers/infra/capability/exit/exit.go | 17 - main.go | 21 ++ pkg/config/capabilities/capabilities_test.go | 104 +++++- pkg/config/capabilities/capabilties.go | 147 +++++--- pkg/config/config.go | 4 + pkg/config/default.go | 2 + pkg/config/env.go | 2 + pkg/config/env/env.go | 6 + pkg/config/env_test.go | 4 + pkg/constants/constants.go | 14 + pkg/exit/exit.go | 180 ++++++++++ pkg/exit/exit_suite_test.go | 17 + pkg/exit/exit_test.go | 317 ++++++++++++++++++ pkg/manager/manager.go | 21 +- 28 files changed, 1054 insertions(+), 191 deletions(-) delete mode 100644 controllers/infra/capability/exit/exit.go create mode 100644 pkg/exit/exit.go create mode 100644 pkg/exit/exit_suite_test.go create mode 100644 pkg/exit/exit_test.go diff --git a/.golangci.yml b/.golangci.yml index 0bf5244c2..ad749fde3 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -85,7 +85,9 @@ linters-settings: - alias: pkgctx pkg: github.com/vmware-tanzu/vm-operator/pkg/context - alias: pkgerr - pkg: github.com/vmware-tanzu/vm-operator/pkg/pkgerr + pkg: github.com/vmware-tanzu/vm-operator/pkg/errors + - alias: pkgexit + pkg: github.com/vmware-tanzu/vm-operator/pkg/exit - alias: ctxop pkg: github.com/vmware-tanzu/vm-operator/pkg/context/operation - alias: pkgmgr diff --git a/config/default/manager_pod_info_patch.yaml b/config/default/manager_pod_info_patch.yaml index e28dc445c..7e3020419 100644 --- a/config/default/manager_pod_info_patch.yaml +++ b/config/default/manager_pod_info_patch.yaml @@ -9,6 +9,8 @@ spec: containers: - name: manager env: + - name: DEPLOYMENT_NAME + value: DEPLOYMENT_NAME - name: POD_NAMESPACE valueFrom: fieldRef: diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml index eceb3a307..bbd263384 100644 --- a/config/default/manager_webhook_patch.yaml +++ b/config/default/manager_webhook_patch.yaml @@ -6,7 +6,6 @@ metadata: spec: template: spec: - # There's an assumption in kustomization.yaml that manager is container[0] containers: - name: manager env: diff --git a/config/local/vmoperator/local_env_var_patch.yaml b/config/local/vmoperator/local_env_var_patch.yaml index 71771fce2..e55e3252e 100644 --- a/config/local/vmoperator/local_env_var_patch.yaml +++ b/config/local/vmoperator/local_env_var_patch.yaml @@ -9,6 +9,8 @@ spec: containers: - name: manager env: + - name: SIGUSR2_RESTART_ENABLED + value: "false" - name: ASYNC_SIGNAL_ENABLED value: "true" - name: ASYNC_CREATE_ENABLED diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index ed56fb8e5..f378d8ca1 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -27,8 +27,6 @@ spec: name: vmoperator-controller-manager spec: containers: - # There's an assumption in manager_webhook_patch.yaml that manager is container[0] - # If new containers are added, please bear this in mind - command: - /manager image: controller:latest diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 3704c232b..99960780d 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -55,6 +55,13 @@ rules: - get - list - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - get + - patch - apiGroups: - cns.vmware.com resources: diff --git a/config/replacements/kustomization.yaml b/config/replacements/kustomization.yaml index 8ba5a5812..24b69f505 100644 --- a/config/replacements/kustomization.yaml +++ b/config/replacements/kustomization.yaml @@ -330,3 +330,18 @@ replacements: version: v1 kind: CustomResourceDefinition name: virtualmachinesetresourcepolicies.vmoperator.vmware.com + +# DEPLOYMENT_NAME +- source: + fieldPath: metadata.name + group: apps + version: v1 + kind: Deployment + namespace: system + name: controller-manager + targets: + - select: + kind: Deployment + name: controller-manager + fieldPaths: + - spec.template.spec.containers.[name=manager].env.[name=DEPLOYMENT_NAME].value diff --git a/config/wcp/vmoperator/manager_env_var_patch.yaml b/config/wcp/vmoperator/manager_env_var_patch.yaml index ed8e053f9..c15cb0856 100644 --- a/config/wcp/vmoperator/manager_env_var_patch.yaml +++ b/config/wcp/vmoperator/manager_env_var_patch.yaml @@ -1,3 +1,9 @@ +- op: add + path: /spec/template/spec/containers/0/env/- + value: + name: SIGUSR2_RESTART_ENABLED + value: "false" + - op: add path: /spec/template/spec/containers/0/env/- value: diff --git a/controllers/infra/capability/configmap/configmap_capability_controller.go b/controllers/infra/capability/configmap/configmap_capability_controller.go index 954ef6c22..98f459f53 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller.go @@ -1,6 +1,6 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability @@ -20,14 +20,13 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" pkgmgr "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/pkg/record" kubeutil "github.com/vmware-tanzu/vm-operator/pkg/util/kube" - "github.com/vmware-tanzu/vm-operator/pkg/util/ptr" ) // AddToManager adds this package's controller to the provided manager. @@ -50,6 +49,7 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err r := NewReconciler( ctx, + mgr.GetClient(), cache, ctrl.Log.WithName("controllers").WithName(controllerName), record.New(mgr.GetEventRecorderFor(controllerNameLong)), @@ -60,7 +60,6 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err c, err := controller.New(controllerName, mgr, controller.Options{ Reconciler: r, MaxConcurrentReconciles: 1, - NeedLeaderElection: ptr.To(false), }) if err != nil { return err @@ -87,13 +86,15 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err func NewReconciler( ctx context.Context, - client ctrlclient.Reader, + client ctrlclient.Client, + reader ctrlclient.Reader, logger logr.Logger, recorder record.Recorder) *Reconciler { return &Reconciler{ Context: ctx, Client: client, + Reader: reader, Logger: logger, Recorder: recorder, } @@ -101,7 +102,8 @@ func NewReconciler( type Reconciler struct { Context context.Context - Client ctrlclient.Reader + Client ctrlclient.Client + Reader ctrlclient.Reader Logger logr.Logger Recorder record.Recorder } @@ -112,16 +114,25 @@ func (r *Reconciler) Reconcile( ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + r.Logger.Info("Reconciling capabilities") + ctx = pkgcfg.JoinContext(ctx, r.Context) + ctx = logr.NewContext(ctx, r.Logger) var obj corev1.ConfigMap - if err := r.Client.Get(ctx, req.NamespacedName, &obj); err != nil { + if err := r.Reader.Get(ctx, req.NamespacedName, &obj); err != nil { return ctrl.Result{}, ctrlclient.IgnoreNotFound(err) } - if capabilities.UpdateCapabilitiesFeatures(ctx, obj) { - r.Logger.Info("killing pod due to changed capabilities") - exit.Exit() + if diff, ok := capabilities.WouldUpdateCapabilitiesFeatures(ctx, obj); ok { + if err := pkgexit.Restart( + ctx, + r.Client, + fmt.Sprintf("capabilities have changed: %s", diff)); err != nil { + + r.Logger.Error(err, "Failed to exit due to capability change") + return ctrl.Result{}, err + } } return ctrl.Result{}, nil diff --git a/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go b/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go index 3f838c9df..a5633df4a 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go @@ -1,30 +1,20 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability_test import ( - "sync/atomic" "testing" . "github.com/onsi/ginkgo/v2" capability "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/configmap" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/test/builder" ) -var numExits int32 - -func init() { - exit.Exit = func() { - atomic.AddInt32(&numExits, 1) - } -} - var suite = builder.NewTestSuiteForControllerWithContext( pkgcfg.UpdateContext( pkgcfg.NewContextWithDefaultConfig(), diff --git a/controllers/infra/capability/configmap/configmap_capability_controller_test.go b/controllers/infra/capability/configmap/configmap_capability_controller_test.go index 18c19b601..5cf249f08 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller_test.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller_test.go @@ -1,22 +1,24 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability_test import ( - "sync/atomic" "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" + pkgconst "github.com/vmware-tanzu/vm-operator/pkg/constants" "github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels" "github.com/vmware-tanzu/vm-operator/test/builder" ) @@ -28,15 +30,17 @@ var _ = Describe( testlabels.EnvTest, testlabels.V1Alpha3, ), + Ordered, func() { var ( - ctx *builder.IntegrationTestContext - obj *corev1.ConfigMap + ctx *builder.IntegrationTestContext + obj *corev1.ConfigMap + dep appsv1.Deployment + depKey ctrlclient.ObjectKey ) BeforeEach(func() { - atomic.StoreInt32(&numExits, 0) ctx = suite.NewIntegrationTestContext() obj = &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -44,6 +48,49 @@ var _ = Describe( Namespace: capabilities.ConfigMapNamespace, }, } + + dep = appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "vmop-deployment-", + Namespace: ctx.PodNamespace, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "controller-manager", + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-manager", + Namespace: ctx.PodNamespace, + Labels: map[string]string{ + "app": "controller-manager", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "controller-manager", + Image: "vmop:latest", + }, + }, + }, + }, + }, + } + + Expect(ctx.Client.Create(ctx, &dep)).To(Succeed()) + + depKey = ctrlclient.ObjectKeyFromObject(&dep) + + pkgcfg.SetContext( + suite.Context, + func(config *pkgcfg.Config) { + config.DeploymentName = dep.Name + config.PodNamespace = ctx.PodNamespace + }, + ) }) JustBeforeEach(func() { @@ -76,13 +123,22 @@ var _ = Describe( }) Specify("the pod was exited", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(1))) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).ToNot(BeEmpty()) + g.Expect(lastExitReason).To(Equal("capabilities have changed: TKGMultipleCL=true")) }, time.Second*5).Should(Succeed()) }) - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) - }, time.Second*5).Should(Succeed()) + When("there is an error getting the deployment", func() { + BeforeEach(func() { + Expect(ctx.Client.Delete(ctx, &dep)).To(Succeed()) + }) + Specify("coverage", FlakeAttempts(5), func() { + // No test, just for coverage + }) }) }) @@ -101,18 +157,20 @@ var _ = Describe( }) Specify("the pod was exited", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(1))) - }, time.Second*5).Should(Succeed()) - }) - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeFalse()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: TKGMultipleCL=false")) }, time.Second*5).Should(Succeed()) }) }) }) When("capabilities have changed twice", func() { + var lastExitTimeStr1 string + BeforeEach(func() { pkgcfg.SetContext( suite.Context, @@ -128,24 +186,34 @@ var _ = Describe( JustBeforeEach(func() { Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr1 = dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr1).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: TKGMultipleCL=true")) }, time.Second*5).Should(Succeed()) + pkgcfg.SetContext(suite.Context, func(config *pkgcfg.Config) { + config.Features.TKGMultipleCL = true + }) + obj.Data[capabilities.CapabilityKeyTKGMultipleContentLibraries] = "false" Expect(ctx.Client.Update(ctx, obj)).To(Succeed()) }) Specify("the pod was exited once on create and once on update", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(2))) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr2 := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr2).ToNot(BeEmpty()) + g.Expect(lastExitTimeStr2).ToNot(Equal(lastExitTimeStr1)) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: TKGMultipleCL=false")) }, time.Second*5).Should(Succeed()) }) - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeFalse()) - }, time.Second*5).Should(Succeed()) - }) }) When("the capabilities have not changed", func() { @@ -164,8 +232,10 @@ var _ = Describe( }) Specify("the pod was not exited and features were not updated", func() { Consistently(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(0))) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeFalse()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + g.Expect(dep.Spec.Template.Annotations).ToNot(HaveKey(pkgconst.LastRestartTimeAnnotationKey)) + g.Expect(dep.Spec.Template.Annotations).ToNot(HaveKey(pkgconst.LastRestartReasonAnnotationKey)) }, time.Second*3).Should(Succeed()) }) }) diff --git a/controllers/infra/capability/crd/crd_capability_controller.go b/controllers/infra/capability/crd/crd_capability_controller.go index 5a5348481..027fd534d 100644 --- a/controllers/infra/capability/crd/crd_capability_controller.go +++ b/controllers/infra/capability/crd/crd_capability_controller.go @@ -1,6 +1,6 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability @@ -16,13 +16,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" capv1 "github.com/vmware-tanzu/vm-operator/external/capabilities/api/v1alpha1" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" "github.com/vmware-tanzu/vm-operator/pkg/record" - "github.com/vmware-tanzu/vm-operator/pkg/util/ptr" ) // AddToManager adds this package's controller to the provided manager. @@ -57,7 +56,6 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err WithEventFilter(predicate.ResourceVersionChangedPredicate{}). WithOptions(controller.Options{ MaxConcurrentReconciles: 1, - NeedLeaderElection: ptr.To(false), }). Complete(r) } @@ -90,16 +88,25 @@ func (r *Reconciler) Reconcile( ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + r.Logger.Info("Reconciling capabilities") + ctx = pkgcfg.JoinContext(ctx, r.Context) + ctx = logr.NewContext(ctx, r.Logger) var obj capv1.Capabilities if err := r.Client.Get(ctx, req.NamespacedName, &obj); err != nil { return ctrl.Result{}, ctrlclient.IgnoreNotFound(err) } - if capabilities.UpdateCapabilitiesFeatures(ctx, obj) { - r.Logger.Info("killing pod due to changed capabilities") - exit.Exit() + if diff, ok := capabilities.WouldUpdateCapabilitiesFeatures(ctx, obj); ok { + if err := pkgexit.Restart( + ctx, + r.Client, + fmt.Sprintf("capabilities have changed: %s", diff)); err != nil { + + r.Logger.Error(err, "Failed to exit due to capability change") + return ctrl.Result{}, err + } } return ctrl.Result{}, nil diff --git a/controllers/infra/capability/crd/crd_capability_controller_suite_test.go b/controllers/infra/capability/crd/crd_capability_controller_suite_test.go index a5418ae26..33f2c1c69 100644 --- a/controllers/infra/capability/crd/crd_capability_controller_suite_test.go +++ b/controllers/infra/capability/crd/crd_capability_controller_suite_test.go @@ -1,30 +1,20 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability_test import ( - "sync/atomic" "testing" . "github.com/onsi/ginkgo/v2" capability "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/crd" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/test/builder" ) -var numExits int32 - -func init() { - exit.Exit = func() { - atomic.AddInt32(&numExits, 1) - } -} - var suite = builder.NewTestSuiteForControllerWithContext( pkgcfg.UpdateContext( pkgcfg.NewContextWithDefaultConfig(), diff --git a/controllers/infra/capability/crd/crd_capability_controller_test.go b/controllers/infra/capability/crd/crd_capability_controller_test.go index 6ce84a79b..df912aff6 100644 --- a/controllers/infra/capability/crd/crd_capability_controller_test.go +++ b/controllers/infra/capability/crd/crd_capability_controller_test.go @@ -1,22 +1,25 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 package capability_test import ( - "sync/atomic" "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" capv1 "github.com/vmware-tanzu/vm-operator/external/capabilities/api/v1alpha1" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" + pkgconst "github.com/vmware-tanzu/vm-operator/pkg/constants" "github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels" "github.com/vmware-tanzu/vm-operator/test/builder" ) @@ -28,16 +31,18 @@ var _ = Describe( testlabels.EnvTest, testlabels.V1Alpha3, ), + Ordered, func() { var ( ctx *builder.IntegrationTestContext obj *capv1.Capabilities status capv1.CapabilitiesStatus + dep appsv1.Deployment + depKey ctrlclient.ObjectKey ) BeforeEach(func() { - atomic.StoreInt32(&numExits, 0) ctx = suite.NewIntegrationTestContext() status = capv1.CapabilitiesStatus{} obj = &capv1.Capabilities{ @@ -45,6 +50,49 @@ var _ = Describe( Name: capabilities.CapabilitiesName, }, } + + dep = appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "vmop-deployment-", + Namespace: ctx.PodNamespace, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "controller-manager", + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-manager", + Namespace: ctx.PodNamespace, + Labels: map[string]string{ + "app": "controller-manager", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "controller-manager", + Image: "vmop:latest", + }, + }, + }, + }, + }, + } + + Expect(ctx.Client.Create(ctx, &dep)).To(Succeed()) + + depKey = ctrlclient.ObjectKeyFromObject(&dep) + + pkgcfg.SetContext( + suite.Context, + func(config *pkgcfg.Config) { + config.DeploymentName = dep.Name + config.PodNamespace = ctx.PodNamespace + }, + ) }) JustBeforeEach(func() { @@ -89,15 +137,22 @@ var _ = Describe( }) Specify("the pod was exited", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(1))) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: BringYourOwnEncryptionKey=true,TKGMultipleCL=true,WorkloadDomainIsolation=true")) }, time.Second*5).Should(Succeed()) }) - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeTrue()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.WorkloadDomainIsolation).To(BeTrue()) - }, time.Second*5).Should(Succeed()) + + When("there is an error getting the deployment", func() { + BeforeEach(func() { + Expect(ctx.Client.Delete(ctx, &dep)).To(Succeed()) + }) + Specify("coverage", FlakeAttempts(5), func() { + // No test, just for coverage + }) }) }) @@ -126,20 +181,20 @@ var _ = Describe( }) Specify("the pod was exited", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(1))) - }, time.Second*5).Should(Succeed()) - }) - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeFalse()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeFalse()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.WorkloadDomainIsolation).To(BeFalse()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: BringYourOwnEncryptionKey=false,TKGMultipleCL=false,WorkloadDomainIsolation=false")) }, time.Second*5).Should(Succeed()) }) }) }) When("capabilities have changed twice", func() { + var lastExitTimeStr1 string + BeforeEach(func() { pkgcfg.SetContext( suite.Context, @@ -165,11 +220,20 @@ var _ = Describe( JustBeforeEach(func() { Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeTrue()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.WorkloadDomainIsolation).To(BeTrue()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr1 = dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr1).ToNot(BeEmpty()) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: BringYourOwnEncryptionKey=true,TKGMultipleCL=true,WorkloadDomainIsolation=true")) }, time.Second*5).Should(Succeed()) + pkgcfg.SetContext(suite.Context, func(config *pkgcfg.Config) { + config.Features.BringYourOwnEncryptionKey = true + config.Features.TKGMultipleCL = true + config.Features.WorkloadDomainIsolation = true + }) + obj.Status.Supervisor[capabilities.CapabilityKeyBringYourOwnKeyProvider] = capv1.CapabilityStatus{ Activated: false, } @@ -178,15 +242,13 @@ var _ = Describe( Specify("the pod was exited once on create and once on update", func() { Eventually(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(2))) - }, time.Second*5).Should(Succeed()) - }) - - Specify("feature states should be updated", func() { - Eventually(func(g Gomega) { - g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeFalse()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.WorkloadDomainIsolation).To(BeTrue()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + lastExitTimeStr2 := dep.Spec.Template.Annotations[pkgconst.LastRestartTimeAnnotationKey] + g.Expect(lastExitTimeStr2).ToNot(BeEmpty()) + g.Expect(lastExitTimeStr2).ToNot(Equal(lastExitTimeStr1)) + lastExitReason := dep.Spec.Template.Annotations[pkgconst.LastRestartReasonAnnotationKey] + g.Expect(lastExitReason).To(Equal("capabilities have changed: BringYourOwnEncryptionKey=false")) }, time.Second*5).Should(Succeed()) }) }) @@ -216,10 +278,10 @@ var _ = Describe( }) Specify("the pod was not exited and features were not updated", func() { Consistently(func(g Gomega) { - g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(0))) - g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeFalse()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeFalse()) - g.Expect(pkgcfg.FromContext(suite.Context).Features.WorkloadDomainIsolation).To(BeFalse()) + var dep appsv1.Deployment + g.Expect(ctx.Client.Get(ctx, depKey, &dep)).To(Succeed()) + g.Expect(dep.Spec.Template.Annotations).ToNot(HaveKey(pkgconst.LastRestartTimeAnnotationKey)) + g.Expect(dep.Spec.Template.Annotations).ToNot(HaveKey(pkgconst.LastRestartReasonAnnotationKey)) }, time.Second*3).Should(Succeed()) }) }) diff --git a/controllers/infra/capability/exit/exit.go b/controllers/infra/capability/exit/exit.go deleted file mode 100644 index 7c279c45b..000000000 --- a/controllers/infra/capability/exit/exit.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. - -package exit - -import ( - "os" -) - -// Exit is used in testing to assert the capability controller exits the process -// when the capabilities have changed. -var Exit func() - -func init() { - Exit = func() { os.Exit(1) } -} diff --git a/main.go b/main.go index aa3081a3b..3540ccff7 100644 --- a/main.go +++ b/main.go @@ -12,6 +12,7 @@ import ( "path" "time" + "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" @@ -34,6 +35,7 @@ import ( pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" pkgmgr "github.com/vmware-tanzu/vm-operator/pkg/manager" pkgmgrinit "github.com/vmware-tanzu/vm-operator/pkg/manager/init" "github.com/vmware-tanzu/vm-operator/pkg/mem" @@ -90,6 +92,8 @@ func main() { initWebhookServer() + initSIGUSR2RestartHandler() + setupLog.Info("Starting controller manager") sigHandler := ctrlsig.SetupSignalHandler() if err := mgr.Start(sigHandler); err != nil { @@ -380,3 +384,20 @@ func initWebhookServer() { os.Exit(1) } } + +func initSIGUSR2RestartHandler() { + setupLog.Info("SIGUSR2 restart handler", + "enabled", defaultConfig.SIGUSR2RestartEnabled) + + if !defaultConfig.SIGUSR2RestartEnabled { + return + } + + // Allow the pod to restart via pkg/exit.Restart when SIGUSR2 is received. + // This simulates the behavior when capabilities are changed and the pod + // is the leader. + _ = pkgexit.NewRestartSignalHandler( + logr.NewContext(ctx, setupLog), + mgr.GetClient(), + mgr.Elected()) +} diff --git a/pkg/config/capabilities/capabilities_test.go b/pkg/config/capabilities/capabilities_test.go index ebb5b249b..1df2d9a75 100644 --- a/pkg/config/capabilities/capabilities_test.go +++ b/pkg/config/capabilities/capabilities_test.go @@ -248,12 +248,16 @@ var _ = Describe("UpdateCapabilities", func() { var _ = Describe("UpdateCapabilitiesFeatures", func() { var ( - ctx context.Context + ctx context.Context + ok bool + diff string ) BeforeEach(func() { ctx = pkgcfg.NewContext() ctx = logr.NewContext(ctx, logf.Log) + + ok, diff = false, "" }) When("obj is map[string]string", func() { @@ -264,7 +268,7 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj = map[string]string{} }) JustBeforeEach(func() { - capabilities.UpdateCapabilitiesFeatures(ctx, obj) + diff, ok = capabilities.UpdateCapabilitiesFeatures(ctx, obj) }) Context(capabilities.CapabilityKeyTKGMultipleContentLibraries, func() { BeforeEach(func() { @@ -272,6 +276,8 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj[capabilities.CapabilityKeyTKGMultipleContentLibraries] = trueString }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("TKGMultipleCL=true")) Expect(pkgcfg.FromContext(ctx).Features.TKGMultipleCL).To(BeTrue()) }) }) @@ -288,6 +294,8 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj[capabilities.CapabilityKeyWorkloadIsolation] = trueString }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("WorkloadDomainIsolation=true")) Expect(pkgcfg.FromContext(ctx).Features.WorkloadDomainIsolation).To(BeTrue()) }) }) @@ -302,7 +310,7 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj.Data = map[string]string{} }) JustBeforeEach(func() { - capabilities.UpdateCapabilitiesFeatures(ctx, obj) + diff, ok = capabilities.UpdateCapabilitiesFeatures(ctx, obj) }) Context(capabilities.CapabilityKeyTKGMultipleContentLibraries, func() { BeforeEach(func() { @@ -326,6 +334,8 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj.Data[capabilities.CapabilityKeyWorkloadIsolation] = trueString }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("WorkloadDomainIsolation=true")) Expect(pkgcfg.FromContext(ctx).Features.WorkloadDomainIsolation).To(BeTrue()) }) }) @@ -340,7 +350,7 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { obj.Status.Supervisor = map[capv1.CapabilityName]capv1.CapabilityStatus{} }) JustBeforeEach(func() { - capabilities.UpdateCapabilitiesFeatures(ctx, obj) + diff, ok = capabilities.UpdateCapabilitiesFeatures(ctx, obj) }) Context(capabilities.CapabilityKeyBringYourOwnKeyProvider, func() { BeforeEach(func() { @@ -350,6 +360,8 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { } }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("BringYourOwnEncryptionKey=true")) Expect(pkgcfg.FromContext(ctx).Features.BringYourOwnEncryptionKey).To(BeTrue()) }) }) @@ -361,6 +373,8 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { } }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("TKGMultipleCL=true")) Expect(pkgcfg.FromContext(ctx).Features.TKGMultipleCL).To(BeTrue()) }) }) @@ -372,8 +386,90 @@ var _ = Describe("UpdateCapabilitiesFeatures", func() { } }) Specify("Enabled", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("WorkloadDomainIsolation=true")) + Expect(pkgcfg.FromContext(ctx).Features.WorkloadDomainIsolation).To(BeTrue()) + }) + }) + }) +}) + +var _ = Describe("WouldUpdateCapabilitiesFeatures", func() { + var ( + ctx context.Context + obj capv1.Capabilities + + ok bool + diff string + ) + + BeforeEach(func() { + ctx = pkgcfg.NewContext() + ctx = logr.NewContext(ctx, logf.Log) + obj.Status.Supervisor = map[capv1.CapabilityName]capv1.CapabilityStatus{ + capabilities.CapabilityKeyBringYourOwnKeyProvider: { + Activated: true, + }, + capabilities.CapabilityKeyTKGMultipleContentLibraries: { + Activated: true, + }, + capabilities.CapabilityKeyWorkloadIsolation: { + Activated: true, + }, + } + + ok, diff = false, "" + }) + + JustBeforeEach(func() { + diff, ok = capabilities.WouldUpdateCapabilitiesFeatures(ctx, obj) + }) + + When("the resource exists", func() { + When("the capabilities are not different", func() { + BeforeEach(func() { + pkgcfg.SetContext(ctx, func(config *pkgcfg.Config) { + config.Features.BringYourOwnEncryptionKey = true + config.Features.TKGMultipleCL = true + config.Features.WorkloadDomainIsolation = true + }) + }) + Specify("capabilities did not change", func() { + Expect(ok).To(BeFalse()) + Expect(diff).To(BeEmpty()) + }) + Specify(capabilities.CapabilityKeyBringYourOwnKeyProvider, func() { + Expect(pkgcfg.FromContext(ctx).Features.BringYourOwnEncryptionKey).To(BeTrue()) + }) + Specify(capabilities.CapabilityKeyTKGMultipleContentLibraries, func() { + Expect(pkgcfg.FromContext(ctx).Features.TKGMultipleCL).To(BeTrue()) + }) + Specify(capabilities.CapabilityKeyWorkloadIsolation, func() { Expect(pkgcfg.FromContext(ctx).Features.WorkloadDomainIsolation).To(BeTrue()) }) }) + + When("the capabilities are different", func() { + BeforeEach(func() { + pkgcfg.SetContext(ctx, func(config *pkgcfg.Config) { + config.Features.BringYourOwnEncryptionKey = false + config.Features.TKGMultipleCL = false + config.Features.WorkloadDomainIsolation = false + }) + }) + Specify("capabilities changed", func() { + Expect(ok).To(BeTrue()) + Expect(diff).To(Equal("BringYourOwnEncryptionKey=true,TKGMultipleCL=true,WorkloadDomainIsolation=true")) + }) + Specify(capabilities.CapabilityKeyBringYourOwnKeyProvider, func() { + Expect(pkgcfg.FromContext(ctx).Features.BringYourOwnEncryptionKey).To(BeFalse()) + }) + Specify(capabilities.CapabilityKeyTKGMultipleContentLibraries, func() { + Expect(pkgcfg.FromContext(ctx).Features.TKGMultipleCL).To(BeFalse()) + }) + Specify(capabilities.CapabilityKeyWorkloadIsolation, func() { + Expect(pkgcfg.FromContext(ctx).Features.WorkloadDomainIsolation).To(BeFalse()) + }) + }) }) }) diff --git a/pkg/config/capabilities/capabilties.go b/pkg/config/capabilities/capabilties.go index aeaa67271..d36350545 100644 --- a/pkg/config/capabilities/capabilties.go +++ b/pkg/config/capabilities/capabilties.go @@ -6,7 +6,10 @@ package capabilities import ( "context" + "fmt" + "slices" "strconv" + "strings" "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" @@ -64,14 +67,16 @@ func UpdateCapabilities( if err := k8sClient.Get(ctx, CapabilitiesKey, &obj); err != nil { return false, err } - return UpdateCapabilitiesFeatures(ctx, obj), nil + _, ok := UpdateCapabilitiesFeatures(ctx, obj) + return ok, nil } var obj corev1.ConfigMap if err := k8sClient.Get(ctx, ConfigMapKey, &obj); err != nil { return false, err } - return UpdateCapabilitiesFeatures(ctx, obj), nil + _, ok := UpdateCapabilitiesFeatures(ctx, obj) + return ok, nil } type updateCapTypes interface { @@ -88,88 +93,128 @@ type updateCapTypes interface { // The return value indicates if any of the features changed. func UpdateCapabilitiesFeatures[T updateCapTypes]( ctx context.Context, - obj T) bool { + obj T) (string, bool) { + + return updateCapabilitiesFeatures(ctx, obj, false) +} + +// WouldUpdateCapabilitiesFeatures is like UpdateCapabilitiesFeatures but does +// not actually cause a change to the capabilities. +func WouldUpdateCapabilitiesFeatures[T updateCapTypes]( + ctx context.Context, + obj T) (string, bool) { + + return updateCapabilitiesFeatures(ctx, obj, true) +} + +func updateCapabilitiesFeatures[T updateCapTypes]( + ctx context.Context, + obj T, + dryRun bool) (string, bool) { var ( - logger = logr.FromContextOrDiscard(ctx) + newFeat pkgcfg.FeatureStates + logger = logr.FromContextOrDiscard(ctx).WithValues("dryRun", dryRun) oldFeat = pkgcfg.FromContext(ctx).Features ) + logger.Info( + "Checking if capabilities would update features", + "oldFeat", oldFeat) + switch tObj := (any)(obj).(type) { case map[string]string: - updateCapabilitiesFeaturesFromMap(ctx, tObj) + newFeat = updateCapabilitiesFeaturesFromMap(tObj, oldFeat) case corev1.ConfigMap: - updateCapabilitiesFeaturesFromMap(ctx, tObj.Data) + newFeat = updateCapabilitiesFeaturesFromMap(tObj.Data, oldFeat) case capv1.Capabilities: - updateCapabilitiesFeaturesFromCRD(ctx, tObj) + newFeat = updateCapabilitiesFeaturesFromCRD(tObj, oldFeat) } - if newFeat := pkgcfg.FromContext(ctx).Features; oldFeat != newFeat { - logger.Info( - "Updated features from capabilities", - "diff", cmp.Diff(oldFeat, newFeat)) + if oldFeat != newFeat { + if !dryRun { + pkgcfg.SetContext(ctx, func(config *pkgcfg.Config) { + config.Features = newFeat + }) + } + + // Use a custom diff reporter to get a sanitized version of the + // differences between the old and new features. + r := &diffReporter{} + _ = cmp.Equal(oldFeat, newFeat, cmp.Reporter(r)) + diff := r.String() - return true + logger.Info("Updated features from capabilities", "diff", diff) + return diff, true } - return false + logger.Info("Features not updated from capabilities") + return "", false } func updateCapabilitiesFeaturesFromMap( - ctx context.Context, - data map[string]string) { + data map[string]string, + fs pkgcfg.FeatureStates) pkgcfg.FeatureStates { - pkgcfg.SetContext(ctx, func(config *pkgcfg.Config) { - // TKGMultipleCL is unique in that it is a capability but it predates - // SVAsyncUpgrade. - config.Features.TKGMultipleCL = isEnabled(data[CapabilityKeyTKGMultipleContentLibraries]) + // TKGMultipleCL is unique in that it is a capability but it predates + // SVAsyncUpgrade. + fs.TKGMultipleCL = isEnabled(data[CapabilityKeyTKGMultipleContentLibraries]) - if config.Features.SVAsyncUpgrade { - // All other capabilities are gated by SVAsyncUpgrade. - config.Features.WorkloadDomainIsolation = isEnabled(data[CapabilityKeyWorkloadIsolation]) - } - }) + if fs.SVAsyncUpgrade { + // All other capabilities are gated by SVAsyncUpgrade. + fs.WorkloadDomainIsolation = isEnabled(data[CapabilityKeyWorkloadIsolation]) + } + + return fs } func updateCapabilitiesFeaturesFromCRD( - ctx context.Context, - obj capv1.Capabilities) { - - var ( - byok *bool - tkgMultipleCL *bool - workloadDomainIsolation *bool - ) + obj capv1.Capabilities, + fs pkgcfg.FeatureStates) pkgcfg.FeatureStates { for capName, capStatus := range obj.Status.Supervisor { switch capName { case CapabilityKeyBringYourOwnKeyProvider: - setCap(&byok, capStatus.Activated) + fs.BringYourOwnEncryptionKey = capStatus.Activated case CapabilityKeyTKGMultipleContentLibraries: - setCap(&tkgMultipleCL, capStatus.Activated) + fs.TKGMultipleCL = capStatus.Activated case CapabilityKeyWorkloadIsolation: - setCap(&workloadDomainIsolation, capStatus.Activated) + fs.WorkloadDomainIsolation = capStatus.Activated } } - - pkgcfg.SetContext(ctx, func(config *pkgcfg.Config) { - if byok != nil { - config.Features.BringYourOwnEncryptionKey = *byok - } - if tkgMultipleCL != nil { - config.Features.TKGMultipleCL = *tkgMultipleCL - } - if workloadDomainIsolation != nil { - config.Features.WorkloadDomainIsolation = *workloadDomainIsolation - } - }) -} - -func setCap(dst **bool, val bool) { - *dst = &val + return fs } func isEnabled(v string) bool { ok, _ := strconv.ParseBool(v) return ok } + +// diffReporter is a custom reporter for comparing config.Features structs +// that lists only the features that have changed using a single line of +// comma-separated values. +type diffReporter struct { + path cmp.Path + diffs []string +} + +func (r *diffReporter) PushStep(ps cmp.PathStep) { + r.path = append(r.path, ps) +} + +func (r *diffReporter) Report(rs cmp.Result) { + if !rs.Equal() { + capName := strings.TrimPrefix(r.path.Last().String(), ".") + _, newVal := r.path.Last().Values() + r.diffs = append(r.diffs, fmt.Sprintf("%s=%v", capName, newVal)) + } +} + +func (r *diffReporter) PopStep() { + r.path = r.path[:len(r.path)-1] +} + +func (r *diffReporter) String() string { + slices.Sort(r.diffs) + return strings.Join(r.diffs, ",") +} diff --git a/pkg/config/config.go b/pkg/config/config.go index e411e65f5..5f556680c 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -80,6 +80,10 @@ type Config struct { PodName string PodNamespace string PodServiceAccountName string + DeploymentName string + + // SIGUSR2RestartEnabled allows SIGUSR2 to cause the pod to restart. + SIGUSR2RestartEnabled bool // PrivilegedUsers is a comma-delimited a list of users that are, in // addition to the kube-admin and system users, treated as privileged by diff --git a/pkg/config/default.go b/pkg/config/default.go index e67f0a548..26878cdcc 100644 --- a/pkg/config/default.go +++ b/pkg/config/default.go @@ -47,6 +47,8 @@ func Default() Config { PoweredOnVMHasIPRequeueDelay: 10 * time.Second, SyncImageRequeueDelay: 10 * time.Second, NetworkProviderType: NetworkProviderTypeNamed, + SIGUSR2RestartEnabled: false, + DeploymentName: defaultPrefix + "controller-manager", PodName: defaultPrefix + "controller-manager", PodNamespace: defaultPrefix + "system", PodServiceAccountName: "default", diff --git a/pkg/config/env.go b/pkg/config/env.go index 30a71a5ff..1a5bdff5b 100644 --- a/pkg/config/env.go +++ b/pkg/config/env.go @@ -46,6 +46,8 @@ func FromEnv() Config { setDuration(env.SyncPeriod, &config.SyncPeriod) setInt(env.MaxConcurrentReconciles, &config.MaxConcurrentReconciles) setString(env.LeaderElectionID, &config.LeaderElectionID) + setBool(env.SIGUSR2RestartEnabled, &config.SIGUSR2RestartEnabled) + setString(env.DeploymentName, &config.DeploymentName) setString(env.PodName, &config.PodName) setString(env.PodNamespace, &config.PodNamespace) setString(env.PodServiceAccountName, &config.PodServiceAccountName) diff --git a/pkg/config/env/env.go b/pkg/config/env/env.go index b9206cdc7..c766453f5 100644 --- a/pkg/config/env/env.go +++ b/pkg/config/env/env.go @@ -40,6 +40,8 @@ const ( MaxConcurrentReconciles MemStatsPeriod LeaderElectionID + SIGUSR2RestartEnabled + DeploymentName PodName PodNamespace PodServiceAccountName @@ -141,6 +143,10 @@ func (n VarName) String() string { return "MEM_STATS_PERIOD" case LeaderElectionID: return "LEADER_ELECTION_ID" + case SIGUSR2RestartEnabled: + return "SIGUSR2_RESTART_ENABLED" + case DeploymentName: + return "DEPLOYMENT_NAME" case PodName: return "POD_NAME" case PodNamespace: diff --git a/pkg/config/env_test.go b/pkg/config/env_test.go index b3015984b..12099fd23 100644 --- a/pkg/config/env_test.go +++ b/pkg/config/env_test.go @@ -108,6 +108,8 @@ var _ = Describe( Expect(os.Setenv("POWERED_ON_VM_HAS_IP_REQUEUE_DELAY", "126h")).To(Succeed()) Expect(os.Setenv("MEM_STATS_PERIOD", "127h")).To(Succeed()) Expect(os.Setenv("SYNC_IMAGE_REQUEUE_DELAY", "128h")).To(Succeed()) + Expect(os.Setenv("DEPLOYMENT_NAME", "129")).To(Succeed()) + Expect(os.Setenv("SIGUSR2_RESTART_ENABLED", "true")).To(Succeed()) }) It("Should return a default config overridden by the environment", func() { Expect(config).To(BeComparableTo(pkgcfg.Config{ @@ -162,6 +164,8 @@ var _ = Describe( PoweredOnVMHasIPRequeueDelay: 126 * time.Hour, MemStatsPeriod: 127 * time.Hour, SyncImageRequeueDelay: 128 * time.Hour, + DeploymentName: "129", + SIGUSR2RestartEnabled: true, })) }) }) diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 12978c443..70580cf61 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -73,4 +73,18 @@ const ( // FastDeployModeLinked is a fast deploy mode. See FastDeployAnnotationKey // for more information. FastDeployModeLinked = "linked" + + // LastRestartTimeAnnotationKey is applied to a Deployment's pod template + // spec when the pod needs to restart itself, ex. the capabilities change. + // The application of this annotation causes the Deployment to do a rollout + // of new pods, ensuring at least one pod is online at all times. + // The value is an RFC3339Nano formatted timestamp. + LastRestartTimeAnnotationKey = "vmoperator.vmware.com/last-restart-time" + + // LastRestartReasonAnnotationKey is applied to a Deployment's pod template + // spec when the pod needs to restart itself, ex. the capabilities change. + // The application of this annotation causes the Deployment to do a rollout + // of new pods, ensuring at least one pod is online at all times. + // The value is the reason for the restart. + LastRestartReasonAnnotationKey = "vmoperator.vmware.com/last-restart-reason" ) diff --git a/pkg/exit/exit.go b/pkg/exit/exit.go new file mode 100644 index 000000000..23eed3fc7 --- /dev/null +++ b/pkg/exit/exit.go @@ -0,0 +1,180 @@ +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 + +package exit + +import ( + "context" + "fmt" + "os" + "os/signal" + "sync" + "syscall" + "time" + + "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + + pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" + pkgconst "github.com/vmware-tanzu/vm-operator/pkg/constants" +) + +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;patch + +// Restart instructs the Deployment responsible for this pod to do a rollout, +// restarting all of the replicas. By using this method, at least one pod is +// always up and responsive. +func Restart( + ctx context.Context, + k8sClient ctrlclient.Client, + reason string) error { + + if ctx == nil { + panic("ctx is nil") + } + if k8sClient == nil { + panic("k8sClient is nil") + } + + logger := logr.FromContextOrDiscard(ctx) + + config := pkgcfg.FromContext(ctx) + + var ( + obj appsv1.Deployment + objKey = ctrlclient.ObjectKey{ + Name: config.DeploymentName, + Namespace: config.PodNamespace, + } + ) + + lastExitTime := time.Now().UTC().Format(time.RFC3339Nano) + logger.Info( + "Restarting pod", + "reason", reason, + "lastExitTime", lastExitTime, + "deploymentKey", objKey.String()) + + if err := k8sClient.Get(ctx, objKey, &obj); err != nil { + return fmt.Errorf( + "failed to get deployment %s while restarting pod: %w", + objKey, err) + } + + patch := ctrlclient.StrategicMergeFrom(obj.DeepCopy()) + + tplSpecAnno := obj.Spec.Template.Annotations + if tplSpecAnno == nil { + tplSpecAnno = map[string]string{} + } + + tplSpecAnno[pkgconst.LastRestartTimeAnnotationKey] = lastExitTime + tplSpecAnno[pkgconst.LastRestartReasonAnnotationKey] = reason + obj.Spec.Template.Annotations = tplSpecAnno + + if err := k8sClient.Patch(ctx, &obj, patch); err != nil { + return fmt.Errorf( + "failed to patch deployment %s while restarting pod: %w", + objKey, err) + } + + return nil +} + +// RestartSignal is the signal that causes the pod to be restarted. +const RestartSignal = syscall.SIGUSR2 + +// RestartSignalHandler waits to receive a signal that tells the pod to restart. +type RestartSignalHandler interface { + Close() + Closed() <-chan struct{} +} + +type restartSignalHandler struct { + c chan os.Signal + d chan struct{} + o sync.Once +} + +// Closed returns a channel that is closed when the signal handler is no longer +// monitoring for a signal. +func (h *restartSignalHandler) Closed() <-chan struct{} { + return h.d +} + +// Close closes the signal handler and stops receiving signals. This function +// is idempotent and may be called multiple times. +func (h *restartSignalHandler) Close() { + h.o.Do(func() { + signal.Stop(h.c) + close(h.c) + close(h.d) + }) +} + +// NewRestartSignalHandler returns a new signal handler that waits to receive +// a SIGUSR2 signal, upon which the pod is restarted via the package's +// Restart(context.Context, ctrlclient.Client, string) error method. +// Calling the handler's Close() method will stop the signal handler. +func NewRestartSignalHandler( + ctx context.Context, + k8sClient ctrlclient.Client, + elected <-chan struct{}) RestartSignalHandler { + + if ctx == nil { + panic("ctx is nil") + } + if k8sClient == nil { + panic("k8sClient is nil") + } + if elected == nil { + panic("elected is nil") + } + + h := &restartSignalHandler{ + c: make(chan os.Signal, 1), + d: make(chan struct{}), + } + + logger := logr.FromContextOrDiscard(ctx) + + // Receive the restart signal. + signal.Notify(h.c, RestartSignal) + + go func() { + for { + // Wait for the signal. + s := <-h.c + + if s == nil { + logger.Info("Signal channel closed sans signal") + h.Close() + return + } + + sigLogger := logger.WithValues("signal", s) + sigLogger.Info("Received signal") + + select { + case <-elected: + // When the leader or when there is no leader election, proceed + // with the restart. + if err := Restart( + ctx, + k8sClient, + fmt.Sprintf("received %s", s)); err != nil { + + sigLogger.Error(err, "failed to restart pod upon signal") + } + default: + // When not the leader, do nothing. The leader is responsible + // restarting the pod(s). + sigLogger.Info("Ignore signal for non-leader") + } + } + }() + + return h +} diff --git a/pkg/exit/exit_suite_test.go b/pkg/exit/exit_suite_test.go new file mode 100644 index 000000000..b3df9034a --- /dev/null +++ b/pkg/exit/exit_suite_test.go @@ -0,0 +1,17 @@ +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 + +package exit_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestExit(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Exit Suite") +} diff --git a/pkg/exit/exit_test.go b/pkg/exit/exit_test.go new file mode 100644 index 000000000..0c2e223f9 --- /dev/null +++ b/pkg/exit/exit_test.go @@ -0,0 +1,317 @@ +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 + +package exit_test + +import ( + "context" + "fmt" + "syscall" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" + pkgconst "github.com/vmware-tanzu/vm-operator/pkg/constants" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" +) + +var _ = DescribeTable("Restart", + func(nilCtx, nilClient, errOnGet, errOnPatch bool) { + + var ( + client ctrlclient.Client + withFuncs interceptor.Funcs + config = pkgcfg.Default() + ctx = pkgcfg.WithConfig(config) + numExits = 10 + obj = appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: config.PodNamespace, + Name: config.DeploymentName, + }, + } + objKey = ctrlclient.ObjectKeyFromObject(&obj) + uniqueLastExitTimes = map[time.Time]struct{}{} + ) + + if errOnGet { + withFuncs.Get = func( + _ context.Context, + _ ctrlclient.WithWatch, + _ ctrlclient.ObjectKey, + _ ctrlclient.Object, + _ ...ctrlclient.GetOption) error { + + return fmt.Errorf("fake") + } + } + if errOnPatch { + withFuncs.Patch = func( + _ context.Context, + _ ctrlclient.WithWatch, + _ ctrlclient.Object, + _ ctrlclient.Patch, + _ ...ctrlclient.PatchOption) error { + + return fmt.Errorf("fake") + } + } + + client = fake.NewClientBuilder(). + WithInterceptorFuncs(withFuncs). + WithObjects(&obj). + Build() + + if nilCtx { + ctx = nil + } + if nilClient { + client = nil + } + + fn := func() { + exitErr := pkgexit.Restart(ctx, client, "") + if errOnGet { + Expect(exitErr).To(HaveOccurred()) + Expect(exitErr.Error()).To(HavePrefix("failed to get deployment")) + } else if errOnPatch { + Expect(exitErr).To(HaveOccurred()) + Expect(exitErr.Error()).To(HavePrefix("failed to patch deployment")) + } else { + Expect(client.Get(ctx, objKey, &obj)).To(Succeed()) + lastExitTimeStr := obj.Spec.Template. + Annotations[pkgconst.LastRestartTimeAnnotationKey] + + Expect(lastExitTimeStr).ToNot(BeEmpty()) + lastExitTime, err := time.Parse(time.RFC3339Nano, lastExitTimeStr) + Expect(err).ToNot(HaveOccurred()) + uniqueLastExitTimes[lastExitTime] = struct{}{} + } + } + + switch { + case nilCtx: + Expect(fn).To(PanicWith("ctx is nil")) + case nilClient: + Expect(fn).To(PanicWith("k8sClient is nil")) + default: + for i := 0; i < numExits; i++ { + Expect(fn).ToNot(Panic()) + } + switch { + case errOnGet, errOnPatch: + Expect(uniqueLastExitTimes).To(BeEmpty()) + default: + Expect(uniqueLastExitTimes).To(HaveLen(numExits)) + } + } + }, + Entry( + "should panic with nil context", + true, // nilCtx + false, // nilClient + false, // errOnGet + false, // errOnPatch + ), + Entry( + "should panic with nil client", + false, // nilCtx + true, // nilClient + false, // errOnGet + false, // errOnPatch + ), + Entry( + "should restart", + false, // nilCtx + false, // nilClient + false, // errOnGet + false, // errOnPatch + ), + Entry( + "get deployment error", + false, // nilCtx + false, // nilClient + true, // errOnGet + false, // errOnPatch + ), + Entry( + "patch deployment error", + false, // nilCtx + false, // nilClient + false, // errOnGet + true, // errOnPatch + ), +) + +var _ = DescribeTable("RestartSignalHandler", + Ordered, + func( + nilCtx, + nilClient, + nilElected, + closeSansSig, + isNotLeader, + errOnRestart bool) { + + var ( + client ctrlclient.Client + withFuncs interceptor.Funcs + elected = make(chan struct{}) + config = pkgcfg.Default() + ctx = pkgcfg.WithConfig(config) + obj = appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: config.PodNamespace, + Name: config.DeploymentName, + }, + } + objKey = ctrlclient.ObjectKeyFromObject(&obj) + sigHandler pkgexit.RestartSignalHandler + ) + + if errOnRestart { + withFuncs.Patch = func( + _ context.Context, + _ ctrlclient.WithWatch, + _ ctrlclient.Object, + _ ctrlclient.Patch, + _ ...ctrlclient.PatchOption) error { + + return fmt.Errorf("fake") + } + } + + client = fake.NewClientBuilder(). + WithInterceptorFuncs(withFuncs). + WithObjects(&obj). + Build() + + if nilCtx { + ctx = nil + } + if nilClient { + client = nil + } + if nilElected { + elected = nil + } + + fn := func() { + sigHandler = pkgexit.NewRestartSignalHandler(ctx, client, elected) + } + + switch { + case nilCtx: + Expect(fn).To(PanicWith("ctx is nil")) + case nilClient: + Expect(fn).To(PanicWith("k8sClient is nil")) + case nilElected: + Expect(fn).To(PanicWith("elected is nil")) + default: + Expect(fn).ToNot(Panic()) + + switch { + case closeSansSig: + sigHandler.Close() + case !isNotLeader: + close(elected) + } + + // Send this process the signal that causes the restart. + Expect(syscall.Kill(syscall.Getpid(), pkgexit.RestartSignal)).To(Succeed()) + + Eventually(func(g Gomega) { + g.Expect(client.Get(ctx, objKey, &obj)).To(Succeed()) + lastExitTimeStr := obj.Spec.Template. + Annotations[pkgconst.LastRestartTimeAnnotationKey] + + switch { + case closeSansSig, isNotLeader, errOnRestart: + g.Expect(lastExitTimeStr).To(BeEmpty()) + default: + g.Expect(lastExitTimeStr).ToNot(BeEmpty()) + } + + }, 3*time.Second).Should(Succeed()) + + // Close the signal handler. + sigHandler.Close() + + Eventually(func(g Gomega) { + g.Expect(sigHandler.Closed()).Should(BeClosed()) + }, 3*time.Second).Should(Succeed()) + } + }, + Entry( + "should panic with nil context", + true, // nilCtx + false, // nilClient + false, // nilElected + false, // closeSansSig + false, // isNotLeader + false, // errOnRestart + ), + Entry( + "should panic with nil client", + false, // nilCtx + true, // nilClient + false, // nilElected + false, // closeSansSig + false, // isNotLeader + false, // errOnRestart + ), + Entry( + "should panic with nil elected", + false, // nilCtx + false, // nilClient + true, // nilElected + false, // closeSansSig + false, // isNotLeader + false, // errOnRestart + ), + Entry( + "should close sans signal", + false, // nilCtx + false, // nilClient + false, // nilElected + true, // closeSansSig + false, // isNotLeader + false, // errOnRestart + ), + Entry( + "should receive signal and ignore", + false, // nilCtx + false, // nilClient + false, // nilElected + false, // closeSansSig + true, // isNotLeader + false, // errOnRestart + ), + Entry( + "should receive signal and fail to restart", + false, // nilCtx + false, // nilClient + false, // nilElected + false, // closeSansSig + false, // isNotLeader + true, // errOnRestart + ), + Entry( + "should receive signal and restart", + false, // nilCtx + false, // nilClient + false, // nilElected + false, // closeSansSig + false, // isNotLeader + false, // errOnRestart + ), +) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index f6b9de945..5ef2cbd82 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -9,6 +9,7 @@ import ( "fmt" "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/cache" @@ -53,6 +54,7 @@ func New(ctx context.Context, opts Options) (Manager, error) { // Ensure the default options are set. opts.defaults() + _ = appsv1.AddToScheme(opts.Scheme) _ = clientgoscheme.AddToScheme(opts.Scheme) _ = ncpv1alpha1.AddToScheme(opts.Scheme) _ = cnsv1alpha1.AddToScheme(opts.Scheme) @@ -82,14 +84,23 @@ func New(ctx context.Context, opts Options) (Manager, error) { }, Client: client.Options{ Cache: &client.CacheOptions{ - // An informer is created for each watched resource. Due to the - // number of ConfigMap and Secret resources that may exist, - // watching each one can result in VM Operator being terminated - // due to an out-of-memory error, i.e. OOMKill. To avoid this - // outcome, ConfigMap and Secret resources are not cached. DisableFor: []client.Object{ + // An informer is created for each watched resource. Due to the + // number of ConfigMap and Secret resources that may exist, + // watching each one can result in VM Operator being terminated + // due to an out-of-memory error, i.e. OOMKill. To avoid this + // outcome, ConfigMap and Secret resources are not cached. &corev1.ConfigMap{}, &corev1.Secret{}, + + // The pkg/exit.Restart function gets a Deployment resource + // in order to patch it to restart the pods in the + // deployment when capabilities have changed. + // Capabilities do not change often enough to warrant + // caching the Deployment resource, and thus there is no + // reason to cache Deployment resources as nothing else in + // VM Operator gets them. + &appsv1.Deployment{}, }, }, },