Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

neonvm-controller: make IPAM code production-ready #1278

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
46 changes: 31 additions & 15 deletions neonvm-controller/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ import (

vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/neonvm/controllers"
"github.com/neondatabase/autoscaling/pkg/neonvm/ipam"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)

var (
Expand Down Expand Up @@ -152,7 +154,8 @@ func main() {
logConfig.Sampling = nil // Disabling sampling; it's enabled by default for zap's production configs.
logConfig.Level.SetLevel(zap.InfoLevel)
logConfig.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
logger := zapr.NewLogger(zap.Must(logConfig.Build(zap.AddStacktrace(zapcore.PanicLevel))))
zapLogger := zap.Must(logConfig.Build(zap.AddStacktrace(zapcore.PanicLevel)))
logger := zapr.NewLogger(zapLogger)

ctrl.SetLogger(logger)
// define klog settings (used in LeaderElector)
Expand Down Expand Up @@ -182,7 +185,7 @@ func main() {
})
if err != nil {
setupLog.Error(err, "unable to start manager")
os.Exit(1)
panic(err)
}

reconcilerMetrics := controllers.MakeReconcilerMetrics()
Expand All @@ -197,27 +200,36 @@ func main() {
FailingRefreshInterval: failingRefreshInterval,
AtMostOnePod: atMostOnePod,
DefaultCPUScalingMode: defaultCpuScalingMode,
NADConfig: controllers.GetNADConfig(),
}

ipam, err := ipam.New(rc.NADConfig.IPAMName, rc.NADConfig.IPAMNamespace)
if err != nil {
setupLog.Error(err, "unable to create ipam")
panic(err)
}
defer ipam.Close()

vmReconciler := &controllers.VMReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("virtualmachine-controller"),
Config: rc,
Metrics: reconcilerMetrics,
IPAM: ipam,
}
vmReconcilerMetrics, err := vmReconciler.SetupWithManager(mgr)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "VirtualMachine")
os.Exit(1)
panic(err)
}
vmWebhook := &controllers.VMWebhook{
Recorder: mgr.GetEventRecorderFor("virtualmachine-webhook"),
Config: rc,
}
if err := vmWebhook.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachine")
os.Exit(1)
panic(err)
}

migrationReconciler := &controllers.VirtualMachineMigrationReconciler{
Expand All @@ -230,43 +242,47 @@ func main() {
migrationReconcilerMetrics, err := migrationReconciler.SetupWithManager(mgr)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "VirtualMachineMigration")
os.Exit(1)
panic(err)
}
migrationWebhook := &controllers.VMMigrationWebhook{
Recorder: mgr.GetEventRecorderFor("virtualmachinemigration-webhook"),
Config: rc,
}
if err := migrationWebhook.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachine")
os.Exit(1)
panic(err)
}
//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
panic(err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
panic(err)
}

dbgSrv := debugServerFunc(vmReconcilerMetrics, migrationReconcilerMetrics)
if err := mgr.Add(dbgSrv); err != nil {
setupLog.Error(err, "unable to set up debug server")
os.Exit(1)
panic(err)
}

if err := mgr.Add(vmReconcilerMetrics.FailingRefresher()); err != nil {
setupLog.Error(err, "unable to set up failing refresher")
os.Exit(1)
panic(err)
}
tg := taskgroup.NewGroup(zapLogger)
tg.Go("run manager", func(logger *zap.Logger) error {
return run(mgr)
})
tg.Go("run ipam cleanup", func(logger *zap.Logger) error {
return ipam.RunCleanup(context.Background(), "default")
})

// NOTE: THE CONTROLLER MUST IMMEDIATELY EXIT AFTER RUNNING THE MANAGER.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question (maybe blocking?): This is no longer upheld, right? IIRC it's required due to releasing the leader election lease immediately when the manager exits

if err := run(mgr); err != nil {
setupLog.Error(err, "run manager error")
os.Exit(1)
}
err = tg.Wait()
panic(err)
}

func debugServerFunc(reconcilers ...controllers.ReconcilerWithMetrics) manager.RunnableFunc {
Expand Down
10 changes: 7 additions & 3 deletions neonvm/config/network/overlay-ipam.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ metadata:
spec:
config: '{
"ipam": {
"range": "10.100.0.0/16",
"range_start": "10.100.128.0",
"network_name": "neonvm"
"network_name": "neonvm",
"ipRanges": [
{
"range": "10.100.0.0/16",
"range_start": "10.100.128.0"
}
]
}
}'
3 changes: 3 additions & 0 deletions pkg/neonvm/controllers/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,7 @@ type ReconcilerConfig struct {
AtMostOnePod bool
// DefaultCPUScalingMode is the default CPU scaling mode that will be used for VMs with empty spec.cpuScalingMode
DefaultCPUScalingMode vmv1.CpuScalingMode

// NADConfig is the configuration for the Network Attachment Definitions
NADConfig *NADConfig
}
2 changes: 2 additions & 0 deletions pkg/neonvm/controllers/functests/vm_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ var _ = Describe("VirtualMachine controller", func() {
FailingRefreshInterval: 1 * time.Minute,
AtMostOnePod: false,
DefaultCPUScalingMode: vmv1.CpuScalingModeQMP,
NADConfig: nil,
},
IPAM: nil,
}

_, err = virtualmachineReconciler.Reconcile(ctx, reconcile.Request{
Expand Down
97 changes: 25 additions & 72 deletions pkg/neonvm/controllers/vm_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ type VMReconciler struct {
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *ReconcilerConfig
IPAM *ipam.IPAM

Metrics ReconcilerMetrics `exhaustruct:"optional"`
}
Expand Down Expand Up @@ -238,27 +239,7 @@ func (r *VMReconciler) doFinalizerOperationsForVirtualMachine(ctx context.Contex

// Release overlay IP address
if vm.Spec.ExtraNetwork != nil {
// Create IPAM object
nadName, err := nadIpamName()
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
nadNamespace, err := nadIpamNamespace()
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
ipam, err := ipam.New(ctx, nadName, nadNamespace)
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
defer ipam.Close()
ip, err := ipam.ReleaseIP(ctx, vm.Name, vm.Namespace)
ip, err := r.IPAM.ReleaseIP(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace})
if err != nil {
// ignore error
log.Error(err, "fail to release IP, error ignored")
Expand Down Expand Up @@ -347,23 +328,7 @@ func (r *VMReconciler) acquireOverlayIP(ctx context.Context, vm *vmv1.VirtualMac
}

log := log.FromContext(ctx)

// Create IPAM object
nadName, err := nadIpamName()
if err != nil {
return err
}
nadNamespace, err := nadIpamNamespace()
if err != nil {
return err
}
ipam, err := ipam.New(ctx, nadName, nadNamespace)
if err != nil {
log.Error(err, "failed to create IPAM")
return err
}
defer ipam.Close()
ip, err := ipam.AcquireIP(ctx, vm.Name, vm.Namespace)
ip, err := r.IPAM.AcquireIP(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace})
if err != nil {
log.Error(err, "fail to acquire IP")
return err
Expand Down Expand Up @@ -416,7 +381,9 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine)

case "":
if err := r.acquireOverlayIP(ctx, vm); err != nil {
return err
log.Error(err, "Failed to acquire overlay IP", "VirtualMachine", vm.Name)
r.Recorder.Event(vm, "Warning", "OverlayNet", "Failed to acquire overlay IP")
// Don't return error, we will keep running without an overlay IP
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question (blocking): IIUC this means that if we have some transient issue where we fail to initially acquire the overlay IP, we'll never get one? Is that accurate, and is that the behavior we want?

}
// VirtualMachine just created, change Phase to "Pending"
vm.Status.Phase = vmv1.VmPending
Expand Down Expand Up @@ -1710,15 +1677,7 @@ func podSpec(
if len(vm.Spec.ExtraNetwork.MultusNetwork) > 0 { // network specified in spec
nadNetwork = vm.Spec.ExtraNetwork.MultusNetwork
} else { // get network from env variables
nadName, err := nadRunnerName()
if err != nil {
return nil, err
}
nadNamespace, err := nadRunnerNamespace()
if err != nil {
return nil, err
}
nadNetwork = fmt.Sprintf("%s/%s", nadNamespace, nadName)
nadNetwork = fmt.Sprintf("%s/%s", config.NADConfig.RunnerNamespace, config.NADConfig.RunnerName)
}
pod.ObjectMeta.Annotations[nadapiv1.NetworkAttachmentAnnot] = fmt.Sprintf("%s@%s", nadNetwork, vm.Spec.ExtraNetwork.Interface)
}
Expand Down Expand Up @@ -1768,33 +1727,27 @@ func (r *VMReconciler) tryUpdateVM(ctx context.Context, vm *vmv1.VirtualMachine)
return r.Update(ctx, vm)
}

// return Network Attachment Definition name with IPAM settings
func nadIpamName() (string, error) {
return getEnvVarValue("NAD_IPAM_NAME")
type NADConfig struct {
IPAMName string
IPAMNamespace string
RunnerName string
RunnerNamespace string
}

// return Network Attachment Definition namespace with IPAM settings
func nadIpamNamespace() (string, error) {
return getEnvVarValue("NAD_IPAM_NAMESPACE")
}

// return Network Attachment Definition name for second interface in Runner
func nadRunnerName() (string, error) {
return getEnvVarValue("NAD_RUNNER_NAME")
}

// return Network Attachment Definition namespace for second interface in Runner
func nadRunnerNamespace() (string, error) {
return getEnvVarValue("NAD_RUNNER_NAMESPACE")
}

// return env variable value
func getEnvVarValue(envVarName string) (string, error) {
value, found := os.LookupEnv(envVarName)
if !found {
return "", fmt.Errorf("unable to find %s environment variable", envVarName)
func GetNADConfig() *NADConfig {
getVar := func(envVarName string) string {
value, ok := os.LookupEnv(envVarName)
if !ok {
panic(fmt.Errorf("unable to find %s environment variable", envVarName))
}
return value
}
return &NADConfig{
IPAMName: getVar("NAD_IPAM_NAME"),
IPAMNamespace: getVar("NAD_IPAM_NAMESPACE"),
RunnerName: getVar("NAD_RUNNER_NAME"),
RunnerNamespace: getVar("NAD_RUNNER_NAMESPACE"),
}
return value, nil
}

// sshKeygen generates a pair of public and private keys using the ed25519
Expand Down
2 changes: 2 additions & 0 deletions pkg/neonvm/controllers/vm_controller_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,10 @@ func newTestParams(t *testing.T) *testParams {
FailingRefreshInterval: time.Minute,
AtMostOnePod: false,
DefaultCPUScalingMode: vmv1.CpuScalingModeQMP,
NADConfig: nil,
},
Metrics: testReconcilerMetrics,
IPAM: nil,
}

return params
Expand Down
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/vmmigration_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func newMigrationTestParams(t *testing.T) *migrationTestParams {
FailingRefreshInterval: time.Minute,
AtMostOnePod: false,
DefaultCPUScalingMode: vmv1.CpuScalingModeQMP,
NADConfig: nil,
},
Metrics: testReconcilerMetrics,
}
Expand Down
Loading
Loading