Skip to content

Commit

Permalink
feat: ignore rejected and cordoned machines in pxe boot handler
Browse files Browse the repository at this point in the history
Return 404 for such machines, so that they make their own boot decision or fall back.

Also add an option to disable DHCP proxy.

Do some minor logging improvements.

Signed-off-by: Utku Ozdemir <[email protected]>
  • Loading branch information
utkuozdemir committed Jan 28, 2025
1 parent 24f4e21 commit 36d5fbd
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 14 deletions.
2 changes: 2 additions & 0 deletions cmd/provider/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ func init() {
"Enable controller runtime resource cache.")
rootCmd.Flags().BoolVar(&providerOptions.WipeWithZeroes, "wipe-with-zeroes", provider.DefaultOptions.WipeWithZeroes,
"When wiping a machine, write zeroes to the whole disk instead doing a fast wipe.")
rootCmd.Flags().BoolVar(&providerOptions.DisableDHCPProxy, "disable-dhcp-proxy", provider.DefaultOptions.DisableDHCPProxy,
"Disable the DHCP proxy server.")

// RedFish options
rootCmd.Flags().BoolVar(&providerOptions.RedfishOptions.UseAlways, "redfish-use-always", provider.DefaultOptions.RedfishOptions.UseAlways,
Expand Down
4 changes: 1 addition & 3 deletions internal/provider/controllers/power_operation.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ type powerOperationControllerHelper struct {
minRebootInterval time.Duration
}

//nolint:gocyclo,cyclop
//nolint:cyclop
func (helper *powerOperationControllerHelper) transform(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger,
infraMachine *infra.Machine, powerOperation *resources.PowerOperation,
) error {
Expand Down Expand Up @@ -149,8 +149,6 @@ func (helper *powerOperationControllerHelper) transform(ctx context.Context, r c
}

powerOperation.TypedSpec().Value.LastPowerOperation = specs.PowerState_POWER_STATE_OFF
case infraMachine.TypedSpec().Value.Cordoned:
logger.Debug("machine is cordoned, skip power management")
default:
logger.Debug("machine power state is already as desired")
}
Expand Down
31 changes: 24 additions & 7 deletions internal/provider/ipxe/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
omnispecs "github.com/siderolabs/omni/client/api/omni/specs"
"github.com/siderolabs/omni/client/pkg/omni/resources/infra"
"github.com/siderolabs/talos-metal-agent/pkg/config"
"go.uber.org/zap"
Expand Down Expand Up @@ -97,11 +98,11 @@ func (handler *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {

decision, err := handler.makeBootDecision(ctx, arch, uuid, logger)
if err != nil {
handler.logger.Error("failed to check if Talos is installed", zap.Error(err))
handler.logger.Error("failed to make boot decision", zap.Error(err))

w.WriteHeader(http.StatusInternalServerError)

if _, err = w.Write([]byte("failed to check if Talos is installed")); err != nil {
if _, err = w.Write([]byte("failed to make boot decision")); err != nil {
handler.logger.Error("failed to write error response", zap.Error(err))
}

Expand All @@ -111,7 +112,7 @@ func (handler *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
w.WriteHeader(decision.statusCode)

if _, err = w.Write([]byte(decision.body)); err != nil {
handler.logger.Error("failed to write ok response", zap.Error(err))
handler.logger.Error("failed to write response", zap.Error(err))

return
}
Expand All @@ -129,7 +130,7 @@ type bootDecision struct {
mode specs.BootMode
}

//nolint:cyclop
//nolint:gocyclo,cyclop
func (handler *Handler) makeBootDecision(ctx context.Context, arch, uuid string, logger *zap.Logger) (bootDecision, error) {
switch arch { // https://ipxe.org/cfg/buildarch
case archArm64:
Expand All @@ -153,6 +154,22 @@ func (handler *Handler) makeBootDecision(ctx context.Context, arch, uuid string,
return bootDecision{}, err
}

if infraMachine != nil {
logger = logger.With(zap.String("machine_id", infraMachine.Metadata().ID()))

if infraMachine.TypedSpec().Value.Cordoned {
logger.Info("machine is cordoned, skip making a boot decision")

return bootDecision{body: "machine is cordoned", statusCode: http.StatusNotFound}, nil
}

if infraMachine.TypedSpec().Value.AcceptanceStatus == omnispecs.InfraMachineConfigSpec_REJECTED {
logger.Info("machine is rejected, return not found")

return bootDecision{statusCode: http.StatusNotFound}, nil
}
}

requiredBootMode := machine.RequiredBootMode(infraMachine, bmcConfiguration, wipeStatus, logger)

var userExtraKernelArgs []string
Expand All @@ -169,7 +186,7 @@ func (handler *Handler) makeBootDecision(ctx context.Context, arch, uuid string,

switch requiredBootMode {
case specs.BootMode_BOOT_MODE_AGENT_PXE:
logger.Info("boot into agent mode")
logger.Info("boot machine: Talos agent mode")

body, statusCode, agentErr := handler.bootIntoAgentMode(ctx, arch, userExtraKernelArgs)
if agentErr != nil {
Expand All @@ -182,7 +199,7 @@ func (handler *Handler) makeBootDecision(ctx context.Context, arch, uuid string,
statusCode: statusCode,
}, nil
case specs.BootMode_BOOT_MODE_TALOS_PXE:
logger.Info("boot Talos over iPXE")
logger.Info("boot machine: Talos over iPXE")

consoleKernelArgs := handler.consoleKernelArgs(arch)
extraKernelArgs := slices.Concat(handler.defaultKernelArgs, consoleKernelArgs, userExtraKernelArgs)
Expand All @@ -204,7 +221,7 @@ func (handler *Handler) makeBootDecision(ctx context.Context, arch, uuid string,
statusCode: http.StatusOK,
}, nil
case specs.BootMode_BOOT_MODE_TALOS_DISK:
logger.Info("boot from disk")
logger.Info("boot machine: from the disk")

switch handler.bootFromDiskMethod {
case Boot404:
Expand Down
1 change: 1 addition & 0 deletions internal/provider/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type Options struct {
UseLocalBootAssets bool
ClearState bool
WipeWithZeroes bool
DisableDHCPProxy bool

RedfishOptions redfish.Options

Expand Down
14 changes: 10 additions & 4 deletions internal/provider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ func (p *Provider) Run(ctx context.Context) error {
bmcClientFactory := bmc.NewClientFactory(bmc.ClientFactoryOptions{
RedfishOptions: p.options.RedfishOptions,
}, p.logger)
dhcpProxy := dhcp.NewProxy(apiAdvertiseAddress, p.options.APIPort, dhcpProxyIfaceOrIP, p.logger.With(zap.String("component", "dhcp_proxy")))
tftpServer := tftp.NewServer(p.logger.With(zap.String("component", "tftp_server")))
bmcAPIAddressReader := bmcapi.NewAddressReader(p.options.APIPowerMgmtStateDir)
agentClient := agent.NewClient(agentConnectionEventCh, p.options.WipeWithZeroes, p.logger.With(zap.String("component", "agent_client"))) //nolint:contextcheck // false positive
Expand Down Expand Up @@ -181,12 +180,19 @@ func (p *Provider) Run(ctx context.Context) error {
}
}

return p.runComponents(ctx, []component{
components := []component{
{cosiRuntime.Run, "COSI runtime"},
{srvr.Run, "server"},
{dhcpProxy.Run, "DHCP proxy"},
{tftpServer.Run, "TFTP server"},
})
}

if !p.options.DisableDHCPProxy {
dhcpProxy := dhcp.NewProxy(apiAdvertiseAddress, p.options.APIPort, dhcpProxyIfaceOrIP, p.logger.With(zap.String("component", "dhcp_proxy")))

components = append(components, component{dhcpProxy.Run, "DHCP proxy"})
}

return p.runComponents(ctx, components)
}

func (p *Provider) buildCOSIRuntime(omniAPIClient *client.Client) (*runtime.Runtime, error) {
Expand Down

0 comments on commit 36d5fbd

Please sign in to comment.