Skip to content

Commit f068d1c

Browse files
✨ Add support for checking Machine conditions in MachineHealthCheck (#12827)
* Add support for checking Machine conditions in MachineHealthCheck. MachineHealthCheck currently only allows checking Node conditions to validate if a machine is healthy. However, machine conditions capture conditions that do not exist on nodes, for example, control plane node conditions such as EtcdPodHealthy and SchedulerPodHealthy, which can indicate if a control plane machine has been created correctly. Adding support for Machine conditions enables us to perform remediation during control plane upgrades. This PR introduces a new field as part of the MachineHealthCheckChecks: - `UnhealthyMachineConditions` This will mirror the behavior of `UnhealthyNodeConditions` but the MachineHealthCheck controller will instead check the machine conditions. This reimplements and extends earlier work originally proposed in a previous PR 12275. Co-authored-by: Justin Miron <[email protected]> Signed-off-by: Furkat Gofurov <[email protected]> * Fix PR check Markdown links CI Signed-off-by: Furkat Gofurov <[email protected]> * Address review comments Signed-off-by: Furkat Gofurov <[email protected]> * Address review comments: rework node and machine checks in needsRemediation() method If both a node condition and machine condition are unhealthy, pick one reason but combine all the messages Signed-off-by: Furkat Gofurov <[email protected]> * Address Stefan comments (conversion) Signed-off-by: Furkat Gofurov <[email protected]> * Address review comments Fabrizio (mhc target, mhc controller code) Refactors `needsRemediation`, specifically, the following changes were made: - Move machine condition evaluation to always execute first, regardless of node state - Ensure machine conditions are checked in ALL scenarios: * When node is missing (t.nodeMissing) * When node hasn't appeared yet (t.Node == nil) * When node exists (t.Node != nil) - Consistently merge node and machine condition messages in all failure scenarios - Maintain backward compatibility with existing condition message formats - Use 
appropriate condition reasons based on which conditions are unhealthy Signed-off-by: Furkat Gofurov <[email protected]> * Fix event message to reflect both machine and node condition checking Signed-off-by: Furkat Gofurov <[email protected]> * Simplify `needsRemediation` function further by using two sub-functions: one for machineChecks and the other for nodeChecks. Another benefit of this code structure is that condition management is implemented in only one place. Co-authored-by: Fabrizio Pandini Signed-off-by: Furkat Gofurov <[email protected]> * Add CEL validation to prevent disallowed UnhealthyMachineCondition types Signed-off-by: Furkat Gofurov <[email protected]> * Clarify `UnhealthyMachineConditionV1Beta1Reason` precedence over node reasons Signed-off-by: Furkat Gofurov <[email protected]> * Address review comments (Stefan) Signed-off-by: Furkat Gofurov <[email protected]> --------- Signed-off-by: Furkat Gofurov <[email protected]> Co-authored-by: Justin Miron <[email protected]>
1 parent 06aae54 commit f068d1c

33 files changed

+1755
-116
lines changed

api/core/v1beta1/conversion.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error {
7373
return err
7474
}
7575

76+
dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions
77+
for i, md := range restored.Spec.Topology.Workers.MachineDeployments {
78+
dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions
79+
}
80+
7681
// Recover intent for bool values converted to *bool.
7782
clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused)
7883

@@ -145,6 +150,11 @@ func (src *ClusterClass) ConvertTo(dstRaw conversion.Hub) error {
145150
return err
146151
}
147152

153+
dst.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions
154+
for i, md := range restored.Spec.Workers.MachineDeployments {
155+
dst.Spec.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions
156+
}
157+
148158
// Recover intent for bool values converted to *bool.
149159
for i, patch := range dst.Spec.Patches {
150160
for j, definition := range patch.Definitions {
@@ -513,6 +523,8 @@ func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error {
513523
return err
514524
}
515525

526+
dst.Spec.Checks.UnhealthyMachineConditions = restored.Spec.Checks.UnhealthyMachineConditions
527+
516528
clusterv1.Convert_int32_To_Pointer_int32(src.Status.ExpectedMachines, ok, restored.Status.ExpectedMachines, &dst.Status.ExpectedMachines)
517529
clusterv1.Convert_int32_To_Pointer_int32(src.Status.CurrentHealthy, ok, restored.Status.CurrentHealthy, &dst.Status.CurrentHealthy)
518530
clusterv1.Convert_int32_To_Pointer_int32(src.Status.RemediationsAllowed, ok, restored.Status.RemediationsAllowed, &dst.Status.RemediationsAllowed)

api/core/v1beta2/cluster_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,16 @@ type ControlPlaneTopologyHealthCheckChecks struct {
725725
// +kubebuilder:validation:MinItems=1
726726
// +kubebuilder:validation:MaxItems=100
727727
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
728+
729+
// unhealthyMachineConditions contains a list of the machine conditions that determine
730+
// whether a machine is considered unhealthy. The conditions are combined in a
731+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
732+
//
733+
// +optional
734+
// +listType=atomic
735+
// +kubebuilder:validation:MinItems=1
736+
// +kubebuilder:validation:MaxItems=100
737+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
728738
}
729739

730740
// ControlPlaneTopologyHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy.
@@ -975,6 +985,16 @@ type MachineDeploymentTopologyHealthCheckChecks struct {
975985
// +kubebuilder:validation:MinItems=1
976986
// +kubebuilder:validation:MaxItems=100
977987
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
988+
989+
// unhealthyMachineConditions contains a list of the machine conditions that determine
990+
// whether a machine is considered unhealthy. The conditions are combined in a
991+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
992+
//
993+
// +optional
994+
// +listType=atomic
995+
// +kubebuilder:validation:MinItems=1
996+
// +kubebuilder:validation:MaxItems=100
997+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
978998
}
979999

9801000
// MachineDeploymentTopologyHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy.

api/core/v1beta2/clusterclass_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,16 @@ type ControlPlaneClassHealthCheckChecks struct {
281281
// +kubebuilder:validation:MinItems=1
282282
// +kubebuilder:validation:MaxItems=100
283283
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
284+
285+
// unhealthyMachineConditions contains a list of the machine conditions that determine
286+
// whether a machine is considered unhealthy. The conditions are combined in a
287+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
288+
//
289+
// +optional
290+
// +listType=atomic
291+
// +kubebuilder:validation:MinItems=1
292+
// +kubebuilder:validation:MaxItems=100
293+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
284294
}
285295

286296
// ControlPlaneClassHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy.
@@ -542,6 +552,16 @@ type MachineDeploymentClassHealthCheckChecks struct {
542552
// +kubebuilder:validation:MinItems=1
543553
// +kubebuilder:validation:MaxItems=100
544554
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
555+
556+
// unhealthyMachineConditions contains a list of the machine conditions that determine
557+
// whether a machine is considered unhealthy. The conditions are combined in a
558+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
559+
//
560+
// +optional
561+
// +listType=atomic
562+
// +kubebuilder:validation:MinItems=1
563+
// +kubebuilder:validation:MaxItems=100
564+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
545565
}
546566

547567
// MachineDeploymentClassHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy.

api/core/v1beta2/machine_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,10 @@ const (
287287
// defined by a MachineHealthCheck object.
288288
MachineHealthCheckUnhealthyNodeReason = "UnhealthyNode"
289289

290+
// MachineHealthCheckUnhealthyMachineReason surfaces when the machine does not pass the health checks
291+
// defined by a MachineHealthCheck object.
292+
MachineHealthCheckUnhealthyMachineReason = "UnhealthyMachine"
293+
290294
// MachineHealthCheckNodeStartupTimeoutReason surfaces when the node hosted on the machine does not appear within
291295
// the timeout defined by a MachineHealthCheck object.
292296
MachineHealthCheckNodeStartupTimeoutReason = "NodeStartupTimeout"

api/core/v1beta2/machinehealthcheck_types.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,16 @@ type MachineHealthCheckChecks struct {
111111
// +kubebuilder:validation:MinItems=1
112112
// +kubebuilder:validation:MaxItems=100
113113
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
114+
115+
// unhealthyMachineConditions contains a list of the machine conditions that determine
116+
// whether a machine is considered unhealthy. The conditions are combined in a
117+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
118+
//
119+
// +optional
120+
// +listType=atomic
121+
// +kubebuilder:validation:MinItems=1
122+
// +kubebuilder:validation:MaxItems=100
123+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
114124
}
115125

116126
// MachineHealthCheckRemediation configures if and how remediations are triggered if a Machine is unhealthy.
@@ -227,7 +237,33 @@ type UnhealthyNodeCondition struct {
227237

228238
// timeoutSeconds is the duration that a node must be in a given status for,
229239
// after which the node is considered unhealthy.
230-
// For example, with a value of "1h", the node must match the status
240+
// For example, with a value of "3600", the node must match the status
241+
// for at least 1 hour before being considered unhealthy.
242+
// +required
243+
// +kubebuilder:validation:Minimum=0
244+
TimeoutSeconds *int32 `json:"timeoutSeconds,omitempty"`
245+
}
246+
247+
// UnhealthyMachineCondition represents a Machine condition type and value with a timeout
248+
// specified as a duration. When the named condition has been in the given
249+
// status for at least the timeout value, a machine is considered unhealthy.
250+
type UnhealthyMachineCondition struct {
251+
// type of Machine condition
252+
// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$`
253+
// +kubebuilder:validation:MinLength=1
254+
// +kubebuilder:validation:MaxLength=316
255+
// +kubebuilder:validation:XValidation:rule="!(self in ['Ready','Available','HealthCheckSucceeded','OwnerRemediated','ExternallyRemediated'])",message="type must not be one of: Ready, Available, HealthCheckSucceeded, OwnerRemediated, ExternallyRemediated"
256+
// +required
257+
Type string `json:"type,omitempty"`
258+
259+
// status of the condition, one of True, False, Unknown.
260+
// +required
261+
// +kubebuilder:validation:Enum=True;False;Unknown
262+
Status metav1.ConditionStatus `json:"status,omitempty"`
263+
264+
// timeoutSeconds is the duration that a machine must be in a given status for,
265+
// after which the machine is considered unhealthy.
266+
// For example, with a value of "3600", the machine must match the status
231267
// for at least 1 hour before being considered unhealthy.
232268
// +required
233269
// +kubebuilder:validation:Minimum=0

api/core/v1beta2/v1beta1_condition_consts.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,11 @@ const (
157157

158158
// UnhealthyNodeConditionV1Beta1Reason is the reason used when a machine's node has one of the MachineHealthCheck's unhealthy conditions.
159159
UnhealthyNodeConditionV1Beta1Reason = "UnhealthyNode"
160+
161+
// UnhealthyMachineConditionV1Beta1Reason is the reason used when a machine has one of the MachineHealthCheck's unhealthy conditions.
162+
// When both machine and node issues are detected, this reason takes precedence over node-related reasons
163+
// (NodeNotFoundV1Beta1Reason, NodeStartupTimeoutV1Beta1Reason, UnhealthyNodeConditionV1Beta1Reason).
164+
UnhealthyMachineConditionV1Beta1Reason = "UnhealthyMachine"
160165
)
161166

162167
const (

api/core/v1beta2/zz_generated.deepcopy.go

Lines changed: 55 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)