Skip to content

Commit 227fe2f

Browse files
committed
Improve MHC reporting: add reason to condition, add reason+message to log
Signed-off-by: Stefan Büringer [email protected]
1 parent 3ecb43b commit 227fe2f

File tree

2 files changed

+21
-15
lines changed

2 files changed

+21
-15
lines changed

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -183,8 +183,10 @@ func (t *healthCheckTarget) machineChecks(logger logr.Logger) ([]string, time.Du
183183
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
184184

185185
if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
186-
unhealthyMachineMessages = append(unhealthyMachineMessages, fmt.Sprintf("Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
187-
logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
186+
unhealthyMachineMessages = append(unhealthyMachineMessages, fmt.Sprintf("Condition %s on Machine is reporting status %s with reason %s for more than %s",
187+
c.Type, c.Status, machineCondition.Reason, timeoutSecondsDuration.String()))
188+
logger.V(3).Info(fmt.Sprintf("Target is unhealthy: Machine condition is in unhealthy state more than %s", timeoutSecondsDuration.String()),
189+
"condition", c.Type, "state", c.Status, "reason", machineCondition.Reason, "message", machineCondition.Message)
188190
continue
189191
}
190192

@@ -272,8 +274,10 @@ func (t *healthCheckTarget) nodeChecks(logger logr.Logger, timeoutForMachineToHa
272274
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
273275

274276
if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
275-
unhealthyNodeMessages = append(unhealthyNodeMessages, fmt.Sprintf("Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
276-
logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
277+
unhealthyNodeMessages = append(unhealthyNodeMessages, fmt.Sprintf("Condition %s on Node is reporting status %s with reason %s for more than %s",
278+
c.Type, c.Status, nodeCondition.Reason, timeoutSecondsDuration.String()))
279+
logger.V(3).Info(fmt.Sprintf("Target is unhealthy: Node condition is in unhealthy state more than %s", timeoutSecondsDuration.String()),
280+
"condition", c.Type, "state", c.Status, "reason", nodeCondition.Reason, "message", nodeCondition.Message)
277281
continue
278282
}
279283

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ func TestHealthCheckTargets(t *testing.T) {
362362
}
363363

364364
// Target for when the node has been in an unknown state for shorter than the timeout
365-
testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
365+
testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, "NodeStatusUnknown", 200*time.Second)
366366
nodeUnknown200 := healthCheckTarget{
367367
Cluster: cluster,
368368
MHC: testMHC,
@@ -372,7 +372,7 @@ func TestHealthCheckTargets(t *testing.T) {
372372
}
373373

374374
// Second Target for when the node has been in an unknown state for shorter than the timeout
375-
testNodeUnknown100 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 100*time.Second)
375+
testNodeUnknown100 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, "NodeStatusUnknown", 100*time.Second)
376376
nodeUnknown100 := healthCheckTarget{
377377
Cluster: cluster,
378378
MHC: testMHC,
@@ -382,16 +382,16 @@ func TestHealthCheckTargets(t *testing.T) {
382382
}
383383

384384
// Target for when the node has been in an unknown state for longer than the timeout
385-
testNodeUnknown400 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 400*time.Second)
385+
testNodeUnknown400 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, "NodeStatusUnknown", 400*time.Second)
386386
nodeUnknown400 := healthCheckTarget{
387387
Cluster: cluster,
388388
MHC: testMHC,
389389
Machine: testMachine.DeepCopy(),
390390
Node: testNodeUnknown400,
391391
nodeMissing: false,
392392
}
393-
nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
394-
nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed:\n * Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
393+
nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown with reason NodeStatusUnknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
394+
nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed:\n * Condition Ready on Node is reporting status Unknown with reason NodeStatusUnknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
395395

396396
// Target for when a node is healthy
397397
testNodeHealthy := newTestNode("node1")
@@ -405,7 +405,7 @@ func TestHealthCheckTargets(t *testing.T) {
405405
}
406406

407407
// Machine unhealthy for shorter than timeout
408-
testMachineUnhealthy200 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, 200*time.Second)
408+
testMachineUnhealthy200 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, controlplanev1.KubeadmControlPlaneMachinePodFailedReason, 200*time.Second)
409409
machineUnhealthy200 := healthCheckTarget{
410410
Cluster: cluster,
411411
MHC: testMHC,
@@ -415,7 +415,7 @@ func TestHealthCheckTargets(t *testing.T) {
415415
}
416416

417417
// Machine unhealthy for longer than timeout
418-
testMachineUnhealthy400 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, 400*time.Second)
418+
testMachineUnhealthy400 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, controlplanev1.KubeadmControlPlaneMachinePodFailedReason, 400*time.Second)
419419
machineUnhealthy400 := healthCheckTarget{
420420
Cluster: cluster,
421421
MHC: testMHC,
@@ -425,12 +425,12 @@ func TestHealthCheckTargets(t *testing.T) {
425425
}
426426
machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition(
427427
clusterv1.UnhealthyMachineConditionV1Beta1Reason,
428-
"Condition EtcdPodHealthy on Machine is reporting status False for more than %s",
428+
"Condition EtcdPodHealthy on Machine is reporting status False with reason Failed for more than %s",
429429
(time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(),
430430
)
431431
machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition(
432432
clusterv1.MachineHealthCheckUnhealthyMachineReason,
433-
"Health check failed:\n * Condition EtcdPodHealthy on Machine is reporting status False for more than %s",
433+
"Health check failed:\n * Condition EtcdPodHealthy on Machine is reporting status False with reason Failed for more than %s",
434434
(time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(),
435435
)
436436

@@ -681,7 +681,7 @@ func newTestNode(name string) *corev1.Node {
681681
}
682682
}
683683

684-
func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status corev1.ConditionStatus, unhealthyDuration time.Duration) *corev1.Node {
684+
func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status corev1.ConditionStatus, reason string, unhealthyDuration time.Duration) *corev1.Node {
685685
return &corev1.Node{
686686
ObjectMeta: metav1.ObjectMeta{
687687
Name: name,
@@ -692,14 +692,15 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu
692692
{
693693
Type: condition,
694694
Status: status,
695+
Reason: reason,
695696
LastTransitionTime: metav1.NewTime(time.Now().Add(-unhealthyDuration)),
696697
},
697698
},
698699
},
699700
}
700701
}
701702

702-
func newTestUnhealthyMachine(name, namespace, clusterName, nodeName string, labels map[string]string, condition string, status metav1.ConditionStatus, unhealthyDuration time.Duration) *clusterv1.Machine {
703+
func newTestUnhealthyMachine(name, namespace, clusterName, nodeName string, labels map[string]string, condition string, status metav1.ConditionStatus, reason string, unhealthyDuration time.Duration) *clusterv1.Machine {
703704
// Copy the labels so that the map is unique to each test Machine
704705
l := make(map[string]string)
705706
for k, v := range labels {
@@ -725,6 +726,7 @@ func newTestUnhealthyMachine(name, namespace, clusterName, nodeName string, labe
725726
{
726727
Type: condition,
727728
Status: status,
729+
Reason: reason,
728730
LastTransitionTime: metav1.NewTime(time.Now().Add(-unhealthyDuration)),
729731
},
730732
},

0 commit comments

Comments
 (0)