Skip to content

Commit 02e7750

Browse files
authored
Merge pull request #12986 from k8s-infra-cherrypick-robot/cherry-pick-12980-to-release-1.11
[release-1.11] 🌱 Fix race condition on KCP initialized condition
2 parents aee3312 + c3bd79c commit 02e7750

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

controlplane/kubeadm/internal/controllers/status.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,21 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
155155
// Note: This only gets initialized once and does not change if the kubeadm config map goes away.
156156
func setControlPlaneInitialized(ctx context.Context, controlPlane *internal.ControlPlane) error {
157157
if !ptr.Deref(controlPlane.KCP.Status.Initialization.ControlPlaneInitialized, false) {
158+
// If the control plane has only one machine, and this machine is marked for remediation or is being deleted,
159+
// do not check for control plane initialized.
160+
// This prevents an issue that happens if kubeadm init completes in the short timeframe between when machine deletion is triggered
161+
// and when the machine goes away; this issue, if not properly handled, will lead to an inconsistent state where
162+
// the cluster is initialized, no CP machine exists, and the replacement CP machine fails when trying to join.
163+
if len(controlPlane.Machines) == 1 {
164+
m := controlPlane.Machines.UnsortedList()[0]
165+
if collections.IsUnhealthyAndOwnerRemediated(m) {
166+
return nil
167+
}
168+
if !m.DeletionTimestamp.IsZero() {
169+
return nil
170+
}
171+
}
172+
158173
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
159174
if err != nil {
160175
return errors.Wrap(err, "failed to create remote cluster client")

controlplane/kubeadm/internal/controllers/status_test.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ func TestKubeadmControlPlaneReconciler_setControlPlaneInitialized(t *testing.T)
7575
controlPlane := &internal.ControlPlane{
7676
Cluster: &clusterv1.Cluster{},
7777
KCP: &controlplanev1.KubeadmControlPlane{},
78+
Machines: collections.FromMachines(
79+
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}},
80+
),
7881
}
7982
controlPlane.InjectTestManagementCluster(&fakeManagementCluster{
8083
Workload: &fakeWorkloadCluster{
@@ -98,6 +101,85 @@ func TestKubeadmControlPlaneReconciler_setControlPlaneInitialized(t *testing.T)
98101
Reason: controlplanev1.KubeadmControlPlaneInitializedReason,
99102
}, conditions.IgnoreLastTransitionTime(true)))
100103
})
104+
t.Run("kubeadm config exists is ignored if there is a single CP machine and it is marked for remediation", func(t *testing.T) {
105+
g := NewWithT(t)
106+
controlPlane := &internal.ControlPlane{
107+
Cluster: &clusterv1.Cluster{},
108+
KCP: &controlplanev1.KubeadmControlPlane{},
109+
Machines: collections.FromMachines(
110+
&clusterv1.Machine{
111+
ObjectMeta: metav1.ObjectMeta{Name: "m1"},
112+
Status: clusterv1.MachineStatus{
113+
Conditions: []metav1.Condition{
114+
{
115+
Type: clusterv1.MachineHealthCheckSucceededCondition,
116+
Status: metav1.ConditionFalse,
117+
Reason: clusterv1.MachineHealthCheckNodeDeletedReason,
118+
},
119+
{
120+
Type: clusterv1.MachineOwnerRemediatedCondition,
121+
Status: metav1.ConditionFalse,
122+
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationReason,
123+
Message: "Waiting for remediation",
124+
},
125+
},
126+
},
127+
},
128+
),
129+
}
130+
controlPlane.InjectTestManagementCluster(&fakeManagementCluster{
131+
Workload: &fakeWorkloadCluster{
132+
Status: internal.ClusterStatus{
133+
HasKubeadmConfig: true,
134+
},
135+
},
136+
})
137+
138+
err := setControlPlaneInitialized(ctx, controlPlane)
139+
g.Expect(err).ToNot(HaveOccurred())
140+
141+
g.Expect(ptr.Deref(controlPlane.KCP.Status.Initialization.ControlPlaneInitialized, false)).To(BeFalse())
142+
143+
setInitializedCondition(ctx, controlPlane.KCP)
144+
c := conditions.Get(controlPlane.KCP, controlplanev1.KubeadmControlPlaneInitializedCondition)
145+
g.Expect(c).ToNot(BeNil())
146+
g.Expect(*c).To(conditions.MatchCondition(metav1.Condition{
147+
Type: controlplanev1.KubeadmControlPlaneInitializedCondition,
148+
Status: metav1.ConditionFalse,
149+
Reason: controlplanev1.KubeadmControlPlaneNotInitializedReason,
150+
}, conditions.IgnoreLastTransitionTime(true)))
151+
})
152+
t.Run("kubeadm config exists is ignored if there is a single CP machine and it is deleting", func(t *testing.T) {
153+
g := NewWithT(t)
154+
controlPlane := &internal.ControlPlane{
155+
Cluster: &clusterv1.Cluster{},
156+
KCP: &controlplanev1.KubeadmControlPlane{},
157+
Machines: collections.FromMachines(
158+
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1", DeletionTimestamp: ptr.To(metav1.Now())}},
159+
),
160+
}
161+
controlPlane.InjectTestManagementCluster(&fakeManagementCluster{
162+
Workload: &fakeWorkloadCluster{
163+
Status: internal.ClusterStatus{
164+
HasKubeadmConfig: true,
165+
},
166+
},
167+
})
168+
169+
err := setControlPlaneInitialized(ctx, controlPlane)
170+
g.Expect(err).ToNot(HaveOccurred())
171+
172+
g.Expect(ptr.Deref(controlPlane.KCP.Status.Initialization.ControlPlaneInitialized, false)).To(BeFalse())
173+
174+
setInitializedCondition(ctx, controlPlane.KCP)
175+
c := conditions.Get(controlPlane.KCP, controlplanev1.KubeadmControlPlaneInitializedCondition)
176+
g.Expect(c).ToNot(BeNil())
177+
g.Expect(*c).To(conditions.MatchCondition(metav1.Condition{
178+
Type: controlplanev1.KubeadmControlPlaneInitializedCondition,
179+
Status: metav1.ConditionFalse,
180+
Reason: controlplanev1.KubeadmControlPlaneNotInitializedReason,
181+
}, conditions.IgnoreLastTransitionTime(true)))
182+
})
101183
}
102184

103185
func TestSetReplicas(t *testing.T) {

0 commit comments

Comments
 (0)