Commit a6d4f80

ci: Add workload cluster creation and kubectl verification to smoke test

This ensures the Cluster API Operator can successfully create and manage a functional Kubernetes cluster where kubectl commands work properly.

Signed-off-by: kahirokunn <[email protected]>

1 parent c697f7f, commit a6d4f80

1 file changed: +297 -3 lines

.github/workflows/smoke-test.yaml

Lines changed: 297 additions & 3 deletions
@@ -33,6 +33,12 @@ jobs:
           chmod +x kubectl
           sudo mv kubectl /usr/local/bin/

+      - name: Install yq
+        run: |
+          wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O yq
+          chmod +x yq
+          sudo mv yq /usr/local/bin/
+
       - name: Install Helm
         run: |
           curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
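A side note on the new yq step: it fetches whatever release is latest at run time, so the job is not fully reproducible. A pinned variant, with an illustrative version number that the commit itself does not specify:

    YQ_VERSION=v4.44.1  # illustrative pin, not from this commit
    wget "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" -O yq
    chmod +x yq && sudo mv yq /usr/local/bin/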
@@ -59,7 +65,21 @@ jobs:
         run: |
           chmod +x ./hack/ensure-kind.sh
           ./hack/ensure-kind.sh
-          kind create cluster --name capi-operator-smoke-test --wait 5m
+
+          # Create kind cluster with Docker socket mount for CAPD
+          cat <<EOF > /tmp/kind-config.yaml
+          kind: Cluster
+          apiVersion: kind.x-k8s.io/v1alpha4
+          nodes:
+          - role: control-plane
+            extraMounts:
+            - hostPath: /var/run/docker.sock
+              containerPath: /var/run/docker.sock
+            - hostPath: /var/lib/docker
+              containerPath: /var/lib/docker
+          EOF
+
+          kind create cluster --name capi-operator-smoke-test --config /tmp/kind-config.yaml --wait 5m
           kubectl cluster-info --context kind-capi-operator-smoke-test

       - name: Load Docker image to kind
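Why the extraMounts block: CAPD (the Docker infrastructure provider) runs inside the management kind cluster but provisions workload-cluster machines as sibling containers on the host Docker daemon, so the kind node must be able to reach the host's /var/run/docker.sock. A quick sanity check that the mount landed, using kind's <cluster>-control-plane container naming:

    docker exec capi-operator-smoke-test-control-plane ls -l /var/run/docker.sock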
@@ -115,9 +135,26 @@ jobs:
           core:
             cluster-api:
               namespace: capi-system
+          bootstrap:
+            kubeadm:
+              namespace: capi-kubeadm-bootstrap-system
+          controlPlane:
+            kubeadm:
+              namespace: capi-kubeadm-control-plane-system
           infrastructure:
             docker:
               namespace: capd-system
+          manager:
+            featureGates:
+              core:
+                ClusterTopology: true
+                ClusterResourceSet: true
+                MachinePool: true
+              kubeadm:
+                ClusterTopology: true
+                MachinePool: true
+              docker:
+                ClusterTopology: true
           EOF

           # Use exact providers chart filename based on HELM_CHART_TAG
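The manager.featureGates block switches on optional CAPI features per provider; ClusterTopology is the gate that ClusterClass-based clusters need, and it is enabled consistently on the core, kubeadm, and docker sides here. In the operator's Helm chart these values are rendered into each controller's --feature-gates flag, which can be spot-checked after install (a sketch, assuming the flag lands in the first container's args):

    kubectl -n capi-system get deployment capi-controller-manager \
      -o jsonpath='{.spec.template.spec.containers[0].args}'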
@@ -137,16 +174,30 @@ jobs:

       - name: Wait for providers to be ready
         run: |
-          echo "Waiting for Core Provider to be ready..."
+          echo "=== Waiting for Core Provider to be ready ==="
           kubectl wait --for=condition=Ready --timeout=300s -n capi-system coreprovider/cluster-api || true

-          echo "Waiting for Infrastructure Provider to be ready..."
+          echo -e "\n=== Waiting for Bootstrap Provider to be ready ==="
+          kubectl wait --for=condition=Ready --timeout=300s -n capi-kubeadm-bootstrap-system bootstrapprovider/kubeadm || true
+
+          echo -e "\n=== Waiting for Control Plane Provider to be ready ==="
+          kubectl wait --for=condition=Ready --timeout=300s -n capi-kubeadm-control-plane-system controlplaneprovider/kubeadm || true
+
+          echo -e "\n=== Waiting for Infrastructure Provider to be ready ==="
           kubectl wait --for=condition=Ready --timeout=300s -n capd-system infrastructureprovider/docker || true

           # Additional wait for deployments
+          echo -e "\n=== Waiting for provider deployments ==="
           kubectl wait --for=condition=Available --timeout=300s -n capi-system deployment/capi-controller-manager || true
+          kubectl wait --for=condition=Available --timeout=300s -n capi-kubeadm-bootstrap-system deployment/capi-kubeadm-bootstrap-controller-manager || true
+          kubectl wait --for=condition=Available --timeout=300s -n capi-kubeadm-control-plane-system deployment/capi-kubeadm-control-plane-controller-manager || true
           kubectl wait --for=condition=Available --timeout=300s -n capd-system deployment/capd-controller-manager || true

+          # Wait for webhooks to be ready
+          echo -e "\n=== Waiting for webhook services ==="
+          kubectl wait --for=jsonpath='{.status.loadBalancer}' --timeout=300s -n capi-kubeadm-bootstrap-system service/capi-kubeadm-bootstrap-webhook-service || true
+          kubectl wait --for=jsonpath='{.status.loadBalancer}' --timeout=300s -n capi-kubeadm-control-plane-system service/capi-kubeadm-control-plane-webhook-service || true
+
       - name: Verify installation
         run: |
           echo "=== Cluster API Operator Status ==="
@@ -156,13 +207,27 @@ jobs:
           kubectl get coreprovider -A -o wide
           kubectl describe coreprovider -n capi-system cluster-api || true

+          echo -e "\n=== Bootstrap Provider Status ==="
+          kubectl get bootstrapprovider -A -o wide
+          kubectl describe bootstrapprovider -n capi-kubeadm-bootstrap-system kubeadm || true
+
+          echo -e "\n=== Control Plane Provider Status ==="
+          kubectl get controlplaneprovider -A -o wide
+          kubectl describe controlplaneprovider -n capi-kubeadm-control-plane-system kubeadm || true
+
           echo -e "\n=== Infrastructure Provider Status ==="
           kubectl get infrastructureprovider -A -o wide
           kubectl describe infrastructureprovider -n capd-system docker || true

           echo -e "\n=== All Pods ==="
           kubectl get pods -A | grep -E "(capi-|capd-)"

+          echo -e "\n=== Webhook Services ==="
+          kubectl get svc -A | grep webhook
+
+          echo -e "\n=== Webhook Certificates ==="
+          kubectl get certificate,certificaterequest -A | grep -E "(capi-|capd-)"
+
           echo -e "\n=== CRDs ==="
           kubectl get crds | grep -E "(cluster.x-k8s.io|operator.cluster.x-k8s.io)"

@@ -176,6 +241,22 @@ jobs:
             exit 1
           fi

+          # Check if bootstrap provider is ready
+          BOOTSTRAP_READY=$(kubectl get bootstrapprovider -n capi-kubeadm-bootstrap-system kubeadm -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
+          if [ "$BOOTSTRAP_READY" != "True" ]; then
+            echo "Bootstrap provider is not ready"
+            kubectl get bootstrapprovider -n capi-kubeadm-bootstrap-system kubeadm -o yaml
+            exit 1
+          fi
+
+          # Check if control plane provider is ready
+          CONTROLPLANE_READY=$(kubectl get controlplaneprovider -n capi-kubeadm-control-plane-system kubeadm -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
+          if [ "$CONTROLPLANE_READY" != "True" ]; then
+            echo "Control plane provider is not ready"
+            kubectl get controlplaneprovider -n capi-kubeadm-control-plane-system kubeadm -o yaml
+            exit 1
+          fi
+
           # Check if infrastructure provider is ready
           INFRA_READY=$(kubectl get infrastructureprovider -n capd-system docker -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
           if [ "$INFRA_READY" != "True" ]; then
@@ -186,6 +267,140 @@ jobs:

           echo "All providers are ready!"

+          # Additional webhook readiness check
+          echo -e "\n=== Checking webhook endpoints ==="
+          kubectl get endpoints -A | grep webhook
+
+      - name: Download cluster manifest
+        run: |
+          echo "=== Downloading cluster manifest ==="
+          curl -L https://raw.githubusercontent.com/kubernetes-sigs/cluster-api/refs/heads/main/test/infrastructure/docker/examples/simple-cluster.yaml -o simple-cluster.yaml
+
+          # Show the manifest for debugging
+          echo "=== Cluster manifest ==="
+          cat simple-cluster.yaml
+
+          # Extract cluster name from the manifest using yq
+          CLUSTER_NAME=$(yq eval 'select(.kind == "Cluster") | .metadata.name' simple-cluster.yaml)
+
+          # Ensure cluster name was extracted successfully
+          if [ -z "$CLUSTER_NAME" ]; then
+            echo "ERROR: Failed to extract cluster name from simple-cluster.yaml"
+            echo "Please check the manifest structure"
+            exit 1
+          fi
+
+          echo "Detected cluster name: $CLUSTER_NAME"
+          echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_ENV
+
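simple-cluster.yaml is a multi-document manifest, and yq's select(.kind == "Cluster") filters the document stream down to the one whose kind matches before reading .metadata.name. The same idiom works for any of the other kinds in the file, for example:

    yq eval 'select(.kind == "DockerCluster") | .metadata.name' simple-cluster.yaml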
+      - name: Create workload cluster
+        run: |
+          echo "=== Pre-creation diagnostics ==="
+          echo "Checking webhook services..."
+          kubectl get svc -A | grep webhook
+
+          echo -e "\nChecking webhook endpoints..."
+          kubectl get endpoints -A | grep webhook
+
+          echo -e "\nChecking webhook certificates..."
+          kubectl get secret -A | grep webhook-service-cert
+
+          echo -e "\n=== Analyzing cluster manifest for CNI configuration ==="
+          echo "Checking for CNI-related settings in simple-cluster.yaml:"
+          grep -i "cni\|calico\|flannel\|weave\|cilium" simple-cluster.yaml || echo "No CNI configuration found in manifest"
+
+          echo -e "\n=== Checking KubeadmControlPlane configuration ==="
+          yq eval 'select(.kind == "KubeadmControlPlane") | .spec' simple-cluster.yaml || echo "Could not extract KubeadmControlPlane spec"
+
+          echo -e "\n=== Creating workload cluster ==="
+          kubectl apply -f simple-cluster.yaml
+
+          echo -e "\n=== Cluster resources created ==="
+          kubectl get cluster,dockercluster,kubeadmcontrolplane,machinedeployment -A
+
+      - name: Get workload cluster kubeconfig
+        run: |
+          echo "=== Getting workload cluster kubeconfig ==="
+          # Get kubeconfig from the cluster
+          kubectl get secret ${CLUSTER_NAME}-kubeconfig -o jsonpath='{.data.value}' | base64 -d > ${CLUSTER_NAME}.kubeconfig
+
+          echo "=== Testing kubeconfig ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig cluster-info || echo "Cluster API endpoint may not be ready yet"
+
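Cluster API publishes a workload cluster's admin kubeconfig in a secret named <cluster-name>-kubeconfig, base64-encoded under .data.value, which is exactly what this step decodes. clusterctl wraps the same lookup, assuming clusterctl is available on the runner:

    clusterctl get kubeconfig ${CLUSTER_NAME} > ${CLUSTER_NAME}.kubeconfig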
+      - name: Install CNI plugin (Calico) using Helm
+        run: |
+          echo "=== Installing Calico CNI plugin using Helm ==="
+
+          # Add Calico Helm repository
+          helm repo add projectcalico https://docs.tigera.io/calico/charts --kubeconfig=${CLUSTER_NAME}.kubeconfig
+          helm repo update --kubeconfig=${CLUSTER_NAME}.kubeconfig
+
+          # Install Calico using Helm with values from CAPI Azure provider
+          helm install calico projectcalico/tigera-operator \
+            --kubeconfig=${CLUSTER_NAME}.kubeconfig \
+            -f https://raw.githubusercontent.com/kubernetes-sigs/cluster-api-provider-azure/main/templates/addons/calico/values.yaml \
+            --namespace tigera-operator \
+            --create-namespace \
+            --wait \
+            --timeout 5m
+
+          echo "=== Waiting for Calico to be ready ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n tigera-operator -l app.kubernetes.io/name=tigera-operator
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n calico-system --all
+
+          echo "=== Calico installation complete ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n tigera-operator
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n calico-system
+
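The ordering is deliberate: kubelet keeps a node NotReady until a CNI plugin is configured, so Calico has to land before the next step's wait on node readiness can pass. While debugging, a per-node view of the Ready condition's reason makes the dependency visible:

    kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get nodes \
      -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].reason}{"\n"}{end}'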
+      - name: Wait for nodes to be ready
+        run: |
+          echo "=== Waiting for control plane node to be ready ==="
+          # Wait for the node to become ready after CNI installation
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s nodes --all
+
+          echo "=== Checking node status ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get nodes -o wide
+
+          echo "=== Waiting for control plane replicas ==="
+          kubectl wait --for=jsonpath='{.status.readyReplicas}'=1 --timeout=300s kubeadmcontrolplane -l cluster.x-k8s.io/cluster-name=${CLUSTER_NAME}
+
+          echo "=== Final cluster status ==="
+          kubectl get cluster ${CLUSTER_NAME} -o wide
+          kubectl get machines -l cluster.x-k8s.io/cluster-name=${CLUSTER_NAME}
+
+      - name: Verify kubectl commands work on workload cluster
+        run: |
+          echo "=== Testing kubectl get po on workload cluster ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get po -A
+
+          echo -e "\n=== Testing kubectl get nodes ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get nodes
+
+          echo -e "\n=== Verifying CNI is working ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n calico-system
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n tigera-operator
+
+          echo -e "\n=== Waiting for system pods to be ready ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n kube-system -l k8s-app=kube-proxy
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-apiserver
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-controller-manager
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-scheduler
+
+      - name: Verify cluster functionality
+        run: |
+          echo "=== Final cluster verification ==="
+          echo "Cluster nodes:"
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get nodes -o wide
+
+          echo -e "\nAll pods:"
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get po -A
+
+          echo -e "\nAll services:"
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get svc -A
+
+          echo -e "\nCluster info:"
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig cluster-info
+
       - name: Collect debug information on failure
         if: failure()
         run: |
@@ -198,17 +413,96 @@ jobs:
           echo -e "\n=== Core Provider Logs ==="
           kubectl logs -n capi-system deployment/capi-controller-manager --tail=100 || true

+          echo -e "\n=== Bootstrap Provider Logs ==="
+          kubectl logs -n capi-kubeadm-bootstrap-system deployment/capi-kubeadm-bootstrap-controller-manager --tail=100 || true
+
+          echo -e "\n=== Control Plane Provider Logs ==="
+          kubectl logs -n capi-kubeadm-control-plane-system deployment/capi-kubeadm-control-plane-controller-manager --tail=100 || true
+
           echo -e "\n=== Infrastructure Provider Logs ==="
           kubectl logs -n capd-system deployment/capd-controller-manager --tail=100 || true

+          echo -e "\n=== Webhook Services and Endpoints ==="
+          kubectl get svc,endpoints -A | grep webhook || true
+
+          echo -e "\n=== Webhook Certificates ==="
+          kubectl get certificate,certificaterequest,secret -A | grep -E "(webhook|serving-cert)" || true
+
+          echo -e "\n=== Cluster Resources ==="
+          kubectl get cluster,dockercluster,kubeadmcontrolplane,machine,dockermachine -A -o wide || true
+
+          echo -e "\n=== Describe Cluster ==="
+          kubectl describe cluster ${CLUSTER_NAME} || true
+
+          echo -e "\n=== Describe Machines ==="
+          kubectl describe machines -l cluster.x-k8s.io/cluster-name=${CLUSTER_NAME} || true
+
+          echo -e "\n=== Docker Containers ==="
+          docker ps -a | grep -E "(smoke-test|kind)" || true
+
+          echo -e "\n=== Kind Clusters ==="
+          kind get clusters || true
+
           echo -e "\n=== Describe Failed Pods ==="
           kubectl get pods -A | grep -v Running | grep -v Completed | tail -n +2 | while read namespace name ready status restarts age; do
             echo "Describing pod $name in namespace $namespace"
             kubectl describe pod -n $namespace $name
             echo "---"
           done

+          echo -e "\n=== CNI Diagnostics ==="
+          echo "Checking Calico installation status..."
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n tigera-operator -o wide || true
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n calico-system -o wide || true
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig get pods -n calico-apiserver -o wide || true
+
+          echo -e "\n=== Calico logs ==="
+          kubectl --kubeconfig=${CLUSTER_NAME}.kubeconfig logs -n tigera-operator -l app.kubernetes.io/name=tigera-operator --tail=50 || true
+
+          echo -e "\n=== Node CNI status ==="
+          CONTROL_PLANE_CONTAINER=$(docker ps -a | grep ${CLUSTER_NAME}-controlplane | awk '{print $1}' | head -1)
+          if [ ! -z "$CONTROL_PLANE_CONTAINER" ]; then
+            echo "Control plane container: $CONTROL_PLANE_CONTAINER"
+
+            echo "=== Checking CNI binaries ==="
+            docker exec $CONTROL_PLANE_CONTAINER ls -la /opt/cni/bin/ || echo "CNI binaries directory not found"
+
+            echo -e "\n=== Checking CNI configuration ==="
+            docker exec $CONTROL_PLANE_CONTAINER ls -la /etc/cni/net.d/ || echo "CNI config directory not found"
+            docker exec $CONTROL_PLANE_CONTAINER cat /etc/cni/net.d/* 2>/dev/null || echo "No CNI config files found"
+
+            echo -e "\n=== Checking kubelet configuration ==="
+            docker exec $CONTROL_PLANE_CONTAINER cat /var/lib/kubelet/kubeadm-flags.env || true
+            docker exec $CONTROL_PLANE_CONTAINER ps aux | grep kubelet || true
+
+            echo -e "\n=== Node status inside container ==="
+            docker exec $CONTROL_PLANE_CONTAINER kubectl --kubeconfig=/etc/kubernetes/admin.conf get nodes -o wide || true
+            docker exec $CONTROL_PLANE_CONTAINER kubectl --kubeconfig=/etc/kubernetes/admin.conf describe nodes || true
+          fi
+
+          echo -e "\n=== CAPD Provider Configuration ==="
+          kubectl get dockercluster ${CLUSTER_NAME} -o yaml || true
+          kubectl get dockermachinetemplate -A -o yaml || true
+
+          echo -e "\n=== Helm releases ==="
+          helm list --all-namespaces --kubeconfig=${CLUSTER_NAME}.kubeconfig || true
+
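Because CAPD machines are ordinary Docker containers, the diagnostics above can docker exec straight into the workload control plane and query it with the admin kubeconfig at /etc/kubernetes/admin.conf. The same trick works interactively; the grep pattern assumes the manifest's control-plane machines carry "controlplane" in their names, and crictl ships in the kind node image:

    CP=$(docker ps -a | grep ${CLUSTER_NAME}-controlplane | awk '{print $1}' | head -1)
    docker exec -it $CP crictl ps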
       - name: Clean up
         if: always()
         run: |
+          echo "=== Cleaning up kind clusters ==="
+          # List all kind clusters before cleanup
+          echo "Current kind clusters:"
+          kind get clusters || true
+
+          # Delete workload cluster if it exists
+          echo "Deleting workload cluster: ${CLUSTER_NAME}"
+          kind delete cluster --name ${CLUSTER_NAME} || true
+
+          # Delete management cluster
+          echo "Deleting management cluster: capi-operator-smoke-test"
           kind delete cluster --name capi-operator-smoke-test || true
+
+          # Verify all clusters are deleted
+          echo "Remaining kind clusters:"
+          kind get clusters || true
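One caveat on cleanup: the workload cluster is created by CAPD, not by kind, so the kind delete of ${CLUSTER_NAME} is a best-effort call guarded by || true rather than a guaranteed teardown. Deleting the Cluster object instead lets CAPD remove the machine containers itself, assuming the management cluster is still running at that point:

    kubectl delete cluster ${CLUSTER_NAME} --wait --timeout=5m || true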
