Skip to content

Commit 0e396b2

Browse files
committed
Merge pull request kubernetes#20407 from gmarek/1000-kube-up
Allow some NotReady nodes in 1000 node clusters
2 parents 8f4a10e + 6aaabc6 commit 0e396b2

File tree

2 files changed

+48
-2
lines changed

2 files changed

+48
-2
lines changed

cluster/validate-cluster.sh

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
# limitations under the License.
1616

1717
# Validates that the cluster is healthy.
18+
# Exit codes are:
19+
# 0 - success
20+
# 1 - fatal (cluster is unlikely to work)
21+
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)
1822

1923
set -o errexit
2024
set -o nounset
@@ -29,11 +33,14 @@ fi
2933
source "${KUBE_ROOT}/cluster/kube-env.sh"
3034
source "${KUBE_ROOT}/cluster/kube-util.sh"
3135

36+
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
37+
3238
EXPECTED_NUM_NODES="${NUM_NODES}"
3339
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
3440
EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1))
3541
fi
3642
# Make several attempts to deal with slow cluster birth.
43+
return_value=0
3744
attempt=0
3845
while true; do
3946
# The "kubectl get nodes -o template" exports node information.
@@ -59,7 +66,12 @@ while true; do
5966
if (( attempt > 100 )); then
6067
echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
6168
"${KUBE_ROOT}/cluster/kubectl.sh" get nodes
62-
exit 2
69+
if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
70+
exit 1
71+
else
72+
return_value=2
73+
break
74+
fi
6375
else
6476
echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
6577
fi
@@ -99,4 +111,10 @@ done
99111

100112
echo "Validate output:"
101113
"${KUBE_ROOT}/cluster/kubectl.sh" get cs
102-
echo -e "${color_green}Cluster validation succeeded${color_norm}"
114+
if [ "${return_value}" == "0" ]; then
115+
echo -e "${color_green}Cluster validation succeeded${color_norm}"
116+
else
117+
echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
118+
fi
119+
120+
exit "${return_value}"

hack/jenkins/e2e.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ case ${JOB_NAME} in
533533

534534
# Runs the performance/scalability test on huge 1000-node cluster on GCE.
535535
# Flannel is used as network provider.
536+
# Allows a couple of nodes to be NotReady during startup
536537
kubernetes-e2e-gce-enormous-cluster)
537538
: ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-cluster"}
538539
: ${E2E_NETWORK:="e2e-enormous-cluster"}
@@ -551,6 +552,32 @@ case ${JOB_NAME} in
551552
NODE_SIZE="n1-standard-1"
552553
NODE_DISK_SIZE="50GB"
553554
NUM_NODES="1000"
555+
ALLOWED_NOTREADY_NODES="2"
556+
# Reduce log verbosity
557+
TEST_CLUSTER_LOG_LEVEL="--v=1"
558+
# Increase resync period to simulate production
559+
TEST_CLUSTER_RESYNC_PERIOD="--min-resync-period=12h"
560+
;;
561+
562+
# Starts and tears down 1000-node cluster on GCE using flannel networking
563+
# Requires all 1000 nodes to come up.
564+
kubernetes-e2e-gce-enormous-startup)
565+
: ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"}
566+
# TODO: increase the quota for networks in kubernetes-scale and move this test to its own network
567+
: ${E2E_NETWORK:="e2e-enormous-cluster"}
568+
: ${E2E_TEST:="false"}
569+
: ${KUBE_GCE_INSTANCE_PREFIX:="e2e-enormous-startup"}
570+
: ${PROJECT:="kubernetes-scale"}
571+
# Override GCE defaults.
572+
NETWORK_PROVIDER="flannel"
573+
# Temporarily switch off Heapster, as it will not schedule anywhere.
574+
# TODO: Think of a solution to enable it.
575+
ENABLE_CLUSTER_MONITORING="none"
576+
E2E_ZONE="asia-east1-a"
577+
MASTER_SIZE="n1-standard-32"
578+
NODE_SIZE="n1-standard-1"
579+
NODE_DISK_SIZE="50GB"
580+
NUM_NODES="1000"
554581
# Reduce log verbosity
555582
TEST_CLUSTER_LOG_LEVEL="--v=1"
556583
# Increase resync period to simulate production
@@ -925,6 +952,7 @@ export KUBE_GCE_NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-}
925952
export KUBE_OS_DISTRIBUTION=${KUBE_OS_DISTRIBUTION:-}
926953
export GCE_SERVICE_ACCOUNT=$(gcloud auth list 2> /dev/null | grep active | cut -f3 -d' ')
927954
export FAIL_ON_GCP_RESOURCE_LEAK="${FAIL_ON_GCP_RESOURCE_LEAK:-false}"
955+
export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-}
928956

929957
# GKE variables
930958
export CLUSTER_NAME=${E2E_CLUSTER_NAME}

0 commit comments

Comments
 (0)