Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update GKE deployment script for kubernetes 1.32 #220

Merged
merged 1 commit into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions demo/clusters/gke/create-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,38 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

# Names and locations shared by the gcloud commands in this script.
# NETWORK_NAME and CLUSTER_NAME are both derived from DRIVER_NAME.
NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"
# NODE_VERSION is passed to --cluster-version and --node-version.
# NOTE(review): two assignments are visible in this view; in shell the later
# one wins, so "1.32" would be the effective value if both are present.
NODE_VERSION="1.31.1"
NODE_VERSION="1.32"
# ROUTER_REGION is passed to the NAT router commands (--region / --router-region).
ROUTER_REGION="us-central1"
# REGION is passed to the cluster and node-pool commands (--region,
# --node-locations) and to get-credentials (--location).
# NOTE(review): this value carries a "-c" suffix that ROUTER_REGION does not;
# confirm that every flag it is passed to accepts this form.
REGION="us-central1-c"

## Create the Network for the cluster
# Creates ${NETWORK_NAME} in project ${PROJECT_NAME} with auto subnet mode,
# an MTU of 1460 and regional BGP routing.
# NOTE(review): two --description options appear in this view (an escaped-space
# form and a quoted form); presumably only one belongs in the script - confirm.
gcloud compute networks create "${NETWORK_NAME}" \
--quiet \
--project="${PROJECT_NAME}" \
--description=Manually\ created\ network\ for\ TMS\ DRA\ Alpha\ cluster \
--description="Manually created network for DRA beta test cluster" \
--subnet-mode=auto \
--mtu=1460 \
--bgp-routing-mode=regional

## Create the cluster
# Creates ${CLUSTER_NAME} on ${NETWORK_NAME} with one node, on the rapid
# release channel, with --enable-kubernetes-alpha and the four
# resource.k8s.io/v1beta1 APIs (deviceclasses, resourceclaims,
# resourceclaimtemplates, resourceslices) switched on. Both the cluster
# version and the node version come from NODE_VERSION.
# NOTE(review): this view shows two variants of the autoupgrade, --region and
# final --node-version lines; presumably only one of each belongs in the
# script - confirm against the actual file.
# NOTE(review): the last --node-version line ends with a continuation
# backslash followed by a blank line. The command still ends at the blank
# line, but any line added directly beneath it would be folded into this
# command.
gcloud container clusters create "${CLUSTER_NAME}" \
--quiet \
--enable-kubernetes-alpha \
--enable-kubernetes-unstable-apis="resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices" \
--release-channel=rapid \
--no-enable-autorepair \
--no-enable-autoupgrade \
--region us-west1 \
--enable-autoupgrade \
--region "${REGION}" \
--num-nodes "1" \
--network "${NETWORK_NAME}" \
--cluster-version "${NODE_VERSION}" \
--node-version "${NODE_VERSION}"
--node-version "${NODE_VERSION}" \

# Create t4 node pool
gcloud beta container node-pools create "pool-1" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--region "${REGION}" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-t4,count=1" \
Expand All @@ -77,19 +80,19 @@ gcloud beta container node-pools create "pool-1" \
--min-nodes "2" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-locations "${REGION}" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

# Create v100 node pool
gcloud beta container node-pools create "pool-2" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--region "${REGION}" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-v100,count=1" \
Expand All @@ -103,30 +106,30 @@ gcloud beta container node-pools create "pool-2" \
--min-nodes "1" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-locations "${REGION}" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

## Allow the GPU nodes access to the internet
# Creates the router ${NETWORK_NAME}-nat-router on ${NETWORK_NAME}; the NAT
# config created by the following command refers to it via --router.
# NOTE(review): the router name is the only unquoted expansion in this
# command; quoting it would match the other arguments.
# NOTE(review): two --region lines are visible in this view, and the
# "${ROUTER_REGION}" one ends with a continuation backslash before a blank
# line - confirm which lines belong in the script.
gcloud compute routers create ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--network "${NETWORK_NAME}" \
--region "us-west1"
--region "${ROUTER_REGION}" \

# Adds the NAT config ${NETWORK_NAME}-nat-config to ${NETWORK_NAME}-nat-router,
# covering all subnet IP ranges (--nat-all-subnet-ip-ranges) with
# automatically allocated external IPs (--auto-allocate-nat-external-ips).
# NOTE(review): two --router-region lines are visible in this view, and the
# "${ROUTER_REGION}" one ends with a continuation backslash before a blank
# line - confirm which lines belong in the script.
gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--nat-all-subnet-ip-ranges \
--auto-allocate-nat-external-ips \
--router-region "us-west1"
--router-region "${ROUTER_REGION}" \

## Start using this cluster for kubectl
# NOTE(review): two get-credentials invocations are visible in this view, one
# with a literal "us-west1" location and one using ${REGION}; presumably only
# the one matching the location the cluster was created in should remain.
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="${REGION}"

## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version-
Expand Down
8 changes: 5 additions & 3 deletions demo/clusters/gke/delete-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,27 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

# Resource names derived from DRIVER_NAME, plus the locations handed to the
# delete commands in this script.
NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"
# ROUTER_REGION is used for the NAT config and NAT router deletions;
# REGION is used for the cluster deletion.
ROUTER_REGION="us-central1"
REGION="us-central1-c"

## Delete the cluster
# Removes ${CLUSTER_NAME} from ${PROJECT_NAME} without prompting (--quiet).
# NOTE(review): two --region lines are visible in this view (a literal
# "us-west1" and "${REGION}"); presumably only one belongs in the script.
gcloud container clusters delete "${CLUSTER_NAME}" \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"
--region "${REGION}"

## Delete the nat config
# Removes ${NETWORK_NAME}-nat-config from ${NETWORK_NAME}-nat-router; this
# runs before the router itself is deleted.
# NOTE(review): two --router-region lines are visible in this view (a literal
# "us-west1" and "${ROUTER_REGION}"); presumably only one belongs in the script.
gcloud compute routers nats delete "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--router-region "us-west1"
--router-region "${ROUTER_REGION}"

## Delete the nat router
# Removes ${NETWORK_NAME}-nat-router from ${PROJECT_NAME} without prompting.
# NOTE(review): the router name is unquoted, unlike the other expansions in
# this command; quoting it would be consistent.
# NOTE(review): two --region lines are visible in this view (a literal
# "us-west1" and "${ROUTER_REGION}"); presumably only one belongs in the script.
gcloud compute routers delete ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"
--region "${ROUTER_REGION}"

## Delete the network
gcloud compute networks delete "${NETWORK_NAME}" \
Expand Down
2 changes: 1 addition & 1 deletion demo/clusters/gke/install-dra-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

# Image coordinates for the driver. Each line uses the ': ${VAR:=default}'
# idiom, which assigns the default only when VAR is unset or empty, so a
# value already set in the environment is kept.
# NOTE(review): two IMAGE_TAG defaults are visible in this view. With ':='
# the first one to run sets the variable and the second becomes a no-op, so
# only one of these lines should remain in the script.
# NOTE(review): the expansions are unquoted, so the resulting values are
# subject to word splitting and globbing when passed to ':'.
: ${IMAGE_REGISTRY:=ghcr.io/nvidia}
: ${IMAGE_NAME:=${DRIVER_NAME}}
: ${IMAGE_TAG:=32805fec-ubi8}
: ${IMAGE_TAG:=6c34f5fb-ubi8}

helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
Expand Down
Loading