Skip to content

Commit

Permalink
Futzing with the staging bringup script
Browse files Browse the repository at this point in the history
  • Loading branch information
szinn committed Feb 19, 2025
1 parent 795bb18 commit 780315f
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 62 deletions.
108 changes: 46 additions & 62 deletions kubernetes/staging/bootstrap/apps/resources/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,177 +2,161 @@

set -euo pipefail

# Set default values for the 'gum log' command
readonly LOG_ARGS=("log" "--time=rfc3339" "--formatter=text" "--structured" "--level")

# Verify required CLI tools are installed
function check_dependencies() {
local deps=("gum" "jq" "kubectl" "kustomize" "op" "talosctl" "yq")
local missing=()

for dep in "${deps[@]}"; do
if ! command -v "${dep}" &>/dev/null; then
missing+=("${dep}")
fi
done

if [ ${#missing[@]} -ne 0 ]; then
if ! command -v gum &>/dev/null; then
printf "%s \033[1;95m%s\033[0m Missing required dependencies \033[0;30mdependencies=\033[0m\"%s\"\n" \
"$(date --iso-8601=seconds)" "FATAL" "${missing[*]}"
exit 1
fi
gum "${LOG_ARGS[@]}" fatal "Missing required dependencies" dependencies "${missing[*]}"
fi
}
# shellcheck disable=SC2155
export ROOT_DIR="$(git rev-parse --show-toplevel)"
# shellcheck disable=SC1091
source "${ROOT_DIR}/scripts/common.sh"

# Talos requires the nodes to be 'Ready=False' before applying resources
function wait_for_nodes() {
gum "${LOG_ARGS[@]}" debug "Waiting for nodes to be available"
log debug "Waiting for nodes to be available"

# Skip waiting if all nodes are 'Ready=True'
if kubectl --context "${CLUSTER_CONTEXT}" wait nodes --for=condition=Ready=True --all --timeout=10s &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Nodes are available and ready, skipping wait for nodes"
if kubectl_cmd wait nodes --for=condition=Ready=True --all --timeout=10s &>/dev/null; then
log info "Nodes are available and ready, skipping wait for nodes"
return
fi

# Wait for all nodes to be 'Ready=False'
until kubectl --context "${CLUSTER_CONTEXT}" wait nodes --for=condition=Ready=False --all --timeout=10s &>/dev/null; do
gum "${LOG_ARGS[@]}" info "Nodes are not available, waiting for nodes to be available"
until kubectl_cmd wait nodes --for=condition=Ready=False --all --timeout=10s &>/dev/null; do
log info "Nodes are not available, waiting for nodes to be available"
sleep 10
done
}

# Applications in the helmfile require Prometheus custom resources (e.g. servicemonitors)
function apply_prometheus_crds() {
gum "${LOG_ARGS[@]}" debug "Applying Prometheus CRDs"
log debug "Applying Prometheus CRDs"

# renovate: datasource=github-releases depName=prometheus-operator/prometheus-operator
local -r version=v0.80.0
local resources crds

# Fetch resources using kustomize build
if ! resources=$(kustomize build "https://github.com/prometheus-operator/prometheus-operator/?ref=${version}" 2>/dev/null) || [[ -z "${resources}" ]]; then
gum "${LOG_ARGS[@]}" fatal "Failed to fetch Prometheus CRDs, check the version or the repository URL"
log fatal "Failed to fetch Prometheus CRDs, check the version or the repository URL"
fi

# Extract only CustomResourceDefinitions
if ! crds=$(echo "${resources}" | yq '. | select(.kind == "CustomResourceDefinition")' 2>/dev/null) || [[ -z "${crds}" ]]; then
gum "${LOG_ARGS[@]}" fatal "No CustomResourceDefinitions found in the fetched resources"
log fatal "No CustomResourceDefinitions found in the fetched resources"
fi

# Check if the CRDs are up-to-date
if echo "${crds}" | kubectl --context "${CLUSTER_CONTEXT}" diff --filename - &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Prometheus CRDs are up-to-date"
if echo "${crds}" | kubectl_cmd diff --filename - &>/dev/null; then
log info "Prometheus CRDs are up-to-date"
return
fi

# Apply the CRDs
if echo "${crds}" | kubectl --context "${CLUSTER_CONTEXT}" apply --server-side --filename - &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Prometheus CRDs applied successfully"
if echo "${crds}" | kubectl_cmd apply --server-side --filename - &>/dev/null; then
log info "Prometheus CRDs applied successfully"
else
gum "${LOG_ARGS[@]}" fatal "Failed to apply Prometheus CRDs"
log fatal "Failed to apply Prometheus CRDs"
fi
}

# The application namespaces are created before applying the resources
function apply_namespaces() {
gum "${LOG_ARGS[@]}" debug "Applying namespaces"
log debug "Applying namespaces"

local -r apps_dir="${KUBERNETES_DIR}/apps"

if [[ ! -d "${apps_dir}" ]]; then
gum "${LOG_ARGS[@]}" fatal "Directory does not exist" directory "${apps_dir}"
log fatal "Directory does not exist" directory "${apps_dir}"
fi

for app in "${apps_dir}"/*/; do
namespace=$(basename "${app}")

# Check if the namespace resources are up-to-date
if --context "${CLUSTER_CONTEXT}" get namespace "${namespace}" &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Namespace resource is up-to-date" resource "${namespace}"
log info "Namespace resource is up-to-date" resource "${namespace}"
continue
fi

# Apply the namespace resources
if kubectl --context "${CLUSTER_CONTEXT}" create namespace "${namespace}" --dry-run=client --output=yaml \
| kubectl --context "${CLUSTER_CONTEXT}" apply --server-side --filename - &>/dev/null;
if kubectl_cmd create namespace "${namespace}" --dry-run=client --output=yaml \
| kubectl_cmd apply --server-side --filename - &>/dev/null;
then
gum "${LOG_ARGS[@]}" info "Namespace resource applied" resource "${namespace}"
log info "Namespace resource applied" resource "${namespace}"
else
gum "${LOG_ARGS[@]}" fatal "Failed to apply namespace resource" resource "${namespace}"
log fatal "Failed to apply namespace resource" resource "${namespace}"
fi
done
}

# Secrets to be applied before the helmfile charts are installed
function apply_secrets() {
gum "${LOG_ARGS[@]}" debug "Applying secrets"
log debug "Applying secrets"

local -r secrets_file="${KUBERNETES_DIR}/bootstrap/apps/resources/secrets.yaml.tpl"
local resources

if [[ ! -f "${secrets_file}" ]]; then
gum "${LOG_ARGS[@]}" fatal "File does not exist" file "${secrets_file}"
log fatal "File does not exist" file "${secrets_file}"
fi

# Inject secrets into the template
if ! resources=$(op inject --in-file "${secrets_file}" 2>/dev/null) || [[ -z "${resources}" ]]; then
gum "${LOG_ARGS[@]}" fatal "Failed to inject secrets" file "${secrets_file}"
log fatal "Failed to inject secrets" file "${secrets_file}"
fi

# Check if the secret resources are up-to-date
if echo "${resources}" | kubectl --context "${CLUSTER_CONTEXT}" diff --filename - &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Secret resources are up-to-date"
if echo "${resources}" | kubectl_cmd diff --filename - &>/dev/null; then
log info "Secret resources are up-to-date"
return
fi

# Apply secret resources
if echo "${resources}" | kubectl --context "${CLUSTER_CONTEXT}" apply --server-side --filename - &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Secret resources applied"
if echo "${resources}" | kubectl_cmd apply --server-side --filename - &>/dev/null; then
log info "Secret resources applied"
else
gum "${LOG_ARGS[@]}" fatal "Failed to apply secret resources"
log fatal "Failed to apply secret resources"
fi
}

# Disks in use by rook-ceph must be wiped before Rook is installed
function wipe_rook_disks() {
gum "${LOG_ARGS[@]}" debug "Wiping Rook disks"
log debug "Wiping Rook disks"

if [[ -z "${ROOK_DISK:-}" ]]; then
gum "${LOG_ARGS[@]}" fatal "Environment variable not set" env_var ROOK_DISK
log fatal "Environment variable not set" env_var ROOK_DISK
fi

# Skip disk wipe if Rook is detected running in the cluster
if --context "${CLUSTER_CONTEXT}" --namespace rook-ceph get kustomization rook-ceph &>/dev/null; then
gum "${LOG_ARGS[@]}" warn "Rook is detected running in the cluster, skipping disk wipe"
log warn "Rook is detected running in the cluster, skipping disk wipe"
return
fi

# Wipe disks on each node that match the ROOK_DISK environment variable
for node in $(talosctl --context "${CLUSTER_CONTEXT}" config info --output json | jq --raw-output '.nodes | .[]'); do
for node in $(talosctl_cmd config info --output json | jq --raw-output '.nodes | .[]'); do
disk=$(
talosctl --context "${CLUSTER_CONTEXT}" --nodes "${node}" get disks --output json \
talosctl_cmd --nodes "${node}" get disks --output json \
| jq --raw-output 'select(.spec.model == env.ROOK_DISK) | .metadata.id' \
| xargs
)

if [[ -n "${disk}" ]]; then
gum "${LOG_ARGS[@]}" debug "Discovered Talos node and disk" node "${node}" disk "${disk}"
log debug "Discovered Talos node and disk" node "${node}" disk "${disk}"

if talosctl --context "${CLUSTER_CONTEXT}" --nodes "${node}" wipe disk "${disk}" &>/dev/null; then
gum "${LOG_ARGS[@]}" info "Disk wiped" node "${node}" disk "${disk}"
if talosctl_cmd --nodes "${node}" wipe disk "${disk}" &>/dev/null; then
log info "Disk wiped" node "${node}" disk "${disk}"
else
gum "${LOG_ARGS[@]}" fatal "Failed to wipe disk" node "${node}" disk "${disk}"
log fatal "Failed to wipe disk" node "${node}" disk "${disk}"
fi
else
gum "${LOG_ARGS[@]}" warn "No disks found" node "${node}" model "${ROOK_DISK:-}"
log warn "No disks found" node "${node}" model "${ROOK_DISK:-}"
fi
done
}

function main() {
check_dependencies
# Verifications before bootstrapping the cluster
check_env CLUSTER_CONTEXT
check_cli helmfile jq kubectl kustomize minijinja-cli op talosctl yq

wait_for_nodes
apply_prometheus_crds
apply_namespaces
Expand Down
91 changes: 91 additions & 0 deletions scripts/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env bash

set -euo pipefail

# Print a timestamped, colorized log line to stdout.
#
# Usage: log [level] message [key=value | word]...
#   level - info|warn|error|debug|fatal; unknown levels fall back to info.
#   Trailing arguments of the form key=value are rendered as a dim key
#   followed by the quoted value; any other word is appended verbatim.
#
# A 'fatal' level exits the whole script with status 1 after logging.
function log() {
  local level="${1:-info}"
  shift || true

  # ANSI escape (bold + 256-color foreground) per level.
  local -A colors=(
    [info]="\033[1m\033[38;5;87m"   # Cyan
    [warn]="\033[1m\033[38;5;192m"  # Yellow
    [error]="\033[1m\033[38;5;198m" # Red
    [debug]="\033[1m\033[38;5;63m"  # Blue
    [fatal]="\033[1m\033[38;5;92m"  # Purple
  )

  # Fall back to 'info' for unknown levels. The ':-' default is required:
  # expanding an unset associative-array subscript aborts the script under
  # 'set -u' on bash >= 4.4.
  if [[ -z "${colors[$level]:-}" ]]; then
    level="info"
  fi

  local color="${colors[$level]}"
  local msg="${1:-}"
  # '|| true' keeps a message-less call from tripping 'set -e' (bare
  # 'shift' returns non-zero when there is nothing to shift).
  shift || true

  # Build the structured-data suffix from the remaining arguments.
  local data=
  local item
  for item in "$@"; do
    if [[ "${item}" == *=* ]]; then
      data+="\033[1m\033[38;5;236m${item%%=*}=\033[0m\"${item#*=}\" "
    else
      data+="${item} "
    fi
  done

  printf "%s %b%s%b %s %b\n" "$(date --iso-8601=seconds)" \
    "${color}" "${level^^}" "\033[0m" "${msg}" "${data}"

  if [[ "$level" == "fatal" ]]; then
    exit 1
  fi
}

# Verify that every named environment variable is set and non-empty.
# Arguments: one or more variable names (not values).
# Logs a fatal message (which exits the script) when any are missing,
# otherwise logs a debug confirmation.
function check_env() {
  local missing=()
  local name

  for name in "$@"; do
    # ${!name-} is indirect expansion: the value of the variable whose
    # name is held in 'name', defaulting to empty when unset.
    [[ -n "${!name-}" ]] || missing+=("${name}")
  done

  if (( ${#missing[@]} > 0 )); then
    log fatal "Missing required env variables" "envs=${missing[*]}"
  fi

  log debug "Env variables are set" "envs=${*}"
}

# Verify that every named CLI tool is available on PATH.
# Arguments: one or more executable names.
# Logs a fatal message (which exits the script) when any are missing,
# otherwise logs a debug confirmation.
function check_cli() {
  local missing=()
  local tool

  for tool in "$@"; do
    # 'command -v' is the portable existence check (unlike 'which').
    command -v "${tool}" &>/dev/null || missing+=("${tool}")
  done

  if (( ${#missing[@]} > 0 )); then
    log fatal "Missing required deps" "deps=${missing[*]}"
  fi

  log debug "Deps are installed" "deps=${*}"
}

# Run talosctl against the cluster selected by ${CLUSTER_CONTEXT}.
# All arguments are forwarded verbatim. Quoting "$@" (instead of the
# previous unquoted $@, which needed a shellcheck SC2068 suppression)
# preserves arguments containing whitespace or glob characters.
function talosctl_cmd() {
  talosctl --context "${CLUSTER_CONTEXT}" "$@"
}

# Run kubectl against the cluster selected by ${CLUSTER_CONTEXT}.
# All arguments are forwarded verbatim. Quoting "$@" (instead of the
# previous unquoted $@, which needed a shellcheck SC2068 suppression)
# preserves arguments containing whitespace or glob characters.
function kubectl_cmd() {
  kubectl --context "${CLUSTER_CONTEXT}" "$@"
}

0 comments on commit 780315f

Please sign in to comment.