diff --git a/Makefile b/Makefile index d12b3db626..ba4aa3af18 100644 --- a/Makefile +++ b/Makefile @@ -141,6 +141,9 @@ test-python-integration: ## Run Python integration test. pytest ./test/integration/initializer ##@ Helm +.PHONY: sync-manifests +sync-manifests: ## Sync Kustomize manifests from manifests templated from Helm chart. + hack/sync-manifests.sh .PHONY: helm-unittest helm-unittest: helm-unittest-plugin ## Run Helm chart unittests. diff --git a/charts/trainer/templates/controller/_helpers.tpl b/charts/trainer/templates/controller/_helpers.tpl index 8a2f8d0c2a..a472de8c77 100644 --- a/charts/trainer/templates/controller/_helpers.tpl +++ b/charts/trainer/templates/controller/_helpers.tpl @@ -45,41 +45,6 @@ app.kubernetes.io/component: controller {{- end }} {{- end -}} -{{/* -Create the name of the controller service account. -*/}} -{{- define "trainer.controller.serviceAccountName" -}} -{{ include "trainer.controller.name" . }} -{{- end -}} - -{{/* -Create the name of the controller cluster role. -*/}} -{{- define "trainer.controller.clusterRoleName" -}} -{{ include "trainer.controller.name" . }} -{{- end -}} - -{{/* -Create the name of the controller cluster role binding. -*/}} -{{- define "trainer.controller.clusterRoleBindingName" -}} -{{ include "trainer.controller.name" . }} -{{- end -}} - -{{/* -Create the name of the controller role. -*/}} -{{- define "trainer.controller.roleName" -}} -{{ include "trainer.controller.name" . }} -{{- end -}} - -{{/* -Create the name of the controller role binding. -*/}} -{{- define "trainer.controller.roleBindingName" -}} -{{ include "trainer.controller.name" . }} -{{- end -}} - {{/* Create the name of the controller deployment. */}} @@ -90,24 +55,3 @@ Create the name of the controller deployment. {{- define "trainer.controller.serviceName" -}} {{ include "trainer.controller.name" . }}-service {{- end -}} - -{{/* -Create the name of the webhook. -*/}} -{{- define "trainer.webhook.name" -}} -{{ include "trainer.name" . 
}}-webhook -{{- end -}} - -{{/* -Create the name of the webhook secret. -*/}} -{{- define "trainer.webhook.secretName" -}} -{{ include "trainer.webhook.name" . }}-cert -{{- end -}} - -{{/* -Create the name of the validating webhook configuration. -*/}} -{{- define "trainer.validatingWebhookConfigurationName" -}} -validator.trainer.kubeflow.org -{{- end -}} diff --git a/charts/trainer/templates/rbac/_helpers.tpl b/charts/trainer/templates/rbac/_helpers.tpl new file mode 100644 index 0000000000..a10ec053e0 --- /dev/null +++ b/charts/trainer/templates/rbac/_helpers.tpl @@ -0,0 +1,50 @@ +{{/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{/* +Create the name of the controller service account. +*/}} +{{- define "trainer.controller.serviceAccountName" -}} +{{ include "trainer.controller.name" . }} +{{- end -}} + +{{/* +Create the name of the controller cluster role. +*/}} +{{- define "trainer.controller.clusterRoleName" -}} +{{ include "trainer.controller.name" . }} +{{- end -}} + +{{/* +Create the name of the controller cluster role binding. +*/}} +{{- define "trainer.controller.clusterRoleBindingName" -}} +{{ include "trainer.controller.name" . }} +{{- end -}} + +{{/* +Create the name of the controller role. +*/}} +{{- define "trainer.controller.roleName" -}} +{{ include "trainer.controller.name" . }} +{{- end -}} + +{{/* +Create the name of the controller role binding. 
+*/}} +{{- define "trainer.controller.roleBindingName" -}} +{{ include "trainer.controller.name" . }} +{{- end -}} diff --git a/charts/trainer/templates/controller/rbac.yaml b/charts/trainer/templates/rbac/clusterrole.yaml similarity index 60% rename from charts/trainer/templates/controller/rbac.yaml rename to charts/trainer/templates/rbac/clusterrole.yaml index 1fba61937f..156ef90c35 100644 --- a/charts/trainer/templates/controller/rbac.yaml +++ b/charts/trainer/templates/rbac/clusterrole.yaml @@ -18,6 +18,8 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: {{ include "trainer.controller.clusterRoleName" . }} + labels: + {{- include "trainer.controller.labels" . | nindent 4 }} rules: - apiGroups: - "" @@ -92,49 +94,3 @@ rules: - get - update - patch - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "trainer.controller.clusterRoleBindingName" . }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "trainer.controller.clusterRoleName" . }} -subjects: -- kind: ServiceAccount - name: {{ include "trainer.controller.serviceAccountName" . }} - namespace: {{ .Release.Namespace }} - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "trainer.controller.roleName" . }} - namespace: {{ .Release.Namespace }} -rules: -- apiGroups: - - "" - resources: - - secrets - verbs: - - get - - list - - watch - - update - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "trainer.controller.roleBindingName" . }} - namespace: {{ .Release.Namespace }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "trainer.controller.roleName" . }} -subjects: -- kind: ServiceAccount - name: {{ include "trainer.controller.serviceAccountName" . 
}} - namespace: {{ .Release.Namespace }} diff --git a/charts/trainer/templates/rbac/clusterrolebinding.yaml b/charts/trainer/templates/rbac/clusterrolebinding.yaml new file mode 100644 index 0000000000..2531a53dbf --- /dev/null +++ b/charts/trainer/templates/rbac/clusterrolebinding.yaml @@ -0,0 +1,30 @@ +{{/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "trainer.controller.clusterRoleBindingName" . }} + labels: + {{- include "trainer.controller.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "trainer.controller.clusterRoleName" . }} +subjects: +- kind: ServiceAccount + name: {{ include "trainer.controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} diff --git a/charts/trainer/templates/rbac/role.yaml b/charts/trainer/templates/rbac/role.yaml new file mode 100644 index 0000000000..08cffde7a0 --- /dev/null +++ b/charts/trainer/templates/rbac/role.yaml @@ -0,0 +1,33 @@ +{{/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "trainer.controller.roleName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "trainer.controller.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch + - update diff --git a/charts/trainer/templates/rbac/rolebinding.yaml b/charts/trainer/templates/rbac/rolebinding.yaml new file mode 100644 index 0000000000..08fe158305 --- /dev/null +++ b/charts/trainer/templates/rbac/rolebinding.yaml @@ -0,0 +1,31 @@ +{{/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "trainer.controller.roleBindingName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "trainer.controller.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "trainer.controller.roleName" . 
}} +subjects: +- kind: ServiceAccount + name: {{ include "trainer.controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} diff --git a/charts/trainer/templates/controller/serviceaccount.yaml b/charts/trainer/templates/rbac/serviceaccount.yaml similarity index 100% rename from charts/trainer/templates/controller/serviceaccount.yaml rename to charts/trainer/templates/rbac/serviceaccount.yaml diff --git a/charts/trainer/templates/webhook/_helpers.tpl b/charts/trainer/templates/webhook/_helpers.tpl new file mode 100644 index 0000000000..3da1f32306 --- /dev/null +++ b/charts/trainer/templates/webhook/_helpers.tpl @@ -0,0 +1,45 @@ +{{/* +Copyright 2024 The Kubeflow authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{/* +Create the name of the webhook. +*/}} +{{- define "trainer.webhook.name" -}} +{{ include "trainer.name" . }}-webhook +{{- end -}} + +{{/* +Common labels for the webhook. +*/}} +{{- define "trainer.webhook.labels" -}} +{{ include "trainer.labels" . }} +app.kubernetes.io/part-of: kubeflow +app.kubernetes.io/component: webhook +{{- end -}} + +{{/* +Create the name of the webhook secret. +*/}} +{{- define "trainer.webhook.secretName" -}} +{{ include "trainer.webhook.name" . }}-cert +{{- end -}} + +{{/* +Create the name of the validating webhook configuration. +*/}} +{{- define "trainer.validatingWebhookConfigurationName" -}} +{{ include "trainer.name" . 
}}-validating-webhook +{{- end -}} diff --git a/charts/trainer/templates/controller/secret.yaml b/charts/trainer/templates/webhook/secret.yaml similarity index 92% rename from charts/trainer/templates/controller/secret.yaml rename to charts/trainer/templates/webhook/secret.yaml index e6cc52b379..8f2d8a55b5 100644 --- a/charts/trainer/templates/controller/secret.yaml +++ b/charts/trainer/templates/webhook/secret.yaml @@ -20,6 +20,8 @@ kind: Secret metadata: name: {{ include "trainer.webhook.secretName" . }} namespace: {{ .Release.Namespace }} + labels: + {{- include "trainer.webhook.labels" . | nindent 4 }} data: ca.crt: "" ca.key: "" diff --git a/charts/trainer/templates/controller/validatingwebhookconfiguration.yaml b/charts/trainer/templates/webhook/validatingwebhookconfiguration.yaml similarity index 97% rename from charts/trainer/templates/controller/validatingwebhookconfiguration.yaml rename to charts/trainer/templates/webhook/validatingwebhookconfiguration.yaml index adf4b33ca6..bd691736c9 100644 --- a/charts/trainer/templates/controller/validatingwebhookconfiguration.yaml +++ b/charts/trainer/templates/webhook/validatingwebhookconfiguration.yaml @@ -19,6 +19,8 @@ apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: name: {{ include "trainer.validatingWebhookConfigurationName" . }} + labels: + {{- include "trainer.webhook.labels" . | nindent 4 }} webhooks: - name: validator.clustertrainingruntime.trainer.kubeflow.org admissionReviewVersions: diff --git a/charts/trainer/values.yaml b/charts/trainer/values.yaml index be5486170a..be9cb2ba01 100644 --- a/charts/trainer/values.yaml +++ b/charts/trainer/values.yaml @@ -31,7 +31,7 @@ image: # -- Image registry. registry: docker.io # -- Image repository. - repository: kubeflow/trainer-controller-controller + repository: kubeflow/trainer-controller-manager # -- Image tag. # @default -- If not set, the chart appVersion will be used. 
tag: latest diff --git a/hack/sync-manifests.sh b/hack/sync-manifests.sh new file mode 100755 index 0000000000..931762b8ce --- /dev/null +++ b/hack/sync-manifests.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script automatically syncs the Kustomize manifests from the manifests templated by the trainer Helm chart. +# Run 'make sync-manifests' from the root directory of this repository. + +echo "Syncing Kustomize manifests from manifests templated by trainer Helm chart..." + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +# Helm chart release name and namespace. +TRAINER_CHART_DIR=charts/trainer +RELEASE_NAME=kubeflow-trainer +RELEASE_NAMESPACE=kubeflow-system + +# Source directories +SRC_CRD_DIR=manifests/base/crds + +# Destination directories +DST_CRD_DIR=charts/trainer/crds +DST_RBAC_DIR=manifests/base/rbac +DST_CONTROLLER_DIR=manifests/base/controller +DST_WEBHOOK_DIR=manifests/base/webhook +DST_RUNTIMES_DIR=manifests/base/runtimes + +# Temporary file that receives the rendered chart; cleanup() removes it when the script exits. +# The template ends in XXXXXX because GNU mktemp rejects a template without X placeholders. +MANIFESTS_FILE=$(mktemp -t kubeflow-trainer-manifests.XXXXXX) +FIND_EXCLUDE_ARGS="-not -name kustomization.yaml -not -name kustomization.yml" +# yq expression that marks a resource as managed by Kustomize and drops the Helm chart label. +YQ_ARGS='.metadata.labels."app.kubernetes.io/managed-by" = "Kustomize" | del(.metadata.labels."helm.sh/chart")' + +setup() { + # yq is required to parse yaml files. + if ! command -v yq &>/dev/null; then + echo "'yq' is not installed, please install it first. 
Ref: https://github.com/mikefarah/yq." + exit 1 + fi + + # Create destination directory if it doesn't exist. + mkdir -p ${DST_CRD_DIR} ${DST_RBAC_DIR} ${DST_CONTROLLER_DIR} ${DST_WEBHOOK_DIR} ${DST_RUNTIMES_DIR}/pretraining + + # Render the chart once; the update_* functions below select resources from this file. + helm template ${RELEASE_NAME} ${TRAINER_CHART_DIR} --namespace ${RELEASE_NAMESPACE} > "${MANIFESTS_FILE}" +} + +update_crds() { + # Copy all CRD files to destination directory. + # shellcheck disable=SC2086 + find ${SRC_CRD_DIR} -type f -name "*.yaml" ${FIND_EXCLUDE_ARGS} -exec cp {} ${DST_CRD_DIR} \; +} + +update_rbac() { + # Write each controller RBAC resource to its own file, relabelled via YQ_ARGS. + yq -e "select(.kind == \"ServiceAccount\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_RBAC_DIR}/serviceaccount.yaml + yq -e "select(.kind == \"ClusterRole\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_RBAC_DIR}/clusterrole.yaml + yq -e "select(.kind == \"ClusterRoleBinding\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_RBAC_DIR}/clusterrolebinding.yaml + yq -e "select(.kind == \"Role\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_RBAC_DIR}/role.yaml + yq -e "select(.kind == \"RoleBinding\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_RBAC_DIR}/rolebinding.yaml +} + +update_controller() { + # Write the controller Deployment and Service, relabelled via YQ_ARGS. select() must be closed before the pipe so the relabelling applies to the selected resource. + yq -e "select(.kind == \"Deployment\" and .metadata.name == \"kubeflow-trainer-controller\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_CONTROLLER_DIR}/deployment.yaml + yq -e "select(.kind == \"Service\" and .metadata.name == \"kubeflow-trainer-controller-service\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_CONTROLLER_DIR}/service.yaml +} + +update_webhook() { + # Write the webhook Secret and ValidatingWebhookConfiguration, relabelled via YQ_ARGS. + yq -e "select(.kind == \"Secret\" and .metadata.name == \"kubeflow-trainer-webhook-cert\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_WEBHOOK_DIR}/secret.yaml + yq -e "select(.kind == \"ValidatingWebhookConfiguration\" 
and .metadata.name == \"kubeflow-trainer-validating-webhook\") | ${YQ_ARGS}" "${MANIFESTS_FILE}" > ${DST_WEBHOOK_DIR}/validatingwebhookconfiguration.yaml +} + +update_runtimes() { + # Write the torch-distributed ClusterTrainingRuntime as rendered; YQ_ARGS is not applied here. + yq -e "select(.kind == \"ClusterTrainingRuntime\" and .metadata.name == \"torch-distributed\")" "${MANIFESTS_FILE}" > ${DST_RUNTIMES_DIR}/pretraining/torch-distributed.yaml +} + +cleanup() { + # Remove the temporary rendered-manifests file. + rm -f "${MANIFESTS_FILE}" +} + +# Run cleanup on every exit path so a step failing under errexit does not leak the temporary file. +trap cleanup EXIT +setup +update_crds +update_rbac +update_controller +update_webhook +update_runtimes diff --git a/manifests/base/controller/deployment.yaml b/manifests/base/controller/deployment.yaml new file mode 100644 index 0000000000..a1daeed0cd --- /dev/null +++ b/manifests/base/controller/deployment.yaml @@ -0,0 +1,69 @@ +# Source: kubeflow-trainer/templates/controller/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kubeflow-trainer-controller + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller + template: + metadata: + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller + spec: + containers: + - name: controller + image: docker.io/kubeflow/trainer-controller-manager:latest + imagePullPolicy: IfNotPresent + command: + - /manager + args: + - --metrics-bind-address=0 + - --metrics-secure=true + - --health-probe-bind-address=:8081 + - --enable-http2=false + - --webhook-server-port=9443 + - --webhook-service-name=kubeflow-trainer-controller-service + - 
--webhook-secret-name=kubeflow-trainer-webhook-cert + - --leader-elect=false + volumeMounts: + - name: webhook-cert + mountPath: /tmp/k8s-webhook-server/serving-certs + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 3 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 10 + periodSeconds: 15 + timeoutSeconds: 3 + volumes: + - name: webhook-cert + secret: + secretName: kubeflow-trainer-webhook-cert + defaultMode: 420 + serviceAccountName: kubeflow-trainer-controller + automountServiceAccountToken: true diff --git a/manifests/base/controller/kustomization.yaml b/manifests/base/controller/kustomization.yaml new file mode 100644 index 0000000000..93962466ae --- /dev/null +++ b/manifests/base/controller/kustomization.yaml @@ -0,0 +1,19 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +resources: + - deployment.yaml + - service.yaml diff --git a/manifests/base/controller/service.yaml b/manifests/base/controller/service.yaml new file mode 100644 index 0000000000..e1e68fb628 --- /dev/null +++ b/manifests/base/controller/service.yaml @@ -0,0 +1,27 @@ +# Source: kubeflow-trainer/templates/controller/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: kubeflow-trainer-controller-service + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +spec: + ports: + - name: monitoring-port + port: 8080 + targetPort: 8080 + - name: webhook-server + port: 443 + protocol: TCP + targetPort: 9443 + selector: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller diff --git a/manifests/base/manager/kustomization.yaml b/manifests/base/manager/kustomization.yaml deleted file mode 100644 index 7394a6d059..0000000000 --- a/manifests/base/manager/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: - - manager.yaml diff --git a/manifests/base/manager/manager.yaml b/manifests/base/manager/manager.yaml deleted file mode 100644 index c6005ab153..0000000000 --- a/manifests/base/manager/manager.yaml +++ /dev/null @@ -1,65 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kubeflow-trainer-controller-manager - labels: - app.kubernetes.io/name: trainer - app.kubernetes.io/component: manager - app.kubernetes.io/part-of: kubeflow -spec: - selector: - matchLabels: - app.kubernetes.io/name: trainer - app.kubernetes.io/component: manager - app.kubernetes.io/part-of: kubeflow - template: - metadata: - labels: - app.kubernetes.io/name: trainer - 
app.kubernetes.io/component: manager - app.kubernetes.io/part-of: kubeflow - spec: - containers: - - name: manager - image: kubeflow/trainer-controller-manager - volumeMounts: - - mountPath: /tmp/k8s-webhook-server/serving-certs - name: cert - readOnly: true - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - timeoutSeconds: 3 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 10 - periodSeconds: 15 - timeoutSeconds: 3 - serviceAccountName: kubeflow-trainer-controller-manager - volumes: - - name: cert - secret: - defaultMode: 420 - secretName: kubeflow-trainer-webhook-cert ---- -apiVersion: v1 -kind: Service -metadata: - name: kubeflow-trainer-controller-manager -spec: - ports: - - name: monitoring-port - port: 8080 - targetPort: 8080 - - name: webhook-server - port: 443 - protocol: TCP - targetPort: 9443 - selector: - app.kubernetes.io/component: manager diff --git a/manifests/base/rbac/clusterrole.yaml b/manifests/base/rbac/clusterrole.yaml new file mode 100644 index 0000000000..d63241b2a7 --- /dev/null +++ b/manifests/base/rbac/clusterrole.yaml @@ -0,0 +1,86 @@ +# Source: kubeflow-trainer/templates/rbac/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kubeflow-trainer-controller + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +rules: + - apiGroups: + - "" + resources: + - limitranges + verbs: + - get + - list + - watch + - apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + verbs: + - get + - list + - watch + - update + - apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: 
+ - get + - list + - watch + - create + - apiGroups: + - scheduling.x-k8s.io + resources: + - podgroups + verbs: + - get + - list + - watch + - create + - apiGroups: + - trainer.kubeflow.org + resources: + - clustertrainingruntimes + - trainingruntimes + verbs: + - get + - list + - watch + - apiGroups: + - trainer.kubeflow.org + resources: + - trainjobs + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - trainer.kubeflow.org + resources: + - trainjobs/finalizers + - trainjobs/status + verbs: + - get + - update + - patch diff --git a/manifests/base/rbac/clusterrolebinding.yaml b/manifests/base/rbac/clusterrolebinding.yaml new file mode 100644 index 0000000000..de324c290a --- /dev/null +++ b/manifests/base/rbac/clusterrolebinding.yaml @@ -0,0 +1,20 @@ +# Source: kubeflow-trainer/templates/rbac/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kubeflow-trainer-controller + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubeflow-trainer-controller +subjects: + - kind: ServiceAccount + name: kubeflow-trainer-controller + namespace: kubeflow-system diff --git a/manifests/base/rbac/kustomization.yaml b/manifests/base/rbac/kustomization.yaml index 25a37bf74f..031699f85c 100644 --- a/manifests/base/rbac/kustomization.yaml +++ b/manifests/base/rbac/kustomization.yaml @@ -1,4 +1,6 @@ resources: + - serviceaccount.yaml + - clusterrole.yaml + - clusterrolebinding.yaml - role.yaml - - role_binding.yaml - - service_account.yaml + - rolebinding.yaml diff --git a/manifests/base/rbac/role.yaml b/manifests/base/rbac/role.yaml index 77c8151eaa..6716c2f6c5 100644 --- a/manifests/base/rbac/role.yaml 
+++ b/manifests/base/rbac/role.yaml @@ -1,72 +1,23 @@ ---- +# Source: kubeflow-trainer/templates/rbac/role.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: kubeflow-trainer-controller-manager + name: kubeflow-trainer-controller + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller rules: -- apiGroups: - - "" - resources: - - secrets - verbs: - - get - - list - - update - - watch -- apiGroups: - - admissionregistration.k8s.io - resources: - - validatingwebhookconfigurations - verbs: - - get - - list - - update - - watch -- apiGroups: - - jobset.x-k8s.io - resources: - - jobsets - verbs: - - create - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - get - - list - - watch -- apiGroups: - - trainer.kubeflow.org - resources: - - clustertrainingruntimes - - trainingruntimes - verbs: - - get - - list - - watch -- apiGroups: - - trainer.kubeflow.org - resources: - - trainjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - trainer.kubeflow.org - resources: - - trainjobs/finalizers - - trainjobs/status - verbs: - - get - - patch - - update + - apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch + - update diff --git a/manifests/base/rbac/role_binding.yaml b/manifests/base/rbac/role_binding.yaml deleted file mode 100644 index e0b9d02ec0..0000000000 --- a/manifests/base/rbac/role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kubeflow-trainer-controller-manager -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kubeflow-trainer-controller-manager 
-subjects: - - kind: ServiceAccount - name: kubeflow-trainer-controller-manager diff --git a/manifests/base/rbac/rolebinding.yaml b/manifests/base/rbac/rolebinding.yaml new file mode 100644 index 0000000000..9c38c4a849 --- /dev/null +++ b/manifests/base/rbac/rolebinding.yaml @@ -0,0 +1,21 @@ +# Source: kubeflow-trainer/templates/rbac/rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: kubeflow-trainer-controller + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kubeflow-trainer-controller +subjects: + - kind: ServiceAccount + name: kubeflow-trainer-controller + namespace: kubeflow-system diff --git a/manifests/base/rbac/service_account.yaml b/manifests/base/rbac/service_account.yaml deleted file mode 100644 index 977f9c852d..0000000000 --- a/manifests/base/rbac/service_account.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kubeflow-trainer-controller-manager diff --git a/manifests/base/rbac/serviceaccount.yaml b/manifests/base/rbac/serviceaccount.yaml new file mode 100644 index 0000000000..2f911df897 --- /dev/null +++ b/manifests/base/rbac/serviceaccount.yaml @@ -0,0 +1,15 @@ +--- +# Source: kubeflow-trainer/templates/rbac/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kubeflow-trainer-controller + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: controller +automountServiceAccountToken: true diff --git 
a/manifests/base/runtimes/pretraining/kustomization.yaml b/manifests/base/runtimes/pretraining/kustomization.yaml index 6facf87216..26e68e0f69 100644 --- a/manifests/base/runtimes/pretraining/kustomization.yaml +++ b/manifests/base/runtimes/pretraining/kustomization.yaml @@ -1,4 +1,20 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - torch_distributed.yaml + - torch-distributed.yaml diff --git a/manifests/base/runtimes/pretraining/torch_distributed.yaml b/manifests/base/runtimes/pretraining/torch-distributed.yaml similarity index 83% rename from manifests/base/runtimes/pretraining/torch_distributed.yaml rename to manifests/base/runtimes/pretraining/torch-distributed.yaml index 9b678ea33b..5add55072a 100644 --- a/manifests/base/runtimes/pretraining/torch_distributed.yaml +++ b/manifests/base/runtimes/pretraining/torch-distributed.yaml @@ -1,3 +1,4 @@ +# Source: kubeflow-trainer/templates/runtimes/pretraining/torch-distributed.yaml apiVersion: trainer.kubeflow.org/v1alpha1 kind: ClusterTrainingRuntime metadata: @@ -19,7 +20,7 @@ spec: spec: containers: - name: trainer - image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime + image: docker.io/pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime command: - /bin/bash - -c diff --git a/manifests/base/webhook/kustomization.yaml b/manifests/base/webhook/kustomization.yaml index 5723808d02..f25da456e0 100644 
--- a/manifests/base/webhook/kustomization.yaml +++ b/manifests/base/webhook/kustomization.yaml @@ -1,12 +1,23 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - manifests.yaml -patches: - - path: patch.yaml - target: - group: admissionregistration.k8s.io - version: v1 - kind: ValidatingWebhookConfiguration + - secret.yaml + - validatingwebhookconfiguration.yaml configurations: - kustomizeconfig.yaml diff --git a/manifests/base/webhook/kustomizeconfig.yaml b/manifests/base/webhook/kustomizeconfig.yaml index 8b55ef316b..fe930e3b4d 100644 --- a/manifests/base/webhook/kustomizeconfig.yaml +++ b/manifests/base/webhook/kustomizeconfig.yaml @@ -1,3 +1,19 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # the following config is for teaching kustomize where to look at when substituting vars. 
# It requires kustomize v2.1.0 or newer to work properly. namespace: diff --git a/manifests/base/webhook/manifests.yaml b/manifests/base/webhook/manifests.yaml deleted file mode 100644 index 721773af05..0000000000 --- a/manifests/base/webhook/manifests.yaml +++ /dev/null @@ -1,66 +0,0 @@ ---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: validating-webhook-configuration -webhooks: -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: webhook-service - namespace: system - path: /validate-trainer-kubeflow-org-v1alpha1-clustertrainingruntime - failurePolicy: Fail - name: validator.clustertrainingruntime.trainer.kubeflow.org - rules: - - apiGroups: - - trainer.kubeflow.org - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - clustertrainingruntimes - sideEffects: None -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: webhook-service - namespace: system - path: /validate-trainer-kubeflow-org-v1alpha1-trainingruntime - failurePolicy: Fail - name: validator.trainingruntime.trainer.kubeflow.org - rules: - - apiGroups: - - trainer.kubeflow.org - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - trainingruntimes - sideEffects: None -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: webhook-service - namespace: system - path: /validate-trainer-kubeflow-org-v1alpha1-trainjob - failurePolicy: Fail - name: validator.trainjob.trainer.kubeflow.org - rules: - - apiGroups: - - trainer.kubeflow.org - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - trainjobs - sideEffects: None diff --git a/manifests/base/webhook/patch.yaml b/manifests/base/webhook/patch.yaml deleted file mode 100644 index 831dcc988f..0000000000 --- a/manifests/base/webhook/patch.yaml +++ /dev/null @@ -1,12 +0,0 @@ -- op: replace - path: /webhooks/0/clientConfig/service/name - value: kubeflow-trainer-controller-manager -- 
op: replace - path: /webhooks/1/clientConfig/service/name - value: kubeflow-trainer-controller-manager -- op: replace - path: /webhooks/2/clientConfig/service/name - value: kubeflow-trainer-controller-manager -- op: replace - path: /metadata/name - value: validator.trainer.kubeflow.org diff --git a/manifests/base/webhook/secret.yaml b/manifests/base/webhook/secret.yaml new file mode 100644 index 0000000000..5e84d30417 --- /dev/null +++ b/manifests/base/webhook/secret.yaml @@ -0,0 +1,18 @@ +# Source: kubeflow-trainer/templates/webhook/secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: kubeflow-trainer-webhook-cert + namespace: kubeflow-system + labels: + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Kustomize + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: webhook +data: + ca.crt: "" + ca.key: "" + tls.crt: "" + tls.key: "" diff --git a/manifests/base/webhook/validatingwebhookconfiguration.yaml b/manifests/base/webhook/validatingwebhookconfiguration.yaml new file mode 100644 index 0000000000..d6d5857b61 --- /dev/null +++ b/manifests/base/webhook/validatingwebhookconfiguration.yaml @@ -0,0 +1,74 @@ +# Source: kubeflow-trainer/templates/webhook/validatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: kubeflow-trainer-validating-webhook + labels: + helm.sh/chart: kubeflow-trainer-2.0.0 + app.kubernetes.io/name: kubeflow-trainer + app.kubernetes.io/instance: kubeflow-trainer + app.kubernetes.io/version: "2.0.0" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubeflow + app.kubernetes.io/component: webhook +webhooks: + - name: validator.clustertrainingruntime.trainer.kubeflow.org + admissionReviewVersions: + - v1 + clientConfig: + service: + name: kubeflow-trainer-controller-service + namespace: kubeflow-system + path: 
/validate-trainer-kubeflow-org-v1alpha1-clustertrainingruntime + sideEffects: None + failurePolicy: Fail + rules: + - apiGroups: + - trainer.kubeflow.org + apiVersions: + - v1alpha1 + resources: + - clustertrainingruntimes + operations: + - CREATE + - UPDATE + - name: validator.trainingruntime.trainer.kubeflow.org + admissionReviewVersions: + - v1 + clientConfig: + service: + name: kubeflow-trainer-controller-service + namespace: kubeflow-system + path: /validate-trainer-kubeflow-org-v1alpha1-trainingruntime + sideEffects: None + failurePolicy: Fail + rules: + - apiGroups: + - trainer.kubeflow.org + apiVersions: + - v1alpha1 + resources: + - trainingruntimes + operations: + - CREATE + - UPDATE + - name: validator.trainjob.trainer.kubeflow.org + admissionReviewVersions: + - v1 + clientConfig: + service: + name: kubeflow-trainer-controller-service + namespace: kubeflow-system + path: /validate-trainer-kubeflow-org-v1alpha1-trainjob + sideEffects: None + failurePolicy: Fail + rules: + - apiGroups: + - trainer.kubeflow.org + apiVersions: + - v1alpha1 + resources: + - trainjobs + operations: + - CREATE + - UPDATE diff --git a/manifests/overlays/manager/kustomization.yaml b/manifests/overlays/manager/kustomization.yaml index 88ee213b2a..620fb54418 100644 --- a/manifests/overlays/manager/kustomization.yaml +++ b/manifests/overlays/manager/kustomization.yaml @@ -7,19 +7,7 @@ namespace: kubeflow-system resources: - namespace.yaml - ../../base/crds - - ../../base/manager - ../../base/rbac + - ../../base/controller - ../../base/webhook - ../../third-party/jobset # Comment this line if JobSet is installed on the Kubernetes cluster. - -# Update the Kubeflow Trainer controller manager image tag. -images: - - name: kubeflow/trainer-controller-manager - newTag: latest - -# Secret for the Kubeflow Training webhook. -secretGenerator: - - name: kubeflow-trainer-webhook-cert - namespace: kubeflow-system - options: - disableNameSuffixHash: true