From 4421847c788a8a9526dd22ed3b8317f5f25b45d9 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 12:20:06 +0100 Subject: [PATCH 01/11] Initial support for MPICH --- build/base/mpich-builder.Dockerfile | 7 +++ build/base/mpich-entrypoint.sh | 31 +++++++++++ build/base/mpich.Dockerfile | 8 +++ deploy/v2beta1/mpi-operator.yaml | 1 + examples/v2beta1/pi/mpich.Dockerfile | 8 +++ examples/v2beta1/pi/pi-mpich.yaml | 54 +++++++++++++++++++ manifests/base/crd.yaml | 2 +- sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md | 2 +- .../mpijob/models/v2beta1_mpi_job_spec.py | 4 +- v2/crd/kubeflow.org_mpijobs.yaml | 3 +- .../kubeflow/v2beta1/openapi_generated.go | 2 +- v2/pkg/apis/kubeflow/v2beta1/swagger.json | 2 +- v2/pkg/apis/kubeflow/v2beta1/types.go | 5 +- v2/pkg/apis/kubeflow/validation/validation.go | 3 +- v2/pkg/controller/mpi_job_controller.go | 22 +++++++- v2/pkg/controller/mpi_job_controller_test.go | 5 +- 16 files changed, 145 insertions(+), 14 deletions(-) create mode 100644 build/base/mpich-builder.Dockerfile create mode 100755 build/base/mpich-entrypoint.sh create mode 100644 build/base/mpich.Dockerfile create mode 100644 examples/v2beta1/pi/mpich.Dockerfile create mode 100644 examples/v2beta1/pi/pi-mpich.yaml diff --git a/build/base/mpich-builder.Dockerfile b/build/base/mpich-builder.Dockerfile new file mode 100644 index 000000000..95701b55e --- /dev/null +++ b/build/base/mpich-builder.Dockerfile @@ -0,0 +1,7 @@ +FROM debian:buster as builder + +RUN apt update \ + && apt install -y --no-install-recommends \ + g++ \ + libmpich-dev \ + && rm -rf /var/lib/apt/lists/* diff --git a/build/base/mpich-entrypoint.sh b/build/base/mpich-entrypoint.sh new file mode 100755 index 000000000..d62be11a5 --- /dev/null +++ b/build/base/mpich-entrypoint.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +function resolve_host() { + host="$1" + check="nslookup $host" + max_retry=10 + counter=0 + backoff=0.1 + until $check > /dev/null + do + if [ $counter -eq $max_retry ]; then + 
echo "Couldn't resolve $host" + return + fi + sleep $backoff + echo "Couldn't resolve $host... Retrying" + ((counter++)) + backoff=$(echo - | awk "{print $backoff + $backoff}") + done + echo "Resolved $host" +} + +if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then + resolve_host "$HOSTNAME" + cat /etc/mpi/hostfile | while read host + do + resolve_host $host + done +fi + +exec "$@" \ No newline at end of file diff --git a/build/base/mpich.Dockerfile b/build/base/mpich.Dockerfile new file mode 100644 index 000000000..9189e0bf2 --- /dev/null +++ b/build/base/mpich.Dockerfile @@ -0,0 +1,8 @@ +FROM mpich-base + +RUN apt update \ + && apt install -y --no-install-recommends mpich dnsutils \ + && rm -rf /var/lib/apt/lists/* + +COPY mpich-entrypoint.sh /entrypoint.sh +ENTRYPOINT ["/entrypoint.sh"] diff --git a/deploy/v2beta1/mpi-operator.yaml b/deploy/v2beta1/mpi-operator.yaml index 5513136b8..a155afcff 100644 --- a/deploy/v2beta1/mpi-operator.yaml +++ b/deploy/v2beta1/mpi-operator.yaml @@ -121,6 +121,7 @@ spec: enum: - OpenMPI - Intel + - MPICH type: string mpiReplicaSpecs: properties: diff --git a/examples/v2beta1/pi/mpich.Dockerfile b/examples/v2beta1/pi/mpich.Dockerfile new file mode 100644 index 000000000..214a333f8 --- /dev/null +++ b/examples/v2beta1/pi/mpich.Dockerfile @@ -0,0 +1,8 @@ +FROM mpioperator/mpich-builder as builder + +COPY pi.cc /src/pi.cc +RUN mpic++ /src/pi.cc -o /pi + +FROM mpioperator/mpich + +COPY --from=builder /pi /home/mpiuser/pi \ No newline at end of file diff --git a/examples/v2beta1/pi/pi-mpich.yaml b/examples/v2beta1/pi/pi-mpich.yaml new file mode 100644 index 000000000..7c4a70cb7 --- /dev/null +++ b/examples/v2beta1/pi/pi-mpich.yaml @@ -0,0 +1,54 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: pi +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + sshAuthMountPath: /home/mpiuser/.ssh + mpiImplementation: MPICH + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: 
mpioperator/mpi-pi:mpich + imagePullPolicy: Always + name: mpi-launcher + securityContext: + runAsUser: 1000 + args: + - mpirun + - -n + - "2" + - /home/mpiuser/pi + resources: + limits: + cpu: 1 + memory: 1Gi + Worker: + replicas: 2 + template: + spec: + containers: + - image: mpioperator/mpi-pi:mpich + imagePullPolicy: Always + name: mpi-worker + securityContext: + runAsUser: 1000 + command: + args: + - /usr/sbin/sshd + - -De + - -f + - /home/mpiuser/.sshd_config + readinessProbe: + tcpSocket: + port: 2222 + initialDelaySeconds: 2 + resources: + limits: + cpu: 1 + memory: 1Gi diff --git a/manifests/base/crd.yaml b/manifests/base/crd.yaml index 3790b12c8..4652484de 100644 --- a/manifests/base/crd.yaml +++ b/manifests/base/crd.yaml @@ -127,7 +127,7 @@ spec: type: string mpiImplementation: type: string - enum: ["OpenMPI", "Intel"] + enum: ["OpenMPI", "Intel", "MPICH"] mpiReplicaSpecs: type: object properties: diff --git a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md index fc32dade9..490c67c18 100644 --- a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md +++ b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md @@ -4,7 +4,7 @@ ## Properties Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- -**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". | [optional] +**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". | [optional] **mpi_replica_specs** | [**dict(str, V1ReplicaSpec)**](V1ReplicaSpec.md) | MPIReplicaSpecs contains maps from `MPIReplicaType` to `ReplicaSpec` that specify the MPI replicas to run. | **run_policy** | [**V1RunPolicy**](V1RunPolicy.md) | | [optional] **slots_per_worker** | **int** | Specifies the number of slots per worker used in hostfile. Defaults to 1. 
| [optional] diff --git a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py index 7a0527a5f..37af94b95 100644 --- a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py +++ b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py @@ -75,7 +75,7 @@ def __init__(self, mpi_implementation=None, mpi_replica_specs=None, run_policy=N def mpi_implementation(self): """Gets the mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501 - MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501 + MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501 :return: The mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501 :rtype: str @@ -86,7 +86,7 @@ def mpi_implementation(self): def mpi_implementation(self, mpi_implementation): """Sets the mpi_implementation of this V2beta1MPIJobSpec. - MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501 + MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501 :param mpi_implementation: The mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501 :type mpi_implementation: str diff --git a/v2/crd/kubeflow.org_mpijobs.yaml b/v2/crd/kubeflow.org_mpijobs.yaml index d4f5de762..56d03ec7b 100644 --- a/v2/crd/kubeflow.org_mpijobs.yaml +++ b/v2/crd/kubeflow.org_mpijobs.yaml @@ -37,10 +37,11 @@ spec: mpiImplementation: default: OpenMPI description: MPIImplementation is the MPI implementation. Options - are "OpenMPI" (default) and "Intel". + are "OpenMPI" (default), "Intel" and "MPICH".
enum: - OpenMPI - Intel + - MPICH type: string mpiReplicaSpecs: additionalProperties: diff --git a/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go b/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go index 125135b80..6f229bb23 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go +++ b/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go @@ -451,7 +451,7 @@ func schema_pkg_apis_kubeflow_v2beta1_MPIJobSpec(ref common.ReferenceCallback) c }, "mpiImplementation": { SchemaProps: spec.SchemaProps{ - Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".", + Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".", Type: []string{"string"}, Format: "", }, diff --git a/v2/pkg/apis/kubeflow/v2beta1/swagger.json b/v2/pkg/apis/kubeflow/v2beta1/swagger.json index 52fb45a24..943edbc4e 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/swagger.json +++ b/v2/pkg/apis/kubeflow/v2beta1/swagger.json @@ -223,7 +223,7 @@ ], "properties": { "mpiImplementation": { - "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".", + "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".", "type": "string" }, "mpiReplicaSpecs": { diff --git a/v2/pkg/apis/kubeflow/v2beta1/types.go b/v2/pkg/apis/kubeflow/v2beta1/types.go index 4ad08389e..d5a68b221 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/types.go +++ b/v2/pkg/apis/kubeflow/v2beta1/types.go @@ -56,8 +56,8 @@ type MPIJobSpec struct { SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"` // MPIImplementation is the MPI implementation. - // Options are "OpenMPI" (default) and "Intel". - // +kubebuilder:validation:Enum:=OpenMPI;Intel + // Options are "OpenMPI" (default), "Intel" and "MPICH".
+ // +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH // +kubebuilder:default:=OpenMPI MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"` } @@ -78,4 +78,5 @@ type MPIImplementation string const ( MPIImplementationOpenMPI MPIImplementation = "OpenMPI" MPIImplementationIntel MPIImplementation = "Intel" + MPIImplementationMPICH MPIImplementation = "MPICH" ) diff --git a/v2/pkg/apis/kubeflow/validation/validation.go b/v2/pkg/apis/kubeflow/validation/validation.go index 3e98b8187..21a7f9577 100644 --- a/v2/pkg/apis/kubeflow/validation/validation.go +++ b/v2/pkg/apis/kubeflow/validation/validation.go @@ -35,7 +35,8 @@ var ( validMPIImplementations = sets.NewString( string(kubeflow.MPIImplementationOpenMPI), - string(kubeflow.MPIImplementationIntel)) + string(kubeflow.MPIImplementationIntel), + string(kubeflow.MPIImplementationMPICH)) validRestartPolicies = sets.NewString( string(common.RestartPolicyNever), diff --git a/v2/pkg/controller/mpi_job_controller.go b/v2/pkg/controller/mpi_job_controller.go index 57881227f..d0f9f40e6 100644 --- a/v2/pkg/controller/mpi_job_controller.go +++ b/v2/pkg/controller/mpi_job_controller.go @@ -115,6 +115,7 @@ const ( openMPISlotsEnv = "OMPI_MCA_orte_set_default_slots" intelMPISlotsEnv = "I_MPI_PERHOST" + mpichSlotsEnv = "TODO" ) var ( @@ -199,6 +200,16 @@ var ( Value: "-o ConnectionAttempts=10", }, } + mpichEnvVars = []corev1.EnvVar{ + { + Name: "HYDRA_HOST_FILE", + Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName), + }, + { + Name: "HYDRA_LAUNCH_EXTRA_ARGS", + Value: "-o ConnectionAttempts=10", + }, + } nvidiaDisableEnvVars = []corev1.EnvVar{ {Name: "NVIDIA_VISIBLE_DEVICES"}, {Name: "NVIDIA_DRIVER_CAPABILITIES"}, @@ -560,8 +571,9 @@ func (c *MPIJobController) syncHandler(key string) error { if err != nil { return err } - if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel { - // The Intel implementation requires workers to communicate with the + if mpiJob.Spec.MPIImplementation == 
kubeflow.MPIImplementationIntel || + mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH { + // The Intel and MPICH implementations require workers to communicate with the // launcher through its hostname. For that, we create a Service which // has the same name as the launcher's hostname. _, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob)) @@ -1374,6 +1386,12 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev Name: intelMPISlotsEnv, Value: slotsStr, }) + case kubeflow.MPIImplementationMPICH: + container.Env = append(container.Env, mpichEnvVars...) + container.Env = append(container.Env, corev1.EnvVar{ + Name: mpichSlotsEnv, + Value: slotsStr, + }) } container.Env = append(container.Env, diff --git a/v2/pkg/controller/mpi_job_controller_test.go b/v2/pkg/controller/mpi_job_controller_test.go index 37d224d54..fc5df59e2 100644 --- a/v2/pkg/controller/mpi_job_controller_test.go +++ b/v2/pkg/controller/mpi_job_controller_test.go @@ -457,7 +457,7 @@ func TestDoNothingWithInvalidMPIJob(t *testing.T) { } func TestAllResourcesCreated(t *testing.T) { - impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel} + impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH} for _, implementation := range impls { t.Run(string(implementation), func(t *testing.T) { f := newFixture(t) @@ -481,7 +481,8 @@ func TestAllResourcesCreated(t *testing.T) { for i := 0; i < 5; i++ { f.expectCreatePodAction(fmjc.newWorker(mpiJobCopy, i)) } - if implementation == kubeflow.MPIImplementationIntel { + if implementation == kubeflow.MPIImplementationIntel || + implementation == kubeflow.MPIImplementationMPICH { f.expectCreateServiceAction(newLauncherService(mpiJobCopy)) } f.expectCreateJobAction(fmjc.newLauncherJob(mpiJobCopy)) From 8e42d242d45e5499d2e975feb9cec8a7dd728745 Mon Sep 17 00:00:00 2001 From: Mateusz 
Kubica Date: Tue, 11 Oct 2022 12:28:10 +0100 Subject: [PATCH 02/11] Correct the source image used by MPICH docker image --- build/base/mpich.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/base/mpich.Dockerfile b/build/base/mpich.Dockerfile index 9189e0bf2..a9c843eeb 100644 --- a/build/base/mpich.Dockerfile +++ b/build/base/mpich.Dockerfile @@ -1,4 +1,4 @@ -FROM mpich-base +FROM mpioperator/base RUN apt update \ && apt install -y --no-install-recommends mpich dnsutils \ From 41ed59d7b0fe35c3fd45b5a4a7831f6d7d1ff403 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 21:13:25 +0100 Subject: [PATCH 03/11] Improve docs --- README.md | 8 +++++++- examples/v2beta1/pi/README.md | 8 +++++++- proposals/scalable-robust-operator.md | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 147fb1393..de0bb5b3d 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,13 @@ total images/sec: 308.27 For a sample that uses Intel MPI, see: ```bash -cat examples/pi/pi-intel.yaml +cat examples/v2beta1/pi/pi-intel.yaml +``` + +For a sample that uses MPICH, see: + +```bash +cat examples/v2beta1/pi/pi-mpich.yaml ``` ## Exposed Metrics diff --git a/examples/v2beta1/pi/README.md b/examples/v2beta1/pi/README.md index f3e3aabbc..c829c15bf 100644 --- a/examples/v2beta1/pi/README.md +++ b/examples/v2beta1/pi/README.md @@ -19,9 +19,15 @@ For Intel MPI: docker build -t mpi-pi . -f intel.Dockerfile ``` +For MPICH: + +```bash +docker build -t mpi-pi . -f mpich.Dockerfile +``` + ## Create MPIJob -Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the +Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the image name from your own registry. 
Then, run: diff --git a/proposals/scalable-robust-operator.md b/proposals/scalable-robust-operator.md index 01891389c..537f674e0 100644 --- a/proposals/scalable-robust-operator.md +++ b/proposals/scalable-robust-operator.md @@ -148,7 +148,7 @@ following changes: doesn’t support changes to the completions field. This can be supported starting from 1.23. In the meantime, we can replicate the behavior by creating a new Job and doing Pod adoption. - - For Intel MPI, we also need a headless Service to front the launcher, + - For Intel MPI and MPICH, we also need a headless Service to front the launcher, because workers communicate back to the launcher using its hostname. - **Revert the use of the Job API for the launcher.** - The Job controller handles retries when the launcher or any of the workers fail. From 8341e0ae4020df66f103dd26fafb825dafcc8124 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 21:13:53 +0100 Subject: [PATCH 04/11] Add mpich docker images to makefile --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index f11324c77..b3033c986 100644 --- a/Makefile +++ b/Makefile @@ -101,6 +101,9 @@ test_images: ${IMG_BUILDER} build -t mpioperator/intel build/base -f build/base/intel.Dockerfile ${IMG_BUILDER} build -t mpioperator/intel-builder build/base -f build/base/intel-builder.Dockerfile ${IMG_BUILDER} build -t mpioperator/mpi-pi:intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpich build/base -f build/base/mpich.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpich-builder build/base -f build/base/mpich-builder.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpi-pi:mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile .PHONY: tidy tidy: From 7c1ae22ee0857aabfaadbed999ac5c894082f8eb Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 21:14:38 +0100 Subject: [PATCH 05/11] Initial version for testing --- 
v2/test/e2e/e2e_suite_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/v2/test/e2e/e2e_suite_test.go b/v2/test/e2e/e2e_suite_test.go index 072de973d..a19ac1caa 100644 --- a/v2/test/e2e/e2e_suite_test.go +++ b/v2/test/e2e/e2e_suite_test.go @@ -45,6 +45,7 @@ const ( defaultKindImage = "kindest/node:v1.21.2" openMPIImage = "mpioperator/mpi-pi:openmpi" intelMPIImage = "mpioperator/mpi-pi:intel" + mpichMPIImage = "mpioperator/mpi-pi:mpich" rootPath = "../../.." kubectlPath = rootPath + "/bin/kubectl" operatorManifestsPath = rootPath + "/manifests/overlays/dev" From 4e15756fc270bdeb103d25032ae579abb3402f13 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 12:20:06 +0100 Subject: [PATCH 06/11] Initial support for MPICH --- build/base/mpich-builder.Dockerfile | 7 +++ build/base/mpich-entrypoint.sh | 31 +++++++++++ build/base/mpich.Dockerfile | 8 +++ deploy/v2beta1/mpi-operator.yaml | 1 + examples/v2beta1/pi/mpich.Dockerfile | 8 +++ examples/v2beta1/pi/pi-mpich.yaml | 54 +++++++++++++++++++ manifests/base/crd.yaml | 2 +- sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md | 2 +- .../mpijob/models/v2beta1_mpi_job_spec.py | 4 +- v2/crd/kubeflow.org_mpijobs.yaml | 3 +- .../kubeflow/v2beta1/openapi_generated.go | 2 +- v2/pkg/apis/kubeflow/v2beta1/swagger.json | 2 +- v2/pkg/apis/kubeflow/v2beta1/types.go | 5 +- v2/pkg/apis/kubeflow/validation/validation.go | 3 +- v2/pkg/controller/mpi_job_controller.go | 22 +++++++- v2/pkg/controller/mpi_job_controller_test.go | 5 +- 16 files changed, 145 insertions(+), 14 deletions(-) create mode 100644 build/base/mpich-builder.Dockerfile create mode 100755 build/base/mpich-entrypoint.sh create mode 100644 build/base/mpich.Dockerfile create mode 100644 examples/v2beta1/pi/mpich.Dockerfile create mode 100644 examples/v2beta1/pi/pi-mpich.yaml diff --git a/build/base/mpich-builder.Dockerfile b/build/base/mpich-builder.Dockerfile new file mode 100644 index 000000000..95701b55e --- /dev/null +++ 
b/build/base/mpich-builder.Dockerfile @@ -0,0 +1,7 @@ +FROM debian:buster as builder + +RUN apt update \ + && apt install -y --no-install-recommends \ + g++ \ + libmpich-dev \ + && rm -rf /var/lib/apt/lists/* diff --git a/build/base/mpich-entrypoint.sh b/build/base/mpich-entrypoint.sh new file mode 100755 index 000000000..d62be11a5 --- /dev/null +++ b/build/base/mpich-entrypoint.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +function resolve_host() { + host="$1" + check="nslookup $host" + max_retry=10 + counter=0 + backoff=0.1 + until $check > /dev/null + do + if [ $counter -eq $max_retry ]; then + echo "Couldn't resolve $host" + return + fi + sleep $backoff + echo "Couldn't resolve $host... Retrying" + ((counter++)) + backoff=$(echo - | awk "{print $backoff + $backoff}") + done + echo "Resolved $host" +} + +if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then + resolve_host "$HOSTNAME" + cat /etc/mpi/hostfile | while read host + do + resolve_host $host + done +fi + +exec "$@" \ No newline at end of file diff --git a/build/base/mpich.Dockerfile b/build/base/mpich.Dockerfile new file mode 100644 index 000000000..9189e0bf2 --- /dev/null +++ b/build/base/mpich.Dockerfile @@ -0,0 +1,8 @@ +FROM mpich-base + +RUN apt update \ + && apt install -y --no-install-recommends mpich dnsutils \ + && rm -rf /var/lib/apt/lists/* + +COPY mpich-entrypoint.sh /entrypoint.sh +ENTRYPOINT ["/entrypoint.sh"] diff --git a/deploy/v2beta1/mpi-operator.yaml b/deploy/v2beta1/mpi-operator.yaml index 5513136b8..a155afcff 100644 --- a/deploy/v2beta1/mpi-operator.yaml +++ b/deploy/v2beta1/mpi-operator.yaml @@ -121,6 +121,7 @@ spec: enum: - OpenMPI - Intel + - MPICH type: string mpiReplicaSpecs: properties: diff --git a/examples/v2beta1/pi/mpich.Dockerfile b/examples/v2beta1/pi/mpich.Dockerfile new file mode 100644 index 000000000..214a333f8 --- /dev/null +++ b/examples/v2beta1/pi/mpich.Dockerfile @@ -0,0 +1,8 @@ +FROM mpioperator/mpich-builder as builder + +COPY pi.cc /src/pi.cc +RUN mpic++ /src/pi.cc -o /pi 
+ +FROM mpioperator/mpich + +COPY --from=builder /pi /home/mpiuser/pi \ No newline at end of file diff --git a/examples/v2beta1/pi/pi-mpich.yaml b/examples/v2beta1/pi/pi-mpich.yaml new file mode 100644 index 000000000..7c4a70cb7 --- /dev/null +++ b/examples/v2beta1/pi/pi-mpich.yaml @@ -0,0 +1,54 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: pi +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + sshAuthMountPath: /home/mpiuser/.ssh + mpiImplementation: MPICH + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: mpioperator/mpi-pi:mpich + imagePullPolicy: Always + name: mpi-launcher + securityContext: + runAsUser: 1000 + args: + - mpirun + - -n + - "2" + - /home/mpiuser/pi + resources: + limits: + cpu: 1 + memory: 1Gi + Worker: + replicas: 2 + template: + spec: + containers: + - image: mpioperator/mpi-pi:mpich + imagePullPolicy: Always + name: mpi-worker + securityContext: + runAsUser: 1000 + command: + args: + - /usr/sbin/sshd + - -De + - -f + - /home/mpiuser/.sshd_config + readinessProbe: + tcpSocket: + port: 2222 + initialDelaySeconds: 2 + resources: + limits: + cpu: 1 + memory: 1Gi diff --git a/manifests/base/crd.yaml b/manifests/base/crd.yaml index 3790b12c8..4652484de 100644 --- a/manifests/base/crd.yaml +++ b/manifests/base/crd.yaml @@ -127,7 +127,7 @@ spec: type: string mpiImplementation: type: string - enum: ["OpenMPI", "Intel"] + enum: ["OpenMPI", "Intel", "MPICH"] mpiReplicaSpecs: type: object properties: diff --git a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md index fc32dade9..490c67c18 100644 --- a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md +++ b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md @@ -4,7 +4,7 @@ ## Properties Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- -**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. 
Options are \"OpenMPI\" (default) and \"Intel\". | [optional] +**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". | [optional] **mpi_replica_specs** | [**dict(str, V1ReplicaSpec)**](V1ReplicaSpec.md) | MPIReplicaSpecs contains maps from `MPIReplicaType` to `ReplicaSpec` that specify the MPI replicas to run. | **run_policy** | [**V1RunPolicy**](V1RunPolicy.md) | | [optional] **slots_per_worker** | **int** | Specifies the number of slots per worker used in hostfile. Defaults to 1. | [optional] diff --git a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py index 7a0527a5f..37af94b95 100644 --- a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py +++ b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py @@ -75,7 +75,7 @@ def __init__(self, mpi_implementation=None, mpi_replica_specs=None, run_policy=N def mpi_implementation(self): """Gets the mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501 - MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501 + MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501 :return: The mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501 :rtype: str @@ -86,7 +86,7 @@ def mpi_implementation(self): def mpi_implementation(self, mpi_implementation): """Sets the mpi_implementation of this V2beta1MPIJobSpec. - MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501 + MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501 :param mpi_implementation: The mpi_implementation of this V2beta1MPIJobSpec.
# noqa: E501 :type mpi_implementation: str diff --git a/v2/crd/kubeflow.org_mpijobs.yaml b/v2/crd/kubeflow.org_mpijobs.yaml index d4f5de762..56d03ec7b 100644 --- a/v2/crd/kubeflow.org_mpijobs.yaml +++ b/v2/crd/kubeflow.org_mpijobs.yaml @@ -37,10 +37,11 @@ spec: mpiImplementation: default: OpenMPI description: MPIImplementation is the MPI implementation. Options - are "OpenMPI" (default) and "Intel". + are "OpenMPI" (default), "Intel" and "MPICH". enum: - OpenMPI - Intel + - MPICH type: string mpiReplicaSpecs: additionalProperties: diff --git a/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go b/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go index 125135b80..6f229bb23 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go +++ b/v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go @@ -451,7 +451,7 @@ func schema_pkg_apis_kubeflow_v2beta1_MPIJobSpec(ref common.ReferenceCallback) c }, "mpiImplementation": { SchemaProps: spec.SchemaProps{ - Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".", + Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".", Type: []string{"string"}, Format: "", }, diff --git a/v2/pkg/apis/kubeflow/v2beta1/swagger.json b/v2/pkg/apis/kubeflow/v2beta1/swagger.json index 52fb45a24..943edbc4e 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/swagger.json +++ b/v2/pkg/apis/kubeflow/v2beta1/swagger.json @@ -223,7 +223,7 @@ ], "properties": { "mpiImplementation": { - "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".", + "description": "MPIImplementation is the MPI implementation. 
Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".", "type": "string" }, "mpiReplicaSpecs": { diff --git a/v2/pkg/apis/kubeflow/v2beta1/types.go b/v2/pkg/apis/kubeflow/v2beta1/types.go index 4ad08389e..d5a68b221 100644 --- a/v2/pkg/apis/kubeflow/v2beta1/types.go +++ b/v2/pkg/apis/kubeflow/v2beta1/types.go @@ -56,8 +56,8 @@ type MPIJobSpec struct { SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"` // MPIImplementation is the MPI implementation. - // Options are "OpenMPI" (default) and "Intel". - // +kubebuilder:validation:Enum:=OpenMPI;Intel + // Options are "OpenMPI" (default), "Intel" and "MPICH". + // +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH // +kubebuilder:default:=OpenMPI MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"` } @@ -78,4 +78,5 @@ type MPIImplementation string const ( MPIImplementationOpenMPI MPIImplementation = "OpenMPI" MPIImplementationIntel MPIImplementation = "Intel" + MPIImplementationMPICH MPIImplementation = "MPICH" ) diff --git a/v2/pkg/apis/kubeflow/validation/validation.go b/v2/pkg/apis/kubeflow/validation/validation.go index 3e98b8187..21a7f9577 100644 --- a/v2/pkg/apis/kubeflow/validation/validation.go +++ b/v2/pkg/apis/kubeflow/validation/validation.go @@ -35,7 +35,8 @@ var ( validMPIImplementations = sets.NewString( string(kubeflow.MPIImplementationOpenMPI), - string(kubeflow.MPIImplementationIntel)) + string(kubeflow.MPIImplementationIntel), + string(kubeflow.MPIImplementationMPICH)) validRestartPolicies = sets.NewString( string(common.RestartPolicyNever), diff --git a/v2/pkg/controller/mpi_job_controller.go b/v2/pkg/controller/mpi_job_controller.go index 57881227f..d0f9f40e6 100644 --- a/v2/pkg/controller/mpi_job_controller.go +++ b/v2/pkg/controller/mpi_job_controller.go @@ -115,6 +115,7 @@ const ( openMPISlotsEnv = "OMPI_MCA_orte_set_default_slots" intelMPISlotsEnv = "I_MPI_PERHOST" + mpichSlotsEnv = "TODO" ) var ( @@ -199,6 +200,16 @@ var ( Value: "-o ConnectionAttempts=10", }, } +
mpichEnvVars = []corev1.EnvVar{ + { + Name: "HYDRA_HOST_FILE", + Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName), + }, + { + Name: "HYDRA_LAUNCH_EXTRA_ARGS", + Value: "-o ConnectionAttempts=10", + }, + } nvidiaDisableEnvVars = []corev1.EnvVar{ {Name: "NVIDIA_VISIBLE_DEVICES"}, {Name: "NVIDIA_DRIVER_CAPABILITIES"}, @@ -560,8 +571,9 @@ func (c *MPIJobController) syncHandler(key string) error { if err != nil { return err } - if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel { - // The Intel implementation requires workers to communicate with the + if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel || + mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH { + // The Intel and MPICH implementations require workers to communicate with the // launcher through its hostname. For that, we create a Service which // has the same name as the launcher's hostname. _, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob)) @@ -1374,6 +1386,12 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev Name: intelMPISlotsEnv, Value: slotsStr, }) + case kubeflow.MPIImplementationMPICH: + container.Env = append(container.Env, mpichEnvVars...) 
+ container.Env = append(container.Env, corev1.EnvVar{ + Name: mpichSlotsEnv, + Value: slotsStr, + }) } container.Env = append(container.Env, diff --git a/v2/pkg/controller/mpi_job_controller_test.go b/v2/pkg/controller/mpi_job_controller_test.go index 37d224d54..fc5df59e2 100644 --- a/v2/pkg/controller/mpi_job_controller_test.go +++ b/v2/pkg/controller/mpi_job_controller_test.go @@ -457,7 +457,7 @@ func TestDoNothingWithInvalidMPIJob(t *testing.T) { } func TestAllResourcesCreated(t *testing.T) { - impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel} + impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH} for _, implementation := range impls { t.Run(string(implementation), func(t *testing.T) { f := newFixture(t) @@ -481,7 +481,8 @@ func TestAllResourcesCreated(t *testing.T) { for i := 0; i < 5; i++ { f.expectCreatePodAction(fmjc.newWorker(mpiJobCopy, i)) } - if implementation == kubeflow.MPIImplementationIntel { + if implementation == kubeflow.MPIImplementationIntel || + implementation == kubeflow.MPIImplementationMPICH { f.expectCreateServiceAction(newLauncherService(mpiJobCopy)) } f.expectCreateJobAction(fmjc.newLauncherJob(mpiJobCopy)) From 98febe8ded3cb88e92d57c2177bf5ad1acd056e6 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 12:28:10 +0100 Subject: [PATCH 07/11] Correct the source image used by MPICH docker image --- build/base/mpich.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/base/mpich.Dockerfile b/build/base/mpich.Dockerfile index 9189e0bf2..a9c843eeb 100644 --- a/build/base/mpich.Dockerfile +++ b/build/base/mpich.Dockerfile @@ -1,4 +1,4 @@ -FROM mpich-base +FROM mpioperator/base RUN apt update \ && apt install -y --no-install-recommends mpich dnsutils \ From c83c7e734aa0c8cadd370c44dd21e388ba9f7fdd Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 
2022 21:13:25 +0100 Subject: [PATCH 08/11] Improve docs --- README.md | 8 +++++++- examples/v2beta1/pi/README.md | 8 +++++++- proposals/scalable-robust-operator.md | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 147fb1393..de0bb5b3d 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,13 @@ total images/sec: 308.27 For a sample that uses Intel MPI, see: ```bash -cat examples/pi/pi-intel.yaml +cat examples/v2beta1/pi/pi-intel.yaml +``` + +For a sample that uses MPICH, see: + +```bash +cat examples/v2beta1/pi/pi-mpich.yaml ``` ## Exposed Metrics diff --git a/examples/v2beta1/pi/README.md b/examples/v2beta1/pi/README.md index f3e3aabbc..c829c15bf 100644 --- a/examples/v2beta1/pi/README.md +++ b/examples/v2beta1/pi/README.md @@ -19,9 +19,15 @@ For Intel MPI: docker build -t mpi-pi . -f intel.Dockerfile ``` +For MPICH: + +```bash +docker build -t mpi-pi . -f mpich.Dockerfile +``` + ## Create MPIJob -Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the +Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the image name from your own registry. Then, run: diff --git a/proposals/scalable-robust-operator.md b/proposals/scalable-robust-operator.md index 01891389c..537f674e0 100644 --- a/proposals/scalable-robust-operator.md +++ b/proposals/scalable-robust-operator.md @@ -148,7 +148,7 @@ following changes: doesn’t support changes to the completions field. This can be supported starting from 1.23. In the meantime, we can replicate the behavior by creating a new Job and doing Pod adoption. - - For Intel MPI, we also need a headless Service to front the launcher, + - For Intel MPI and MPICH, we also need a headless Service to front the launcher, because workers communicate back to the launcher using its hostname. - **Revert the use of the Job API for the launcher.** - The Job controller handles retries when the launcher or any of the workers fail. 
From e8c7520540e48721de7ba5b50e6d4030ac86f35a Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 21:13:53 +0100 Subject: [PATCH 09/11] Add mpich docker images to makefile --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index f11324c77..b3033c986 100644 --- a/Makefile +++ b/Makefile @@ -101,6 +101,9 @@ test_images: ${IMG_BUILDER} build -t mpioperator/intel build/base -f build/base/intel.Dockerfile ${IMG_BUILDER} build -t mpioperator/intel-builder build/base -f build/base/intel-builder.Dockerfile ${IMG_BUILDER} build -t mpioperator/mpi-pi:intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpich build/base -f build/base/mpich.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpich-builder build/base -f build/base/mpich-builder.Dockerfile + ${IMG_BUILDER} build -t mpioperator/mpi-pi:mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile .PHONY: tidy tidy: From 837cb3daf3c670cde6e068dc48423cf119780a3b Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Tue, 11 Oct 2022 21:14:38 +0100 Subject: [PATCH 10/11] Initial version for testing --- v2/test/e2e/e2e_suite_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/v2/test/e2e/e2e_suite_test.go b/v2/test/e2e/e2e_suite_test.go index 072de973d..a19ac1caa 100644 --- a/v2/test/e2e/e2e_suite_test.go +++ b/v2/test/e2e/e2e_suite_test.go @@ -45,6 +45,7 @@ const ( defaultKindImage = "kindest/node:v1.21.2" openMPIImage = "mpioperator/mpi-pi:openmpi" intelMPIImage = "mpioperator/mpi-pi:intel" + mpichMPIImage = "mpioperator/mpi-pi:mpich" rootPath = "../../.." 
kubectlPath = rootPath + "/bin/kubectl" operatorManifestsPath = rootPath + "/manifests/overlays/dev" From c7b3faf6494fd7e6aac4ca2d983dc568550935f2 Mon Sep 17 00:00:00 2001 From: Mateusz Kubica Date: Wed, 7 Jun 2023 05:39:59 +0100 Subject: [PATCH 11/11] Temporary: manual trigger --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b21e3bbc1..d90db1ff9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,5 @@ name: build -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] jobs: validate: name: Validate