Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPICH support #478

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: build
on: [push, pull_request]
on: [push, pull_request, workflow_dispatch]
jobs:
validate:
name: Validate
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ test_images:
${IMG_BUILDER} build -t mpioperator/intel build/base -f build/base/intel.Dockerfile
${IMG_BUILDER} build -t mpioperator/intel-builder build/base -f build/base/intel-builder.Dockerfile
${IMG_BUILDER} build -t mpioperator/mpi-pi:intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile
${IMG_BUILDER} build -t mpioperator/mpich build/base -f build/base/mpich.Dockerfile
${IMG_BUILDER} build -t mpioperator/mpich-builder build/base -f build/base/mpich-builder.Dockerfile
${IMG_BUILDER} build -t mpioperator/mpi-pi:mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile

.PHONY: tidy
tidy:
Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,13 @@ total images/sec: 308.27
For a sample that uses Intel MPI, see:

```bash
cat examples/pi/pi-intel.yaml
cat examples/v2beta1/pi/pi-intel.yaml
```

For a sample that uses MPICH, see:

```bash
cat examples/v2beta1/pi/pi-mpich.yaml
```

## Exposed Metrics
Expand Down
7 changes: 7 additions & 0 deletions build/base/mpich-builder.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Builder stage image: provides a C++ toolchain plus MPICH development
# headers/libraries for compiling MPI applications (see the pi example's
# mpich.Dockerfile, which builds FROM this image).
FROM debian:buster as builder

# Install the compiler and MPICH dev package; drop the apt lists afterwards
# to keep the layer small.
RUN apt update \
&& apt install -y --no-install-recommends \
g++ \
libmpich-dev \
&& rm -rf /var/lib/apt/lists/*
31 changes: 31 additions & 0 deletions build/base/mpich-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

# Wait until a hostname becomes DNS-resolvable, retrying with exponential
# backoff (0.1s doubled each attempt, up to max_retry attempts). Best-effort:
# always returns 0 so the entrypoint proceeds either way.
function resolve_host() {
  host="$1"
  check="nslookup $host"
  max_retry=10
  counter=0
  backoff=0.1
  until $check > /dev/null
  do
    if [ $counter -eq $max_retry ]; then
      echo "Couldn't resolve $host"
      return
    fi
    sleep $backoff
    echo "Couldn't resolve $host... Retrying"
    ((counter++))
    # Double the backoff; awk handles the fractional arithmetic bash can't.
    backoff=$(echo - | awk "{print $backoff + $backoff}")
  done
  echo "Resolved $host"
}

# Only the launcher needs to wait for DNS: it must resolve its own hostname
# and every worker listed in the hostfile before mpirun can start.
if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then
  resolve_host "$HOSTNAME"
  # Redirect the hostfile into the loop directly: avoids the useless `cat`
  # pipeline (and its subshell); -r keeps backslashes in hostnames literal;
  # quoting "$host" prevents word splitting and globbing.
  while read -r host
  do
    resolve_host "$host"
  done < /etc/mpi/hostfile
fi

# Hand off to the container's command (e.g. mpirun on the launcher,
# sshd on the workers), replacing this shell.
exec "$@"
8 changes: 8 additions & 0 deletions build/base/mpich.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Runtime image for MPICH jobs, layered on the operator's shared base image.
FROM mpioperator/base

# mpich provides the MPI runtime (mpirun/hydra); dnsutils provides nslookup,
# which the entrypoint uses to wait for hostnames to become resolvable.
RUN apt update \
&& apt install -y --no-install-recommends mpich dnsutils \
&& rm -rf /var/lib/apt/lists/*

# The entrypoint resolves the launcher/worker hostnames before exec'ing the
# container's command.
COPY mpich-entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
1 change: 1 addition & 0 deletions deploy/v2beta1/mpi-operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ spec:
enum:
- OpenMPI
- Intel
- MPICH
type: string
mpiReplicaSpecs:
properties:
Expand Down
8 changes: 7 additions & 1 deletion examples/v2beta1/pi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ For Intel MPI:
docker build -t mpi-pi . -f intel.Dockerfile
```

For MPICH:

```bash
docker build -t mpi-pi . -f mpich.Dockerfile
```

## Create MPIJob

Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the
Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the
image name from your own registry.

Then, run:
Expand Down
8 changes: 8 additions & 0 deletions examples/v2beta1/pi/mpich.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build stage: compile the pi example with MPICH's C++ compiler wrapper.
FROM mpioperator/mpich-builder as builder

COPY pi.cc /src/pi.cc
RUN mpic++ /src/pi.cc -o /pi

# Runtime stage: copy only the compiled binary onto the slim MPICH runtime
# image, keeping the final image free of the build toolchain.
FROM mpioperator/mpich

COPY --from=builder /pi /home/mpiuser/pi
54 changes: 54 additions & 0 deletions examples/v2beta1/pi/pi-mpich.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# MPIJob running the pi example with the MPICH implementation.
# NOTE(review): indentation was lost in this capture; restore standard
# Kubernetes manifest nesting before use.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: pi
spec:
slotsPerWorker: 1
runPolicy:
cleanPodPolicy: Running
sshAuthMountPath: /home/mpiuser/.ssh
# Selects MPICH instead of the default OpenMPI; the controller then also
# creates a headless Service fronting the launcher, since MPICH workers
# communicate back to the launcher via its hostname.
mpiImplementation: MPICH
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
containers:
- image: mpioperator/mpi-pi:mpich
imagePullPolicy: Always
name: mpi-launcher
securityContext:
runAsUser: 1000
args:
- mpirun
- -n
- "2"
- /home/mpiuser/pi
resources:
limits:
cpu: 1
memory: 1Gi
Worker:
replicas: 2
template:
spec:
containers:
- image: mpioperator/mpi-pi:mpich
imagePullPolicy: Always
name: mpi-worker
securityContext:
runAsUser: 1000
# NOTE(review): `command:` is empty (null), so the image's entrypoint
# (mpich-entrypoint.sh, which exec's "$@") receives the args below and
# runs sshd — confirm this is intentional rather than a dropped value.
command:
args:
- /usr/sbin/sshd
- -De
- -f
- /home/mpiuser/.sshd_config
# Workers are ready once sshd accepts connections on port 2222.
readinessProbe:
tcpSocket:
port: 2222
initialDelaySeconds: 2
resources:
limits:
cpu: 1
memory: 1Gi
2 changes: 1 addition & 1 deletion manifests/base/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ spec:
type: string
mpiImplementation:
type: string
enum: ["OpenMPI", "Intel"]
enum: ["OpenMPI", "Intel", "MPICH"]
mpiReplicaSpecs:
type: object
properties:
Expand Down
2 changes: 1 addition & 1 deletion proposals/scalable-robust-operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ following changes:
doesn’t support changes to the completions field. This can be supported
starting from 1.23. In the meantime, we can replicate the behavior by
creating a new Job and doing Pod adoption.
- For Intel MPI, we also need a headless Service to front the launcher,
- For Intel MPI and MPICH, we also need a headless Service to front the launcher,
because workers communicate back to the launcher using its hostname.
- **Revert the use of the Job API for the launcher.**
- The Job controller handles retries when the launcher or any of the workers fail.
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion v2/crd/kubeflow.org_mpijobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,11 @@ spec:
mpiImplementation:
default: OpenMPI
description: MPIImplementation is the MPI implementation. Options
are "OpenMPI" (default) and "Intel".
are "OpenMPI" (default), "Intel" and "MPICH".
enum:
- OpenMPI
- Intel
- MPICH
type: string
mpiReplicaSpecs:
additionalProperties:
Expand Down
2 changes: 1 addition & 1 deletion v2/pkg/apis/kubeflow/v2beta1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion v2/pkg/apis/kubeflow/v2beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@
],
"properties": {
"mpiImplementation": {
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
"type": "string"
},
"mpiReplicaSpecs": {
Expand Down
5 changes: 3 additions & 2 deletions v2/pkg/apis/kubeflow/v2beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ type MPIJobSpec struct {
SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`

// MPIImplementation is the MPI implementation.
// Options are "OpenMPI" (default) and "Intel".
// +kubebuilder:validation:Enum:=OpenMPI;Intel
// Options are "OpenMPI" (default), "Intel" and "MPICH".
// +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH
// +kubebuilder:default:=OpenMPI
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
}
Expand All @@ -78,4 +78,5 @@ type MPIImplementation string
const (
MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
MPIImplementationIntel MPIImplementation = "Intel"
MPIImplementationMPICH MPIImplementation = "MPICH"
)
3 changes: 2 additions & 1 deletion v2/pkg/apis/kubeflow/validation/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ var (

validMPIImplementations = sets.NewString(
string(kubeflow.MPIImplementationOpenMPI),
string(kubeflow.MPIImplementationIntel))
string(kubeflow.MPIImplementationIntel),
string(kubeflow.MPIImplementationMPICH))

validRestartPolicies = sets.NewString(
string(common.RestartPolicyNever),
Expand Down
22 changes: 20 additions & 2 deletions v2/pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ const (

openMPISlotsEnv = "OMPI_MCA_orte_set_default_slots"
intelMPISlotsEnv = "I_MPI_PERHOST"
mpichSlotsEnv = "TODO"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does anyone know how to figure out what should be provided here?
Interestingly, the pi example works as is, i.e. with mpichSlotsEnv == TODO

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be the name of the environment variable for configuring the number of workers per host.

It works in the sense that it runs, because this would just set an environment variable called TODO. But it's not doing the intended outcome.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a -ppn command line option (both for Intel and MPICH), but I don't think it can be controlled via an environment variable

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, it's hard to overwrite the command line, because users might be using wrappers around mpiexec. That's why environment variables are better. Another option that OpenMPI and Intel MPI support is to set the slots in the hostfile.

Actually, it might be useful to use that in general because of this #445

)

var (
Expand Down Expand Up @@ -199,6 +200,16 @@ var (
Value: "-o ConnectionAttempts=10",
},
}
mpichEnvVars = []corev1.EnvVar{
{
Name: "HYDRA_HOST_FILE",
Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
},
{
Name: "HYDRA_LAUNCH_EXTRA_ARGS",
Value: "-o ConnectionAttempts=10",
},
}
nvidiaDisableEnvVars = []corev1.EnvVar{
{Name: "NVIDIA_VISIBLE_DEVICES"},
{Name: "NVIDIA_DRIVER_CAPABILITIES"},
Expand Down Expand Up @@ -560,8 +571,9 @@ func (c *MPIJobController) syncHandler(key string) error {
if err != nil {
return err
}
if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel {
// The Intel implementation requires workers to communicate with the
if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel ||
mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH {
// The Intel and MPICH implementations require workers to communicate with the
// launcher through its hostname. For that, we create a Service which
// has the same name as the launcher's hostname.
_, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
Expand Down Expand Up @@ -1374,6 +1386,12 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev
Name: intelMPISlotsEnv,
Value: slotsStr,
})
case kubeflow.MPIImplementationMPICH:
container.Env = append(container.Env, mpichEnvVars...)
container.Env = append(container.Env, corev1.EnvVar{
Name: mpichSlotsEnv,
Value: slotsStr,
})
}

container.Env = append(container.Env,
Expand Down
5 changes: 3 additions & 2 deletions v2/pkg/controller/mpi_job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ func TestDoNothingWithInvalidMPIJob(t *testing.T) {
}

func TestAllResourcesCreated(t *testing.T) {
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel}
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH}
for _, implementation := range impls {
t.Run(string(implementation), func(t *testing.T) {
f := newFixture(t)
Expand All @@ -481,7 +481,8 @@ func TestAllResourcesCreated(t *testing.T) {
for i := 0; i < 5; i++ {
f.expectCreatePodAction(fmjc.newWorker(mpiJobCopy, i))
}
if implementation == kubeflow.MPIImplementationIntel {
if implementation == kubeflow.MPIImplementationIntel ||
implementation == kubeflow.MPIImplementationMPICH {
f.expectCreateServiceAction(newLauncherService(mpiJobCopy))
}
f.expectCreateJobAction(fmjc.newLauncherJob(mpiJobCopy))
Expand Down
1 change: 1 addition & 0 deletions v2/test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const (
defaultKindImage = "kindest/node:v1.21.2"
openMPIImage = "mpioperator/mpi-pi:openmpi"
intelMPIImage = "mpioperator/mpi-pi:intel"
mpichMPIImage = "mpioperator/mpi-pi:mpich"
rootPath = "../../.."
kubectlPath = rootPath + "/bin/kubectl"
operatorManifestsPath = rootPath + "/manifests/overlays/dev"
Expand Down