Skip to content

Commit

Permalink
feat(nvidia): build pytorch to get older cuda compute capabilities and setup arm64 support
Browse files Browse the repository at this point in the history
  • Loading branch information
ndbaker1 committed Feb 12, 2025
1 parent b5a9e87 commit 5a2805e
Show file tree
Hide file tree
Showing 23 changed files with 121 additions and 60 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training
- run: |
docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
build-image-nvidia-inference:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference
- run: |
docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
build-image-neuron-training:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 1 addition & 1 deletion internal/deployers/eksapi/kubeconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ package eksapi

import (
"bytes"
"fmt"
"os"
"text/template"
"fmt"

"k8s.io/klog"
)
Expand Down
6 changes: 4 additions & 2 deletions test/cases/nvidia-training/bert_training_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ func TestBertTraining(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "bert-training-launcher", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithTimeout(time.Minute*20))
wait.WithTimeout(time.Minute*20),
wait.WithContext(ctx),
)
if err != nil {
t.Fatal(err)
t.Error(err)
}

err = printJobLogs(ctx, cfg, "default", "bert-training-launcher")
Expand Down
13 changes: 6 additions & 7 deletions test/cases/nvidia-training/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"log"
"os"
"os/signal"
"slices"
"testing"
"time"
Expand Down Expand Up @@ -37,7 +38,10 @@ func TestMain(m *testing.M) {
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)

ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
testenv = env.NewWithConfig(cfg).WithContext(ctx)

manifests := [][]byte{
nvidiaDevicePluginManifest,
Expand Down Expand Up @@ -147,16 +151,11 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
return ctx, fmt.Errorf("no nodes found in the cluster")
}

singleNodeType := true
for i := 1; i < len(nodes.Items); i++ {
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
singleNodeType = false
break
return ctx, fmt.Errorf("node types are not the same, all node types must be the same in the cluster")
}
}
if !singleNodeType {
return ctx, fmt.Errorf("node types are not the same, all node types must be the same in the cluster")
}

if *nodeType != "" {
count := 0
Expand Down
12 changes: 5 additions & 7 deletions test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import (
"fmt"
"log"
"os"
"os/signal"
"slices"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/internal/e2e"
"github.com/aws/aws-sdk-go-v2/aws"
Expand All @@ -31,6 +31,7 @@ var (
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
pytorchImage *string
skipUnitTestSubcommand *string
nodeCount int
gpuPerNode int
Expand Down Expand Up @@ -99,15 +100,11 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
return ctx, err
}

singleNodeType := true
for i := 1; i < len(nodes.Items)-1; i++ {
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
singleNodeType = false
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
}
}
if !singleNodeType {
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
}

if *nodeType != "" {
for _, v := range nodes.Items {
Expand Down Expand Up @@ -135,6 +132,7 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
func TestMain(m *testing.M) {
nodeType = flag.String("nodeType", "", "node type for the tests")
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
pytorchImage = flag.String("pytorchImage", "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2", "pytorch cuda image for single node tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
skipUnitTestSubcommand = flag.String("skipUnitTestSubcommand", "", "optional command to skip specified unit test, `-s test1|test2|...`")
Expand All @@ -143,7 +141,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
testenv = testenv.WithContext(ctx)

Expand Down
4 changes: 4 additions & 0 deletions test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@ spec:
- hpl.sh
- --mem-affinity
- 0:0:0:0:1:1:1:1
# --cpu-affinity needs to be tuned depending on the number of CPUs
# available on the instance type.
- --cpu-affinity
- 0-13:14-27:28-41:42-55:56-69:70-83:84-97:98-111
- --no-multinode
- --dat
- hpl-linux-x86_64/sample-dat/HPL-dgx-1N.dat
# TODO: the path differs for arm64
#- hpl-linux-aarch64-gpu/sample-dat/HPL-dgx-1N.dat
volumeMounts:
- mountPath: /dev/shm
name: dshm
Expand Down
10 changes: 7 additions & 3 deletions test/cases/nvidia/manifests/job-unit-test-single-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@ spec:
- /bin/bash
- ./gpu_unit_tests/unit_test
env:
- name: SKIP_TESTS_SUBCOMMAND
value: {{.SkipTestSubcommand}}
- name: SKIP_TESTS_SUBCOMMAND
value: {{.SkipTestSubcommand}}
# because we started building these from source, this is just a
# regular binary.
- name: DEMO_SUITE_DIR
value: /usr/bin
imagePullPolicy: Always
resources:
limits:
Expand All @@ -29,4 +33,4 @@ spec:
cpu: "1"
memory: 1Gi
restartPolicy: Never
backoffLimit: 4
backoffLimit: 4
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
spec:
restartPolicy: OnFailure
containers:
- image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2
- image: {{.PytorchTestImage}}
name: gpu-test
command:
- mpirun
Expand Down Expand Up @@ -48,7 +48,7 @@ spec:
- MXNET_CUDNN_AUTOTUNE_DEFAULT=0
- python
- -c
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 1")
- import os; os.system("git clone https://github.com/pytorch/examples.git pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python pytorch-examples/mnist/main.py --epochs 1")
resources:
limits:
nvidia.com/gpu: 1
10 changes: 9 additions & 1 deletion test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,15 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
t.Log("Applying single node manifest")
err := fwext.ApplyManifests(cfg.Client().RESTConfig(), mpiJobPytorchTrainingSingleNodeManifest)
renderedSingleNodeManifest, err := fwext.RenderManifests(mpiJobPytorchTrainingSingleNodeManifest, struct {
PytorchTestImage string
}{
PytorchTestImage: *pytorchImage,
})
if err != nil {
t.Fatal(err)
}
err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
Expand Down
8 changes: 4 additions & 4 deletions test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
})
if err != nil {
t.Fatal(err)
t.Error(err)
}
t.Log("Test log for unit-test-job:")
t.Log(log)
err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
t.Error(err)
}
return ctx
}).
Expand Down Expand Up @@ -120,13 +120,13 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"},
})
if err != nil {
t.Fatal(err)
t.Error(err)
}
t.Log("Test log for hpc-benckmarks-job:")
t.Log(log)
err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobHpcBenchmarksSingleNodeManifest)
if err != nil {
t.Fatal(err)
t.Error(err)
}
return ctx
}).
Expand Down
20 changes: 20 additions & 0 deletions test/images/nvidia-inference/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8

ARG PYTORCH_BRANCH=v2.5.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

###############################################################################
# 1) System packages
###############################################################################
Expand Down Expand Up @@ -75,3 +78,20 @@ WORKDIR /app
COPY infer.py /app/
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

###############################################################################
# 4) Install Pytorch from Source
###############################################################################
# Environment variables that make the CUDA toolkit (including nvcc) discoverable by the PyTorch build
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
# this list could be minimized based on the supported GPUs
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

RUN pip3 install typing-extensions sympy
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch
1 change: 0 additions & 1 deletion test/images/nvidia-inference/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch==2.5
transformers==4.33
numpy==1.26
26 changes: 20 additions & 6 deletions test/images/nvidia-training/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ ENV DEBIAN_FRONTEND=noninteractive
# Set default values for MASTER_ADDR and MASTER_PORT (NUM_GPUS_PER_NODE is no longer set by this commit; train.py now reads OMPI_COMM_WORLD_LOCAL_RANK)
ENV MASTER_ADDR=127.0.0.1
ENV MASTER_PORT=12355
ENV NUM_GPUS_PER_NODE=8

# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

ARG PYTORCH_BRANCH=v2.3.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

RUN apt-get update \
&& apt-get upgrade -y \
Expand Down Expand Up @@ -58,10 +59,23 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
&& pip --no-cache-dir install --upgrade \
pip \
setuptools

# Install Pytorch from Source
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

RUN pip install typing-extensions sympy pyyaml
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch

# Set the working directory in the container
WORKDIR /app

Expand All @@ -74,7 +88,7 @@ RUN python -m pip install --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG AWS_OFI_NCCL_VERSION=1.13.2
ARG NCCL_TESTS_VERSION=master

RUN apt-get update -y && \
Expand All @@ -94,7 +108,7 @@ RUN mkdir -p /var/run/sshd && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI, CUDA, EFA, and NCCL
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
Expand All @@ -107,7 +121,7 @@ RUN cd $HOME \

# Install NCCL (version specified)
RUN apt-key del 7fa2af80 && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(uname -m | sed 's/aarch64/sbsa/')/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

Expand Down
5 changes: 2 additions & 3 deletions test/images/nvidia-training/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch==2.3
transformers==4.29
numpy==1.23
transformers==4.33
numpy==1.26
3 changes: 1 addition & 2 deletions test/images/nvidia-training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,7 @@ def main():
# Retrieve environment variables
rank = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
world_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
num_gpus_per_node = int(os.getenv("NUM_GPUS_PER_NODE", "8"))
local_rank = rank % num_gpus_per_node
local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0"))

print(f"Process started for rank {rank} with local rank {local_rank}")

Expand Down
Loading

0 comments on commit 5a2805e

Please sign in to comment.