Skip to content

Commit 208f4c1

Browse files
authored
Azure ci pipeline for distributed environment tests (microsoft#5881)
1 parent 353e071 commit 208f4c1

File tree

5 files changed

+100
-5
lines changed

5 files changed

+100
-5
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License.
4+
5+
import sys
6+
import argparse
7+
8+
from _test_commons import run_subprocess
9+
10+
import logging
11+
12+
# Configure logging once at import time: DEBUG level, with each record
# prefixed by timestamp, logger name and level.
logging.basicConfig(
13+
format="%(asctime)s %(name)s [%(levelname)s] - %(message)s",
14+
level=logging.DEBUG)
15+
# Logger used by this script's main().
log = logging.getLogger("DistributedTests")
16+
17+
def parse_arguments(argv=None):
    """Parse the command-line options of the distributed tests script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which matches the
            behavior of the original zero-argument call.

    Returns:
        argparse.Namespace with a ``cwd`` attribute; it is ``None`` when
        ``--cwd`` is not supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--cwd", help="Path to the current working directory")
    return parser.parse_args(argv)
21+
22+
def main():
    """Entry point of the distributed tests pipeline.

    The command line is parsed first so that ``--help`` and malformed
    arguments are handled by argparse even on a machine where torch is not
    installed or fewer than two GPUs are visible. Only then is the GPU count
    checked.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if torch reports fewer than 2 CUDA devices.
    """
    args = parse_arguments()
    # Not consumed yet; kept for the test suite that the TODO below will add.
    cwd = args.cwd

    # Imported inside the function so argument parsing does not depend on torch.
    import torch
    ngpus = torch.cuda.device_count()

    if ngpus < 2:
        raise RuntimeError("Cannot run distributed tests with less than 2 gpus.")

    log.info("Running distributed tests pipeline")

    # TODO: Add distributed test suite here.

    return 0
37+
38+
39+
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == "__main__":
40+
sys.exit(main())
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
trigger: none
2+
3+
jobs:
4+
- job: Onnxruntime_Linux_GPU_Distributed_Test
5+
6+
timeoutInMinutes: 120
7+
pool: 'Linux-Multi-GPU-V100'
8+
9+
steps:
10+
- checkout: self
11+
clean: true
12+
submodules: recursive
13+
14+
- template: templates/run-docker-build-steps.yml
15+
parameters:
16+
RunDockerBuildArgs: |
17+
-o ubuntu16.04 -d gpu -r $(Build.BinariesDirectory) \
18+
-t onnxruntime_distributed_tests_image \
19+
-x " \
20+
--config RelWithDebInfo \
21+
--enable_training \
22+
--update --build \
23+
--build_wheel \
24+
" \
25+
-m
26+
DisplayName: 'Build'
27+
28+
# Run all distributed tests inside the image built by the previous step, with every GPU exposed to the container
29+
- script: |
30+
docker run \
31+
--gpus all \
32+
--shm-size=1024m \
33+
--rm \
34+
--volume $(Build.SourcesDirectory):/onnxruntime_src \
35+
--volume $(Build.BinariesDirectory):/build \
36+
onnxruntime_distributed_tests_image \
37+
/build/RelWithDebInfo/launch_test.py \
38+
--cmd_line_with_args "python orttraining_distributed_tests.py" \
39+
--cwd /build/RelWithDebInfo \
40+
displayName: 'Run orttraining_distributed_tests.py'
41+
condition: succeededOrFailed()
42+
timeoutInMinutes: 30
43+
44+
- template: templates/component-governance-component-detection-steps.yml
45+
parameters:
46+
condition: 'succeeded'
47+
48+
- template: templates/clean-agent-build-directory-step.yml

tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04
22

33
ARG PYTHON_VERSION=3.6
44
ARG BUILD_EXTR_PAR
5+
ARG INSTALL_DEPS_EXTRA_ARGS
56

67
ADD scripts /tmp/scripts
78
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && \
8-
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu -x "$BUILD_EXTR_PAR" && \
9+
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu -x "$BUILD_EXTR_PAR" $INSTALL_DEPS_EXTRA_ARGS && \
910
rm -rf /tmp/scripts
1011

1112
WORKDIR /root

tools/ci_build/github/linux/docker/scripts/install_deps.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
set -e -x
33

44
SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
5+
INSTALL_DEPS_DISTRIBUTED_SETUP=false
56

6-
while getopts p:d:x: parameter_Option
7+
while getopts p:d:x:m parameter_Option
78
do case "${parameter_Option}"
89
in
910
p) PYTHON_VER=${OPTARG};;
1011
d) DEVICE_TYPE=${OPTARG};;
1112
x) BUILD_EXTR_PAR=${OPTARG};;
13+
# -m: flag without argument; requests the distributed setup (checked further down before sourcing install_openmpi.sh)
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
1214
esac
1315
done
1416

@@ -113,7 +115,7 @@ if [ $DEVICE_TYPE = "gpu" ]; then
113115
if [[ $BUILD_EXTR_PAR = *--enable_training* ]]; then
114116
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt}
115117

116-
if [[ $BUILD_EXTR_PAR = *--enable_training_python_frontend_e2e_tests* || $BUILD_EXTR_PAR = *enable_training_pipeline_e2e_tests* ]]; then
118+
if [[ $BUILD_EXTR_PAR = *--enable_training_python_frontend_e2e_tests* || $BUILD_EXTR_PAR = *enable_training_pipeline_e2e_tests* || $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then
117119
source ${0/%install_deps.sh/install_openmpi.sh}
118120
fi
119121
fi

tools/ci_build/github/linux/run_dockerbuild.sh

+6-2
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
55
SOURCE_ROOT=$(realpath $SCRIPT_DIR/../../../../)
66
CUDA_VER=cuda10.1-cudnn7.6
77
YOCTO_VERSION="4.19"
8+
INSTALL_DEPS_DISTRIBUTED_SETUP=false
89
ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV="ALLOW_RELEASED_ONNX_OPSET_ONLY="$ALLOW_RELEASED_ONNX_OPSET_ONLY
910
echo "ALLOW_RELEASED_ONNX_OPSET_ONLY environment variable is set as "$ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV
1011

11-
while getopts c:o:d:r:p:x:a:v:y:t: parameter_Option
12+
while getopts c:o:d:r:p:x:a:v:y:t:m parameter_Option
1213
do case "${parameter_Option}"
1314
in
1415
#android, ubuntu16.04, manylinux2010, ubuntu18.04, CentOS7
@@ -31,6 +32,8 @@ y) YOCTO_VERSION=${OPTARG};;
3132
# an additional name for the resulting docker image (created with "docker tag")
3233
# this is useful for referencing the image outside of this script
3334
t) EXTRA_IMAGE_TAG=${OPTARG};;
35+
# install distributed setup dependencies
36+
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
3437
esac
3538
done
3639

@@ -86,8 +89,9 @@ else
8689
if [ $CUDA_VER = "cuda9.1-cudnn7.1" ]; then
8790
DOCKER_FILE=Dockerfile.ubuntu_gpu_cuda9
8891
fi
92+
[[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]] && INSTALL_DEPS_EXTRA_ARGS="-m" || INSTALL_DEPS_EXTRA_ARGS=""
8993
$GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \
90-
--docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg BUILD_EXTR_PAR=\"${BUILD_EXTR_PAR}\"" \
94+
--docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg BUILD_EXTR_PAR=\"${BUILD_EXTR_PAR}\" --build-arg INSTALL_DEPS_EXTRA_ARGS=${INSTALL_DEPS_EXTRA_ARGS}" \
9195
--dockerfile $DOCKER_FILE --context .
9296
elif [ $BUILD_DEVICE = "tensorrt" ]; then
9397
# TensorRT container release 20.07

0 commit comments

Comments
 (0)