forked from draskot/Vini
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_number_gpu_cards_per_node
executable file
·52 lines (39 loc) · 2.26 KB
/
get_number_gpu_cards_per_node
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# get_number_gpu_cards_per_node
#
# Determines how many NVIDIA GPU cards a node of the given partition has by
# submitting a short Slurm job that runs `lspci` on one node, then stores the
# number in $WORKDIR/gpus. If no job output shows up, the user is asked to
# type the number in instead.
#
# Usage: get_number_gpu_cards_per_node <partition_key> <walltime_seconds>
#   $1  key selecting $WORKDIR/<key>_partition, which holds the partition name
#   $2  seconds, used both as the job time limit and as the wait timeout
# Environment:
#   WORKDIR   directory holding the configuration files and the job output
#   vini_dir  directory containing the wait_until_jobs_finish helper
# Outputs:
#   $WORKDIR/gpus                       the detected (or typed-in) card count
#   $WORKDIR/get_number_gpus_per_node   the generated batch script
#   $WORKDIR/gpus.out, gpus.err         stdout/stderr of the batch job

if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <partition_key> <walltime_seconds>" >&2
  exit 2
fi

# Without these the script would operate on "/" (e.g. rm -f /gpus.*).
: "${WORKDIR:?WORKDIR must be set}"
: "${vini_dir:?vini_dir must be set}"

partition_key=$1
walltime=$2

job_cancel=$(cat "$WORKDIR/job_cancel")
partition=$(cat "$WORKDIR/${partition_key}_partition")
# NOTE(review): the excluded-nodes file is looked up by the resolved partition
# name, not by $1 -- kept exactly as before; confirm this is the intended key.
excluded_nodes=$(cat "$WORKDIR/excluded_${partition}_nodes")

job_script=$WORKDIR/get_number_gpus_per_node

echo "Trying to determine the number of Nvidia gpu cards per ${partition} node. This may last up to $walltime seconds, please do not interrupt."

# Generate the batch script in one go instead of many separate appends.
{
  echo "#! /bin/bash"
  echo "#SBATCH --job-name=gpus"
  echo "#SBATCH --output=$WORKDIR/gpus.out"
  echo "#SBATCH --error=$WORKDIR/gpus.err"
  echo "#SBATCH --time=00:00:$walltime"
  echo "#SBATCH --cpus-per-task=1"
  if [[ $partition == gpu && -e /etc/slurm/gres.conf ]]; then
    echo "#SBATCH --gres=gpu:0"
  fi
  echo "#SBATCH --mem=2gb"
  echo "#SBATCH --partition=$partition"
  # An empty list excludes nothing, so skip the directive rather than emit
  # a bare "--exclude=".
  if [ -n "$excluded_nodes" ]; then
    echo "#SBATCH --exclude=${excluded_nodes}"
  fi
  echo "WORKDIR=$WORKDIR"
  echo "lspci"
} > "$job_script"
chmod u+x "$job_script"

# Remove output of earlier runs so that the existence test below only ever
# sees the output of the job submitted now.
rm -f "$WORKDIR"/gpus.*
sbatch "$job_script"
timeout "${walltime}s" "$vini_dir/wait_until_jobs_finish"

if [ -e "$WORKDIR/gpus.out" ]; then
  # Count only display-class PCI functions. A plain match on "NVIDIA" also
  # counts the audio/USB/serial functions of the same card and NVSwitch
  # bridges, which inflates the number of cards.
  gpus=$(grep -cE '(VGA compatible|3D|Display) controller.*NVIDIA' "$WORKDIR/gpus.out")
  echo "Found $gpus GPU cards per gpu node."
else
  # Left unquoted on purpose: the stored cancel command may carry arguments.
  ${job_cancel} -u "$USER"
  sh "$vini_dir/wait_until_jobs_finish"
  echo
  read -r -p "Failed to access any ${partition} node. The possible reason is that nodes are temporarily allocated or unavailable. Write the number of gpu cards per ${partition} node here:" gpus
fi

echo "$gpus" > "$WORKDIR/gpus"