This repository was archived by the owner on Sep 26, 2023. It is now read-only.

Commit 1519ad8: FS and slurm clients ok
1 parent 7db0dc9

10 files changed: +241 -22 lines

Diff for: README.md (+55 -7)

@@ -4,14 +4,62 @@ This is a Vagrant environment that provides a minimal virtual HPC cluster with
 some commonly encountered filesystems and tools:
 
 - [X] CentOS 7 Base
-- [ ] 1 master node, 2 compute nodes
-- [ ] NFS exported from master to compute nodes
-- [ ] Lustre exported from master to compute nodes
-- [ ] Slurm 19.05.2 installed and configured
-- [ ] Singularity 3.4.0 installed on all nodes
-
+- [X] 1 master node, 2 compute nodes
+- [X] NFS exported from master to compute nodes
+- [X] Lustre exported from master to compute nodes
+- [X] Slurm 19.05.2 installed and configured
+- [X] Singularity from EPEL
+- [X] Dev tools to build Singularity from source
+- [X] OpenMPI (from repos, 3.0.2-1)
+
+
+## How to use
+
+```
+# Bring up the master
+#
+# This will fail the first time, as the CentOS base image has an IDE controller.
+# We need a SATA controller, and there is no easy idempotent way to add one in a
+# Vagrantfile such that both initial provisioning and re-provisioning will succeed.
+#
+vagrant up master
+
+# Add the SATA controller we need
+VBoxManage storagectl $(cat .vagrant/machines/master/virtualbox/id) --add sata --name "SATA Controller"
+
+# Bring up the master properly now
+vagrant up master
+
+# Bring up the compute nodes
+vagrant up compute01 compute02
+
+```
+
+In this minimal cluster we can be naughty and use the master node as a login
+node. For your experiments, note that currently:
+
+- The cluster has a common `vagrant` user across all nodes - this is the user
+  you should use for Slurm jobs
+- $HOME is not mounted over NFS (yet)
+- Put anything you need across the cluster into `/nfs` or `/lustre`
+- OpenMPI is available as a module via `module load mpi/openmpi3-x86_64`
+
+
+```
+# Log in to master
+$ vagrant ssh
+
+# Run hostname across the cluster (2 nodes)
+$ srun -N 2 hostname
+
+# /nfs & /lustre are mounted across master and nodes
+# Run an interactive session on a node
+# Look in the shared filesystems
+$ srun --pty /bin/bash
+$ ls /nfs /lustre
+```
 
-Caveats:
+## Caveats
 
 * The lustre setup is very simple. There are 1 each of MGS/MDT/OST and they
   are all running on the master VM.
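
Since OpenMPI is exposed as a module and `/nfs` is visible on every node, a small batch job makes a handy end-to-end check. The sketch below is illustrative only: the script name, job name, and output path are made up for this example, and it assumes the module environment from the README note is available in batch jobs.

```
# Hypothetical smoke-test job script written to the shared NFS area
cat > /nfs/hello.sbatch <<'EOF'
#!/bin/bash
#SBATCH --job-name=hello
#SBATCH --nodes=2
#SBATCH --output=/nfs/hello-%j.out
module load mpi/openmpi3-x86_64
mpirun --version
srun hostname
EOF

# Submit it and read the output back from the shared filesystem
sbatch /nfs/hello.sbatch
cat /nfs/hello-*.out
```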

Diff for: Vagrantfile (+26 -8)

@@ -54,11 +54,12 @@ Vagrant.configure(2) do |config|
     # Provisioning scripts
     master.vm.provision "shell", inline: "hostnamectl set-hostname master"
     master.vm.provision "shell", path: "common-1-initial.sh"
-    master.vm.provision "shell", path: "common-2-lustre_kernel.sh"
+    master.vm.provision "shell", path: "common-2-software.sh"
+    master.vm.provision "shell", path: "master-1-lustre_kernel.sh"
     master.vm.provision :reload
-    master.vm.provision "shell", path: "master-1-lustre_fs.sh"
-    master.vm.provision "shell", path: "master-2-nfs_fs.sh"
-    master.vm.provision "shell", path: "master-3-slurm_master.sh"
+    master.vm.provision "shell", path: "master-2-lustre_fs.sh"
+    master.vm.provision "shell", path: "master-3-nfs_fs.sh"
+    master.vm.provision "shell", path: "master-4-slurm_master.sh"
   end
 
   # Compute node 1
@@ -73,11 +74,28 @@ Vagrant.configure(2) do |config|
     end
     compute01.vm.provision "shell", inline: "hostnamectl set-hostname compute01"
     compute01.vm.provision "shell", path: "common-1-initial.sh"
-    compute01.vm.provision "shell", path: "common-2-lustre_kernel.sh"
-    compute01.vm.provision :reload
+    compute01.vm.provision "shell", path: "common-2-software.sh"
     compute01.vm.provision "shell", path: "compute-1-lustre_client.sh"
-    # compute01.vm.provision "shell", path: "compute-2-nfs_client.sh"
-    # compute01.vm.provision "shell", path: "compute-3-slurm_client.sh"
+    compute01.vm.provision "shell", path: "compute-2-nfs_client.sh"
+    compute01.vm.provision "shell", path: "compute-3-slurm_client.sh"
+  end
+
+  # Compute node 2
+  config.vm.define "compute02" do |compute02|
+    compute02.vm.box = "centos/7"
+    compute02.vm.synced_folder ".", "/vagrant", disabled: true
+    compute02.vm.network "private_network", ip: "10.0.4.102", nic_type: "virtio"
+    compute02.vm.provider "virtualbox" do |v|
+      v.memory = 1024
+      v.cpus = 1
+      v.customize ['modifyvm', :id, '--nictype1', 'virtio']
+    end
+    compute02.vm.provision "shell", inline: "hostnamectl set-hostname compute02"
+    compute02.vm.provision "shell", path: "common-1-initial.sh"
+    compute02.vm.provision "shell", path: "common-2-software.sh"
+    compute02.vm.provision "shell", path: "compute-1-lustre_client.sh"
+    compute02.vm.provision "shell", path: "compute-2-nfs_client.sh"
+    compute02.vm.provision "shell", path: "compute-3-slurm_client.sh"
   end
 
 end
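
Because compute02 mirrors compute01, later changes to the provisioning list can be applied to an already-built node rather than rebuilding it. A rough sketch, assuming the machine names defined above:

```
# Re-run the shell provisioners on an existing node
vagrant provision compute02

# Or reboot and re-provision in one step (useful after kernel changes on master)
vagrant reload master --provision

# Confirm the hostnames were set as expected
vagrant ssh compute02 -c hostname
```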

Diff for: common-2-software.sh (new file, +19)

@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#########################################
+# Useful software to have on the cluster
+#########################################
+
+# Utilities
+yum -y install vim-enhanced htop iftop
+
+# Development
+yum -y groupinstall "Development Tools"
+sudo yum install -y git golang openssl-devel libuuid-devel \
+    libseccomp-devel squashfs-tools cryptsetup
+
+# EPEL Singularity
+yum -y install singularity
+
+# OpenMPI
+yum -y install openmpi3
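
A quick, hedged way to confirm these packages landed on a node (exact versions will differ):

```
# Singularity from EPEL
singularity --version

# OpenMPI 3 from the repos, exposed through environment modules
module load mpi/openmpi3-x86_64
mpirun --version

# Build prerequisites for compiling Singularity from source
rpm -q golang squashfs-tools libseccomp-devel
```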

Diff for: compute-1-lustre_client.sh (+27 -3)

@@ -4,11 +4,35 @@
 # Install Lustre Kernel Modules and Setup client
 ################################################
 
-# Install modules into the now running lustre kernel
-yum --nogpgcheck --enablerepo=lustre-client-2.10.0 install \
+# Setup repo
+cat >/etc/yum.repos.d/lustre-repo.repo <<\__EOF
+[lustre-server]
+name=lustre-server
+baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/server
+# exclude=*debuginfo*
+gpgcheck=0
+
+[lustre-client]
+name=lustre-client
+baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/client
+# exclude=*debuginfo*
+gpgcheck=0
+
+[e2fsprogs-wc]
+name=e2fsprogs-wc
+baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7
+# exclude=*debuginfo*
+gpgcheck=0
+__EOF
+
+# Install client modules
+yum -y --nogpgcheck --enablerepo=lustre-client install \
     kmod-lustre-client \
     lustre-client
 
+modprobe lustre
+modprobe lnet
+
 # Setup our lnet network
 if ! lnetctl ping master; then
     lnetctl net add --net tcp0 --if eth1
@@ -18,7 +42,7 @@ fi
 # Setup mounts & mount them
 mkdir -p /lustre
 if ! grep -q "lustre" /etc/fstab; then
-    echo "master@tcp01:/lustre1 /lustre lustre defaults_netdev 0 0" >> /etc/fstab
+    echo "master@tcp0:/lustre1 /lustre lustre defaults,_netdev 0 0" >> /etc/fstab
 fi
 
 if ! mountpoint -q /lustre; then
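
Once this script has run, the lnet network and the mount can be checked from a compute node; a hedged sketch:

```
# Show the tcp0 network configured on eth1
lnetctl net show

# Confirm the client can reach the MGS on master
lnetctl ping master

# The lustre mount should be up and report MDT/OST usage
mountpoint /lustre
lfs df -h /lustre
```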

Diff for: compute-2-nfs_client.sh (new file, +16)

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+################################################
+# Mount NFS share from master
+################################################
+
+# Setup mounts & mount them
+mkdir -p /nfs
+if ! grep -q "nfs" /etc/fstab; then
+    echo "master:/nfsdata /nfs nfs4 defaults,_netdev 0 0" >> /etc/fstab
+fi
+
+if ! mountpoint -q /nfs; then
+    mount /nfs
+fi
+
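
A hedged check that the share is mounted and usable across nodes; the marker file name is illustrative only:

```
# On a compute node: the mount should show up as nfs4
mount -t nfs4
df -h /nfs

# Drop a hypothetical marker file into the share
touch /nfs/hello-from-$(hostname)

# On master (or another node): the marker should be visible
ls -l /nfs
```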

Diff for: compute-3-slurm_client.sh (new file, +71)

@@ -0,0 +1,71 @@
+#!/bin/bash
+
+##############################################################
+# Setup slurm client
+##############################################################
+
+# Setup users
+export MUNGEUSER=991
+groupadd -g $MUNGEUSER munge
+useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge
+export SLURMUSER=992
+groupadd -g $SLURMUSER slurm
+useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
+
+# Install munge
+yum -y install epel-release
+yum -y install munge munge-libs munge-devel
+
+# Create a munge key
+echo "THIS_IS_NOT_A_VERY_SECURE_MUNGE_KEY_BUT_IT_WILL_DO" > /etc/munge/munge.key
+chown munge.munge /etc/munge/munge.key
+chmod 400 /etc/munge/munge.key
+
+# Correct perms & enable service
+chown -R munge: /etc/munge/ /var/log/munge/
+chmod 0700 /etc/munge/ /var/log/munge/
+systemctl enable munge
+systemctl start munge
+
+# Install slurm RPMs (built on master)
+yum -y install /nfs/slurm-rpms/slurm-*.rpm
+
+# Minimal slurm config
+cat <<EOF >> /etc/slurm/slurm.conf
+ControlMachine=master
+ControlAddr=10.0.4.100
+MpiDefault=none
+ProctrackType=proctrack/pgid
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd.pid
+SlurmdSpoolDir=/var/spool/slurmd
+SlurmUser=slurm
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+TaskPlugin=task/none
+FastSchedule=1
+SchedulerType=sched/backfill
+SelectType=select/linear
+AccountingStorageType=accounting_storage/none
+ClusterName=dev
+JobAcctGatherType=jobacct_gather/none
+SlurmctldLogFile=/var/log/slurmctld.log
+SlurmdLogFile=/var/log/slurmd.log
+#
+# COMPUTE NODES
+NodeName=compute01 NodeAddr=10.0.4.101 CPUs=1 State=UNKNOWN
+NodeName=compute02 NodeAddr=10.0.4.102 CPUs=1 State=UNKNOWN
+PartitionName=super Nodes=compute0[1-2] Default=YES MaxTime=INFINITE State=UP
+EOF
+
+# Start slurm
+systemctl enable slurmd
+systemctl start slurmd
+
+# Check sinfo works
+sinfo
+
+# Mark node ready to rock
+scontrol update nodename=$(hostname) State=resume
+
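
Typical hedged sanity checks once the client is up, run on a compute node (the cross-node munge test assumes ssh access to master):

```
# Munge credentials should round-trip locally
munge -n | unmunge

# Optional: verify the shared key against master (assumes ssh access)
munge -n | ssh master unmunge

# slurmd should be running and the node visible in the partition
systemctl status slurmd
sinfo
scontrol show node $(hostname)
```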
File renamed without changes.

Diff for: master-1-lustre_fs.sh renamed to master-2-lustre_fs.sh (+14)

@@ -56,3 +56,17 @@ fi
 if ! mountpoint -q /ost; then
     mount /ost
 fi
+
+# Mount the filesystem to our own host
+mkdir -p /lustre
+if ! grep -q "/lustre" /etc/fstab; then
+    echo "master@tcp0:/lustre1 /lustre lustre defaults,_netdev 0 0" >> /etc/fstab
+fi
+
+if ! mountpoint -q /lustre; then
+    mount /lustre
+fi
+
+# Make perms wide open on it
+chmod 777 /lustre
+
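
On master, both the Lustre targets and the loopback client mount can be inspected; a hedged sketch (the marker file name is hypothetical):

```
# The MGS/MDT/OST targets and the client mount all appear as lustre mounts
mount -t lustre

# Capacity as seen through the client mount
lfs df -h /lustre

# The wide-open perms should let the vagrant user write here
su - vagrant -c 'touch /lustre/marker && ls -l /lustre'
```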

Diff for: master-2-nfs_fs.sh renamed to master-3-nfs_fs.sh (+9)

@@ -31,3 +31,12 @@ systemctl enable nfs-server rpcbind
 echo "/nfsdata 10.0.4.0/24(rw,sync,no_root_squash)" > /etc/exports
 exportfs -r
 
+# Mount the filesystem to our own host
+mkdir -p /nfs
+if ! grep -q "nfs" /etc/fstab; then
+    echo "master:/nfsdata /nfs nfs4 defaults,_netdev 0 0" >> /etc/fstab
+fi
+
+if ! mountpoint -q /nfs; then
+    mount /nfs
+fi
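
A hedged check of the export and the loopback mount on master:

```
# The export should list the 10.0.4.0/24 clients with rw,no_root_squash
exportfs -v
showmount -e localhost

# master mounts its own export under /nfs as well
mountpoint /nfs
df -h /nfs
```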

Diff for: master-3-slurm_master.sh renamed to master-4-slurm_master.sh (+4 -4)

@@ -41,7 +41,7 @@ mkdir -p /nfsdata/slurm-rpms
 cp /root/rpmbuild/RPMS/x86_64/slurm*.rpm /nfsdata/slurm-rpms/
 
 # Correct perms
-mkdir /var/spool/slurmctld
+mkdir -p /var/spool/slurmctld
 chown slurm: /var/spool/slurmctld
 chmod 755 /var/spool/slurmctld
 touch /var/log/slurmctld.log
@@ -52,7 +52,7 @@ chown slurm: /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log
 # Minimal slurm config
 cat <<EOF >> /etc/slurm/slurm.conf
 ControlMachine=master
-ControlAddr=10.0.4.1
+ControlAddr=10.0.4.100
 MpiDefault=none
 ProctrackType=proctrack/pgid
 ReturnToService=1
@@ -73,8 +73,8 @@ SlurmctldLogFile=/var/log/slurmctld.log
 SlurmdLogFile=/var/log/slurmd.log
 #
 # COMPUTE NODES
-NodeName=compute01 NodeAddr=10.0.4.2 CPUs=1 State=UNKNOWN
-NodeName=compute02 NodeAddr=10.0.4.3 CPUs=1 State=UNKNOWN
+NodeName=compute01 NodeAddr=10.0.4.101 CPUs=1 State=UNKNOWN
+NodeName=compute02 NodeAddr=10.0.4.102 CPUs=1 State=UNKNOWN
 PartitionName=super Nodes=compute0[1-2] Default=YES MaxTime=INFINITE State=UP
 EOF
 
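
With the corrected ControlAddr and node addresses, the controller can be checked from master; a hedged sketch:

```
# slurmctld should be up and answering
systemctl status slurmctld
scontrol ping

# Both compute nodes should appear in the 'super' partition and accept work
sinfo
srun -N 2 hostname
```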
