
Testing PySlurm with Docker


Create Dockerfile

Create a Dockerfile based on a CentOS 7 image:

FROM centos:7
MAINTAINER "Giovanni Torres" <[email protected]>

RUN groupadd -r slurm && useradd -r -g slurm slurm

RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum -y install wget bzip2 perl gcc vim-enhanced git make munge munge-devel \
    supervisor python-devel python-pip
RUN pip install Cython nose

ENV SLURM_VERSION 16.05.6
ENV SLURM_DOWNLOAD_MD5 0c7911e52443e9f5ad1fc381085ec183
ENV SLURM_DOWNLOAD_URL http://www.schedmd.com/download/latest/slurm-"$SLURM_VERSION".tar.bz2

RUN set -x \
    && wget -O slurm.tar.bz2 "$SLURM_DOWNLOAD_URL" \
    && echo "$SLURM_DOWNLOAD_MD5" slurm.tar.bz2 | md5sum -c - \
    && mkdir /usr/local/src/slurm \
    && tar jxf slurm.tar.bz2 -C /usr/local/src/slurm --strip-components=1 \
    && rm slurm.tar.bz2 \
    && cd /usr/local/src/slurm \
    && ./configure --enable-debug --enable-front-end --prefix=/usr --sysconfdir=/etc/slurm \
    && make install \
    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m644 etc/slurm.epilog.clean /etc/slurm/slurm.epilog.clean \
    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && cd \
    && rm -rf /usr/local/src/slurm \
    && mkdir /etc/sysconfig/slurm \
    && mkdir /var/spool/slurmd \
    && chown slurm:slurm /var/spool/slurmd \
    && mkdir /var/run/slurmd \
    && chown slurm:slurm /var/run/slurmd \
    && mkdir /var/lib/slurmd \
    && chown slurm:slurm /var/lib/slurmd \
    && /sbin/create-munge-key

COPY slurm.conf /etc/slurm/slurm.conf

COPY supervisord.conf /etc/
ENTRYPOINT /usr/bin/supervisord -c /etc/supervisord.conf && /bin/bash

Create a slurm.conf

Create a slurm.conf that defines a number of compute nodes which all run on the same host for testing; the commands shown after the listing can be used to verify the result once the container is running:

# slurm.conf
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=ernie
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=c[1-10] NodeHostName=localhost NodeAddr=127.0.0.1 RealMemory=1000
#
# PARTITIONS
PartitionName=normal Default=yes Nodes=c[1-5] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
PartitionName=debug Nodes=c[6-10] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
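
Once the container is up (see "Run the container" below), the node and partition definitions above can be sanity-checked from inside it with the standard Slurm client tools:

scontrol show partition
scontrol show node c1
sinfo -N -l

sinfo -N -l lists every node together with the partition it belongs to, which makes it easy to spot nodes that did not register.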

Create a supervisord configuration file

This setup uses supervisord as the process manager inside the Docker container. It is responsible for starting munged, slurmctld, and slurmd (the commands after the listing show how to confirm they came up):

[unix_http_server]
file=/var/run/supervisor/supervisor.sock

[supervisord]
logfile=/var/log/supervisor/supervisord.log
logfile_maxbytes=5MB
logfile_backups=10
loglevel=info
pidfile=/var/run/supervisord.pid
nodaemon=false

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor/supervisor.sock

[program:munged]
user=munge
command=/usr/sbin/munged -F
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/munged.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/munged.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5

[program:slurmctld]
user=root
command=/usr/sbin/slurmctld -D -vvvvv
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmctld.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/slurmctld.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5

[program:slurmd]
user=root
command=/usr/sbin/slurmd -D -vvvvv
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmd.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/slurmd.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5
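
After the container starts, it is worth confirming that supervisord actually brought up all three daemons; the commands below use the config and log paths defined above:

supervisorctl -c /etc/supervisord.conf status
tail -n 20 /var/log/supervisor/slurmctld.log
tail -n 20 /var/log/supervisor/slurmd.log

If any program shows FATAL, its log file usually explains why.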

Build the Slurm container

Before building the container, edit the following ENV values in the Dockerfile to match the Slurm version you want to build:

ENV SLURM_VERSION 16.05.6
ENV SLURM_DOWNLOAD_MD5 0c7911e52443e9f5ad1fc381085ec183

You can get the MD5 hashes from http://www.schedmd.com/#repos
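
You can also compute the checksum locally before editing the Dockerfile. A minimal sketch, reusing the same version and download URL pattern as the Dockerfile:

SLURM_VERSION=16.05.6
wget -O slurm-$SLURM_VERSION.tar.bz2 http://www.schedmd.com/download/latest/slurm-$SLURM_VERSION.tar.bz2
md5sum slurm-$SLURM_VERSION.tar.bz2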

Put all three files above in a directory. From that directory, run the following to build the container:

docker build -t slurm-16.05.6-1 .

Run the container

Notice that in slurm.conf the ControlMachine is set to ernie. Run the container with the -h flag so the container's hostname matches; otherwise slurmctld will fail to start because of the hostname mismatch:

docker run -it -h ernie slurm-16.05.6-1
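
If you prefer to keep the container running in the background and open shells into it as needed, something along these lines should also work (the container name slurm is arbitrary):

docker run -d -it -h ernie --name slurm slurm-16.05.6-1
docker exec -it slurm /bin/bash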

This should take you right to a bash shell inside the container:

[root@ernie /]# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
normal*      up 5-00:00:00      5   idle c[1-5]
[root@ernie /]# scontrol show node c1
NodeName=c1 CoresPerSocket=1
   CPUAlloc=0 CPUErr=0 CPUTot=1 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=127.0.0.1 NodeHostName=localhost Version=(null)
   RealMemory=1000 AllocMem=0 FreeMem=N/A Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   BootTime=2016-10-23T22:22:09 SlurmdStartTime=2016-11-05T16:38:10
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

Build PySlurm

Slurm is already installed in the container and its libraries live under the default system paths (/usr/lib/slurm), so PySlurm will find them without the library and include paths having to be specified explicitly.
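
If Slurm were installed under a non-default prefix instead, the setup.py build step below would have to be pointed at the libraries and headers explicitly. A hedged sketch, assuming the --slurm-lib/--slurm-inc options described in the PySlurm README of this era and purely hypothetical paths (neither is needed for this container):

python setup.py build --slurm-lib=/opt/slurm/lib --slurm-inc=/opt/slurm/include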

Clone the repo:

git clone https://github.com/PySlurm/pyslurm.git

Build and install PySlurm:

cd pyslurm
python setup.py build
python setup.py install
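
As a quick smoke test that the bindings were built against the installed Slurm, the one-liner below should print the names of the nodes Slurm knows about; it assumes the pyslurm.node().get() interface of this PySlurm release:

python -c "import pyslurm; print(pyslurm.node().get().keys())"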

Run a test job:

sbatch --wrap="sleep 1000" --partition=normal -N 1
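
The queued job can then be inspected both with the regular Slurm tools and through the bindings; the pyslurm.job().get() call is again an assumption about this release's API:

squeue
python -c "import pyslurm; print(pyslurm.job().get())"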

Change out of the source directory (so the installed module is imported rather than the local source tree) and run the tests:

cd && nosetests -v /pyslurm/tests

At this point, some of the tests will fail because certain OS features the tests expect are not populated inside the container, whereas they would be on bare metal or in a virtual machine.