@@ -127,151 +127,112 @@ qsub run-R.sh
- ## Example multi-node parallel job using Rmpi and snow
+ ## Example multi-node parallel job using Rmpi and doMPI

- This script uses Rmpi and snow to allow it to run across multiple nodes using MPI.
+ This script uses Rmpi and doMPI to allow it to run across multiple nodes using MPI.
+
+ To try this example, save this job script and the following R script (`doMPI_example.R`) in your home directory.

```
#!/bin/bash -l

- # Example jobscript to run an R MPI parallel job
+ # Batch script to run an MPI parallel R job using the doMPI package
+ # with the upgraded software stack under SGE with OpenMPI.

# Request ten minutes of wallclock time (format hours:minutes:seconds).
#$ -l h_rt=0:10:0

# Request 1 gigabyte of RAM per process.
#$ -l mem=1G

- # Request 15 gigabytes of TMPDIR space per node (default is 10 GB)
- #$ -l tmpfs=15G
+ # Set tmpfs to 1 gigabyte of TMPDIR space (default is 10 GB)
+ # Remove this for clusters without temporary filesystems, e.g. Kathleen
+ #$ -l tmpfs=1G

# Set the name of the job.
- #$ -N snow_monte_carlo
+ #$ -N R-doMPI-example

- # Select the MPI parallel environment with 32 processes
- #$ -pe mpi 32
+ # Select the MPI parallel environment with 12 processes; the maximum possible
+ # on Myriad is 36. On Kathleen, request at least 41 processes.
+ #$ -pe mpi 12

- # Set the working directory to somewhere in your scratch space. This is
- # necessary because the compute nodes cannot write to your $HOME
- # NOTE: this directory must exist.
- # Replace "<your_UCL_id>" with your UCL user ID
- #$ -wd /home/<your_UCL_id>/Scratch/R_output
+ # Set the working directory to the directory the job is submitted from;
+ # in this case, the home directory.
+ #$ -cwd

- # Load the R module
module -f unload compilers mpi gcc-libs
- module load r/recommended
+ module load r/r-4.4.2_bc-3.20

- # Copy example files in to the working directory (not necessary if already there)
- cp ~/R/Examples/snow_example.R .
- cp ~/R/Examples/monte_carlo.R .

- # Run our MPI job. GERun is our wrapper for mpirun, which launches MPI jobs
- gerun RMPISNOW < snow_example.R > snow.out.${JOB_ID}
+ # Run our MPI job. GERun is a wrapper that launches MPI jobs on UCL clusters.
+ gerun Rscript doMPI_example.R
```

- The output file is saved in `$HOME/Scratch/R_examples/snow/snow.out.${JOB_ID}`.
+ The output is saved in `~/R-doMPI-example.o${JOB_ID}`.

- If your jobscript is called `run-R-snow.sh` then your job submission command would be:
+ If your jobscript is called `run-R-doMPI.sh` then your job submission command would be:
```
- qsub run-R-snow.sh
+ qsub run-R-doMPI.sh
```
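+
+ Once the job is submitted, you can watch its progress. This is a general illustration rather than part of the example: on SGE-based clusters such as these, running `qstat` with no arguments lists your own queued and running jobs.
+
+ ```
+ # Shows job-ID, name, state (qw = waiting, r = running) and slots for your jobs.
+ qstat
+ ```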

- ### Example R script using Rmpi and snow
+ ### Example R script using Rmpi and doMPI

- This R script has been written to use Rmpi and snow and can be used with the above jobscript. It is `snow_example.R` above.
+ This R script has been written to use Rmpi and doMPI and can be used with the above jobscript. It is `doMPI_example.R` above.

```
- #Load the snow and random number package.
- library(snow)
- library(Rmpi)
-
- # This example uses the already installed LEcuyers RNG library(rlecuyer)
- library(rlecuyer)
-
- # Set up our input/output
- source('./monte_carlo.R')
- sink('./monte_carlo_output.txt')
-
- # Get a reference to our snow cluster that has been set up by the RMPISNOW
- # script.
- cl <- getMPIcluster()
-
- # Display info about each process in the cluster
- print(clusterCall(cl, function() Sys.info()))
-
- # Load the random number package on each R process
- clusterEvalQ(cl, library(rlecuyer))
-
- # Generate a seed for the pseudorandom number generator, unique to each
- # processor in the cluster.
-
- #Uncomment below line for default (unchanging) random number seed.
- #clusterSetupRNG(cl, type = 'RNGstream')
-
- #The lines below set up a time-based random number seed. Note that
- #this only demonstrates the virtues of changing the seed; no guarantee
- #is made that this seed is at all useful. Comment out if you uncomment
- #the above line.
- s <- sum(strtoi(charToRaw(date()), base = 32))
- clusterSetupRNGstream(cl, seed=rep(s,6))
-
- #Choose which of the following blocks best fit your own needs.
-
- # BLOCK 1
- # Set up the input to our Monte Carlo function.
- # Input is identical across the batch, only RNG seed has changed.
- # For this example, both clusters will roll one die.
-
- nrolls <- 2
- print("Roll the dice once...")
- output <- clusterCall(cl, monte_carlo, nrolls)
- output
- print("Roll the dice again...")
- output <- clusterCall(cl, monte_carlo, nrolls)
- output
-
- # Output should show the results of two rolls of a six-sided die.
-
- #BLOCK 2
- # Input is different for each processor
- print("Second example: coin flip plus 3 dice")
- input <- array(1:2)      # Set up array of inputs, with each entry
- input[1] <- 1            # corresponding to one processor.
- input[2] <- 3
- parameters <- array(1:2) # Set up inputs that will be used by each cluster.
- parameters[1] <- 2       # These will be passed to monte_carlo as its
- parameters[2] <- 6       # second argument.
- output <- clusterApply(cl, input, monte_carlo, parameters)
-
- # Output should show the results of a coin flip and the roll of three
- # six-sided die.
-
- # Output the output.
- output
-
- inputStrings <- array(1:2)
- inputStrings[1] <- 'abc'
- inputStrings[2] <- 'def'
- output <- clusterApply(cl, inputStrings, paste, 'foo')
- output
-
- #clusterEvalQ(cl, sinkWorkerOutput("snow_monte_carlo.out"))
-
- # Clean up the cluster and release the relevant resources.
- stopCluster(cl)
- sink()
-
- mpi.quit()
- ```
+ # This example uses one of the historic datasets from the HistData package
+ # and is from Princeton University
- This is `monte_carlo.R` which is called by `snow_example.R`:
- ```
- monte_carlo <- function(x, numsides=6){
-   streamname <- .lec.GetStreams()
-   dice <- .lec.uniform.int(streamname[1], n = 1, a=1, b=numsides)
-   outp <- sum(dice)
-   return(outp)
+ # Load the Rmpi, doMPI and HistData packages - already installed on UCL clusters.
+
+ library(Rmpi)
+ library(doMPI)
+ library(HistData)
+
+ # This is Galton's data mapping the height of sons to their fathers,
+ # ~900 rows of 2 columns.
+ data(Galton)
+
+ # Set up the cluster
+ cl <- startMPIcluster()
+ registerDoMPI(cl)
+
+ # Split the Galton data frame (mapped to df) into
+ # chunks of at most 100 rows each.
+ df <- Galton
+ n <- 100
+ nr <- nrow(df)
+
+ # Use rep to generate the chunk labels without having to list each break point manually.
+ split_df <- split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))
+
+ # We might want to know the ratio of the parent's height to the child's.
+
+ # foreach takes parameters and passes them to the MPI worker processes
+ # using the dimension of the parameter with the longest length (only one here).
+ # .combine= specifies a function that will be used to combine the results, e.g.
+ # cbind, rbind, c, etc.
+
+ df$results <- foreach(i=1:length(split_df), .combine='rbind') %dopar% {
+
+   # Take each chunk of the data frame, compute the parent/child height ratio,
+   # and return it as a data frame, just as we would with a non-parallelized
+   # operation.
+   result <- split_df[[i]]$parent/split_df[[i]]$child
+   as.data.frame(result)
+
+   # Each chunk's result gets rbind-ed together as a column on our df.
}
+
+ # Take a look at the df we got back, which we could continue working on if we wanted to.
+ head(df)
+
+ # Close the cluster to properly free up the MPI resources so GE can see
+ # that the job has finished.
+ closeCluster(cl)
+ Rmpi::mpi.quit()
```
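+
+ If you want to check the `foreach` logic before submitting an MPI job, the sketch below runs the same computation serially. This is our illustration rather than part of the UCL example: it assumes the `foreach` and `HistData` packages are installed locally, and uses `registerDoSEQ()` from the foreach package so that `%dopar%` runs sequentially without any cluster.
+
+ ```
+ library(foreach)
+ library(HistData)
+
+ data(Galton)
+
+ # Register the sequential backend: %dopar% now runs in this R session.
+ registerDoSEQ()
+
+ # Same chunking as the MPI example: pieces of at most 100 rows.
+ nr <- nrow(Galton)
+ split_df <- split(Galton, rep(1:ceiling(nr/100), each=100, length.out=nr))
+
+ # Same parent/child ratio computation, chunks recombined with rbind.
+ ratios <- foreach(i=1:length(split_df), .combine='rbind') %dopar% {
+   result <- split_df[[i]]$parent/split_df[[i]]$child
+   as.data.frame(result)
+ }
+
+ head(ratios)
+ ```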
- This example was based on [SHARCNET's Using R and MPI](https://web.archive.org/web/20190107091729/https://www.sharcnet.ca/help/index.php/Using_R_and_MPI).
+ This example was based on Princeton's [Using R on the Research Computing Clusters](https://github.com/PrincetonUniversity/HPC_R_Workshop/blob/6ddac56324021277f163789f7f501fa82d92deca/04_doMPI/04_doMPI.R) repository.

## Using your own R packages
@@ -324,4 +285,3 @@ If you want to keep some libraries separate, you can have multiple colon-separated
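+
+ As an illustrative sketch (the directory names here are hypothetical), pointing R at two separate library trees via the standard `R_LIBS` environment variable might look like:
+
+ ```
+ # Hypothetical paths - replace with your own library directories.
+ export R_LIBS=$HOME/my-R-libs:$HOME/project-R-libs
+ ```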

If you are installing extra packages for BioConductor, check that you are using the same version that the R module you have loaded is using.
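
As a quick check (a minimal sketch, assuming the BiocManager package is installed), you can ask R which BioConductor version it is using:

```
# Prints the BioConductor version associated with this R installation.
BiocManager::version()
```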

E.g. you can find the [BioConductor 3.15 package downloads here](http://www.bioconductor.org/packages/3.15/BiocViews.html#___Software).
-