@@ -100,6 +100,7 @@ class MultiProcPlugin(DistributedPluginBase):
 
     - non_daemon: boolean flag to execute as non-daemon processes
     - n_procs: maximum number of threads to be executed in parallel
+    - n_gpu_procs: maximum number of GPU threads to be executed in parallel
     - memory_gb: maximum memory (in GB) that can be used at once.
     - raise_insufficient: raise error if the requested resources for
       a node over the maximum `n_procs` and/or `memory_gb`
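
For reference, the new `n_gpu_procs` option is read from the same `plugin_args` dictionary as the options listed above. A minimal usage sketch (the workflow object `wf` and the chosen values are assumptions for illustration, not part of this change):

    # Cap MultiProc at 8 CPU processes, 16 GB of RAM, and 2 concurrently
    # running GPU-bound nodes.
    wf.run(
        plugin="MultiProc",
        plugin_args={"n_procs": 8, "memory_gb": 16, "n_gpu_procs": 2},
    )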
@@ -130,10 +131,22 @@ def __init__(self, plugin_args=None):
         )
         self.raise_insufficient = self.plugin_args.get("raise_insufficient", True)
 
+        # GPUs found on the system
+        self.n_gpus_visible = MultiProcPlugin.gpu_count()
+        # Number of GPU processes requested by the user
+        self.n_gpu_procs = self.plugin_args.get("n_gpu_procs", self.n_gpus_visible)
+
+        # Total number of processes allowed across all GPUs
+        if self.n_gpu_procs > self.n_gpus_visible:
+            logger.info(
+                "Total number of GPU procs requested (%d) exceeds the available number of GPUs (%d) on the system. Using requested GPU slots at your own risk!",
+                self.n_gpu_procs, self.n_gpus_visible)
+
         # Instantiate different thread pools for non-daemon processes
         logger.debug(
-            "[MultiProc] Starting (n_procs=%d, mem_gb=%0.2f, cwd=%s)",
+            "[MultiProc] Starting (n_procs=%d, n_gpu_procs=%d, mem_gb=%0.2f, cwd=%s)",
             self.processors,
+            self.n_gpu_procs,
             self.memory_gb,
             self._cwd,
         )
@@ -184,9 +197,12 @@ def _prerun_check(self, graph):
         """Check if any node exceeds the available resources"""
         tasks_mem_gb = []
         tasks_num_th = []
+        tasks_gpu_th = []
         for node in graph.nodes():
             tasks_mem_gb.append(node.mem_gb)
             tasks_num_th.append(node.n_procs)
+            if node.is_gpu_node():
+                tasks_gpu_th.append(node.n_procs)
 
         if np.any(np.array(tasks_mem_gb) > self.memory_gb):
             logger.warning(
@@ -203,6 +219,12 @@ def _prerun_check(self, graph):
             )
             if self.raise_insufficient:
                 raise RuntimeError("Insufficient resources available for job")
+        if np.any(np.array(tasks_gpu_th) > self.n_gpu_procs):
+            logger.warning(
+                "Nodes demand more GPU procs than allowed (%d).",
+                self.n_gpu_procs)
+            if self.raise_insufficient:
+                raise RuntimeError("Insufficient GPU resources available for job")
 
     def _postrun_check(self):
         self.pool.shutdown()
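
Note that `_prerun_check` and `_check_resources` assume each node exposes an `is_gpu_node()` predicate, which is not defined in this diff. A minimal sketch of how such a method could look on the Node class (the `use_cuda`/`use_gpu` input names are assumptions for illustration):

    def is_gpu_node(self):
        # Treat a node as GPU-bound if its interface exposes a truthy
        # use_cuda or use_gpu input.
        return bool(getattr(self.inputs, "use_cuda", False)) or bool(
            getattr(self.inputs, "use_gpu", False)
        )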
@@ -213,11 +235,14 @@ def _check_resources(self, running_tasks):
         """
         free_memory_gb = self.memory_gb
         free_processors = self.processors
+        free_gpu_slots = self.n_gpu_procs
         for _, jobid in running_tasks:
             free_memory_gb -= min(self.procs[jobid].mem_gb, free_memory_gb)
             free_processors -= min(self.procs[jobid].n_procs, free_processors)
+            if self.procs[jobid].is_gpu_node():
+                free_gpu_slots -= min(self.procs[jobid].n_procs, free_gpu_slots)
 
-        return free_memory_gb, free_processors
+        return free_memory_gb, free_processors, free_gpu_slots
 
     def _send_procs_to_workers(self, updatehash=False, graph=None):
         """
@@ -232,7 +257,7 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
         )
 
         # Check available resources by summing all threads and memory used
-        free_memory_gb, free_processors = self._check_resources(self.pending_tasks)
+        free_memory_gb, free_processors, free_gpu_slots = self._check_resources(self.pending_tasks)
 
         stats = (
             len(self.pending_tasks),
@@ -241,6 +266,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
             self.memory_gb,
             free_processors,
             self.processors,
+            free_gpu_slots,
+            self.n_gpu_procs,
         )
         if self._stats != stats:
             tasks_list_msg = ""
@@ -256,13 +283,15 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
                     tasks_list_msg = indent(tasks_list_msg, " " * 21)
             logger.info(
                 "[MultiProc] Running %d tasks, and %d jobs ready. Free "
-                "memory (GB): %0.2f/%0.2f, Free processors: %d/%d.%s",
+                "memory (GB): %0.2f/%0.2f, Free processors: %d/%d, Free GPU slots: %d/%d.%s",
                 len(self.pending_tasks),
                 len(jobids),
                 free_memory_gb,
                 self.memory_gb,
                 free_processors,
                 self.processors,
+                free_gpu_slots,
+                self.n_gpu_procs,
                 tasks_list_msg,
             )
             self._stats = stats
@@ -304,28 +333,36 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
             # Check requirements of this job
             next_job_gb = min(self.procs[jobid].mem_gb, self.memory_gb)
             next_job_th = min(self.procs[jobid].n_procs, self.processors)
+            next_job_gpu_th = min(self.procs[jobid].n_procs, self.n_gpu_procs)
+
+            is_gpu_node = self.procs[jobid].is_gpu_node()
 
             # If node does not fit, skip at this moment
-            if next_job_th > free_processors or next_job_gb > free_memory_gb:
+            if (next_job_th > free_processors or next_job_gb > free_memory_gb
+                    or (is_gpu_node and next_job_gpu_th > free_gpu_slots)):
                 logger.debug(
-                    "Cannot allocate job %d (%0.2fGB, %d threads).",
+                    "Cannot allocate job %d (%0.2fGB, %d threads, %d GPU slots).",
                     jobid,
                     next_job_gb,
                     next_job_th,
+                    next_job_gpu_th,
                 )
                 continue
 
             free_memory_gb -= next_job_gb
             free_processors -= next_job_th
+            if is_gpu_node:
+                free_gpu_slots -= next_job_gpu_th
             logger.debug(
                 "Allocating %s ID=%d (%0.2fGB, %d threads). Free: "
-                "%0.2fGB, %d threads.",
+                "%0.2fGB, %d threads, %d GPU slots.",
                 self.procs[jobid].fullname,
                 jobid,
                 next_job_gb,
                 next_job_th,
                 free_memory_gb,
                 free_processors,
+                free_gpu_slots,
             )
 
             # change job status in appropriate queues
@@ -352,6 +389,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
                 self._remove_node_dirs()
                 free_memory_gb += next_job_gb
                 free_processors += next_job_th
+                if is_gpu_node:
+                    free_gpu_slots += next_job_gpu_th
                 # Display stats next loop
                 self._stats = None
 
@@ -379,3 +418,12 @@ def _sort_jobs(self, jobids, scheduler="tsort"):
                 key=lambda item: (self.procs[item].mem_gb, self.procs[item].n_procs),
             )
         return jobids
+
+    @staticmethod
+    def gpu_count():
+        n_gpus = 1
+        try:
+            import GPUtil
+            return len(GPUtil.getGPUs())
+        except ImportError:
+            return n_gpus
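
GPUtil is used here as an optional dependency: when it cannot be imported, `gpu_count()` falls back to assuming a single GPU. A quick way to check what the plugin will detect (import path as in nipype, shown for illustration):

    from nipype.pipeline.plugins.multiproc import MultiProcPlugin

    # Number of GPUs reported by GPUtil, or 1 if GPUtil is not installed.
    print(MultiProcPlugin.gpu_count())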