@@ -21,6 +21,7 @@
 from ...utils.profiler import get_system_total_memory_gb
 from ..engine import MapNode
 from .base import DistributedPluginBase
+from .tools import gpu_count

 try:
     from textwrap import indent
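
A note on the new import: gpu_count() comes from the plugin's .tools module, whose implementation is not shown in this commit section. As a hypothetical sketch only (not the actual nipype helper), such a counter could ask nvidia-smi for the visible devices and fall back to zero when the tool is absent:

    # hypothetical sketch of a GPU-counting helper; the real .tools.gpu_count may differ
    import shutil
    import subprocess

    def gpu_count():
        """Return the number of GPUs visible to this process (0 if none found)."""
        if shutil.which("nvidia-smi") is None:
            return 0
        try:
            out = subprocess.run(
                ["nvidia-smi", "-L"], capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError:
            return 0
        # "nvidia-smi -L" prints one line per GPU, e.g. "GPU 0: NVIDIA ..."
        return sum(1 for line in out.stdout.splitlines() if line.startswith("GPU "))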
@@ -100,6 +101,7 @@ class MultiProcPlugin(DistributedPluginBase):
 
     - non_daemon: boolean flag to execute as non-daemon processes
     - n_procs: maximum number of threads to be executed in parallel
+    - n_gpu_procs: maximum number of GPU threads to be executed in parallel
     - memory_gb: maximum memory (in GB) that can be used at once.
     - raise_insufficient: raise error if the requested resources for
       a node over the maximum `n_procs` and/or `memory_gb`
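
For context, n_gpu_procs is read from plugin_args just like the existing options (see the __init__ hunk below). A hypothetical invocation, assuming an existing nipype Workflow named workflow and illustrative values, would look like:

    # illustrative only: pass n_gpu_procs alongside the existing MultiProc options
    workflow.run(
        plugin="MultiProc",
        plugin_args={
            "n_procs": 8,         # CPU threads available to the scheduler
            "n_gpu_procs": 1,     # GPU slots available to GPU-marked nodes
            "memory_gb": 16,
            "raise_insufficient": True,
        },
    )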
@@ -130,10 +132,24 @@ def __init__(self, plugin_args=None):
         )
         self.raise_insufficient = self.plugin_args.get("raise_insufficient", True)

+        # GPUs found on the system
+        self.n_gpus_visible = gpu_count()
+        # number of GPU procs set by the user (defaults to the visible GPU count)
+        self.n_gpu_procs = self.plugin_args.get("n_gpu_procs", self.n_gpus_visible)
+
+        # warn when more GPU procs are requested than there are visible GPUs
+        if self.n_gpu_procs > self.n_gpus_visible:
+            logger.info(
+                "Total number of GPU procs requested (%d) exceeds the available number of GPUs (%d) on the system. Using requested GPU slots at your own risk!",
+                self.n_gpu_procs,
+                self.n_gpus_visible,
+            )
+
         # Instantiate different thread pools for non-daemon processes
         logger.debug(
-            "[MultiProc] Starting (n_procs=%d, mem_gb=%0.2f, cwd=%s)",
+            "[MultiProc] Starting (n_procs=%d, n_gpu_procs=%d, mem_gb=%0.2f, cwd=%s)",
             self.processors,
+            self.n_gpu_procs,
             self.memory_gb,
             self._cwd,
         )
@@ -184,9 +200,12 @@ def _prerun_check(self, graph):
         """Check if any node exceeds the available resources"""
         tasks_mem_gb = []
         tasks_num_th = []
+        tasks_gpu_th = []
         for node in graph.nodes():
             tasks_mem_gb.append(node.mem_gb)
             tasks_num_th.append(node.n_procs)
+            if node.is_gpu_node():
+                tasks_gpu_th.append(node.n_procs)

         if np.any(np.array(tasks_mem_gb) > self.memory_gb):
             logger.warning(
@@ -203,6 +222,10 @@ def _prerun_check(self, graph):
             )
             if self.raise_insufficient:
                 raise RuntimeError("Insufficient resources available for job")
+        if np.any(np.array(tasks_gpu_th) > self.n_gpu_procs):
+            logger.warning("Nodes demand more GPU slots than allowed (%d).", self.n_gpu_procs)
+            if self.raise_insufficient:
+                raise RuntimeError("Insufficient GPU resources available for job")

     def _postrun_check(self):
         self.pool.shutdown()
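
Both the prerun check above and the scheduling loop further down rely on node.is_gpu_node(), presumably added to the engine's Node class as part of the same change but not shown in this diff. A plausible sketch, assuming GPU use is declared through interface inputs named use_cuda or use_gpu (an assumption; the real attribute names may differ):

    # assumed shape of the node-side helper; the actual engine change may differ
    def is_gpu_node(self):
        """Return True if this node's interface declares GPU usage."""
        return bool(getattr(self.inputs, "use_cuda", False)) or bool(
            getattr(self.inputs, "use_gpu", False)
        )

Under that assumption, any node whose interface enables use_cuda or use_gpu counts its n_procs against the n_gpu_procs budget.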
@@ -213,11 +236,14 @@ def _check_resources(self, running_tasks):
         """
         free_memory_gb = self.memory_gb
         free_processors = self.processors
+        free_gpu_slots = self.n_gpu_procs
         for _, jobid in running_tasks:
             free_memory_gb -= min(self.procs[jobid].mem_gb, free_memory_gb)
             free_processors -= min(self.procs[jobid].n_procs, free_processors)
+            if self.procs[jobid].is_gpu_node():
+                free_gpu_slots -= min(self.procs[jobid].n_procs, free_gpu_slots)

-        return free_memory_gb, free_processors
+        return free_memory_gb, free_processors, free_gpu_slots

     def _send_procs_to_workers(self, updatehash=False, graph=None):
         """
@@ -232,7 +258,9 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
         )

         # Check available resources by summing all threads and memory used
-        free_memory_gb, free_processors = self._check_resources(self.pending_tasks)
+        free_memory_gb, free_processors, free_gpu_slots = self._check_resources(
+            self.pending_tasks
+        )

         stats = (
             len(self.pending_tasks),
@@ -241,6 +269,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
             self.memory_gb,
             free_processors,
             self.processors,
+            free_gpu_slots,
+            self.n_gpu_procs,
         )
         if self._stats != stats:
             tasks_list_msg = ""
@@ -256,13 +286,15 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
                     tasks_list_msg = indent(tasks_list_msg, " " * 21)
             logger.info(
                 "[MultiProc] Running %d tasks, and %d jobs ready. Free "
-                "memory (GB): %0.2f/%0.2f, Free processors: %d/%d.%s",
+                "memory (GB): %0.2f/%0.2f, Free processors: %d/%d, Free GPU slots: %d/%d.%s",
                 len(self.pending_tasks),
                 len(jobids),
                 free_memory_gb,
                 self.memory_gb,
                 free_processors,
                 self.processors,
+                free_gpu_slots,
+                self.n_gpu_procs,
                 tasks_list_msg,
             )
             self._stats = stats
@@ -304,28 +336,39 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
             # Check requirements of this job
             next_job_gb = min(self.procs[jobid].mem_gb, self.memory_gb)
             next_job_th = min(self.procs[jobid].n_procs, self.processors)
+            next_job_gpu_th = min(self.procs[jobid].n_procs, self.n_gpu_procs)
+
+            is_gpu_node = self.procs[jobid].is_gpu_node()

             # If node does not fit, skip at this moment
-            if next_job_th > free_processors or next_job_gb > free_memory_gb:
+            if (
+                next_job_th > free_processors
+                or next_job_gb > free_memory_gb
+                or (is_gpu_node and next_job_gpu_th > free_gpu_slots)
+            ):
                 logger.debug(
-                    "Cannot allocate job %d (%0.2fGB, %d threads).",
+                    "Cannot allocate job %d (%0.2fGB, %d threads, %d GPU slots).",
                     jobid,
                     next_job_gb,
                     next_job_th,
+                    next_job_gpu_th,
                 )
                 continue

             free_memory_gb -= next_job_gb
             free_processors -= next_job_th
+            if is_gpu_node:
+                free_gpu_slots -= next_job_gpu_th
             logger.debug(
                 "Allocating %s ID=%d (%0.2fGB, %d threads). Free: "
-                "%0.2fGB, %d threads.",
+                "%0.2fGB, %d threads, %d GPU slots.",
                 self.procs[jobid].fullname,
                 jobid,
                 next_job_gb,
                 next_job_th,
                 free_memory_gb,
                 free_processors,
+                free_gpu_slots,
             )

             # change job status in appropriate queues
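
To make the slot accounting concrete, here is a small worked example of the allocation test above (all values illustrative):

    # n_gpu_procs = 2; two GPU-marked nodes with n_procs = 1 are already running
    free_gpu_slots = 2 - 1 - 1   # -> 0, as computed by _check_resources
    # a third GPU-marked node with n_procs = 1 then fails the check:
    #   is_gpu_node is True and next_job_gpu_th (1) > free_gpu_slots (0) -> skipped
    # CPU-only nodes are unaffected, because the GPU term is gated on is_gpu_node;
    # slots are returned when a GPU node finishes (see the next hunk)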
@@ -355,6 +398,8 @@ def _send_procs_to_workers(self, updatehash=False, graph=None):
                 self._remove_node_dirs()
                 free_memory_gb += next_job_gb
                 free_processors += next_job_th
+                if is_gpu_node:
+                    free_gpu_slots += next_job_gpu_th

                 # Display stats next loop
                 self._stats = None