19
19
from parsl .config import Config
20
20
from parsl .data_provider .files import File
21
21
from parsl .executors import HighThroughputExecutor , ThreadPoolExecutor
22
- from parsl .launchers .launchers import SimpleLauncher
22
+ from parsl .launchers .launchers import SimpleLauncher , SrunLauncher
23
23
from parsl .providers import * # noqa: F403
24
24
from parsl .providers .base import ExecutionProvider
25
25
26
26
from psiflow .models import BaseModel
27
- from psiflow .parsl_utils import ContainerizedLauncher , MyWorkQueueExecutor
27
+ from psiflow .parsl_utils import ContainerizedLauncher , ContainerizedSrunLauncher
28
28
from psiflow .reference import BaseReference
29
29
from psiflow .utils import resolve_and_check , set_logger
30
30
@@ -197,6 +197,7 @@ class ExecutionContextLoader:
197
197
def parse_config (yaml_dict : dict ):
198
198
definitions = []
199
199
200
+ container_dict = yaml_dict .pop ("container" , None )
200
201
for name in ["ModelEvaluation" , "ModelTraining" , "ReferenceEvaluation" ]:
201
202
if name in yaml_dict :
202
203
_dict = yaml_dict .pop (name )
@@ -223,25 +224,45 @@ def parse_config(yaml_dict: dict):
223
224
s = _dict ["mpi_command" ]
224
225
_dict ["mpi_command" ] = lambda x , s = s : s .format (x )
225
226
226
- if "container" in yaml_dict :
227
- assert not _dict ["use_threadpool" ] # not possible with container
227
+ # set up containerized launcher if necessary
228
+ if ("container" not in _dict and container_dict is None ) or _dict [
229
+ "use_threadpool"
230
+ ]:
231
+ launcher = SimpleLauncher ()
232
+ _container_dict = None
233
+ else :
234
+ _container_dict = yaml_dict .pop ("container" , container_dict )
235
+ assert _container_dict is not None
228
236
launcher = ContainerizedLauncher (
229
- ** yaml_dict ["container" ], enable_gpu = _dict ["gpu" ]
237
+ ** _container_dict ,
238
+ enable_gpu = _dict ["gpu" ],
230
239
)
231
- else :
232
- launcher = SimpleLauncher ()
233
240
234
241
# initialize provider
235
- provider_dict = None
236
- for key in _dict :
237
- if "Provider" in key :
238
- assert provider_dict is None
239
- provider_dict = _dict [key ]
240
- if provider_dict is not None :
241
- provider_cls = getattr (sys .modules [__name__ ], key )
242
- provider = provider_cls (launcher = launcher , ** _dict .pop (key ))
243
- else :
242
+ provider_keys = list (filter (lambda k : "Provider" in k , _dict .keys ()))
243
+ if len (provider_keys ) == 0 :
244
244
provider = LocalProvider (launcher = launcher ) # noqa: F405
245
+ elif len (provider_keys ) == 1 :
246
+ provider_dict = _dict [provider_keys [0 ]]
247
+
248
+ # if provider requests multiple nodes, switch to (containerized) SrunLauncher
249
+ if (
250
+ provider_dict .pop ("nodes_per_block" , 1 ) > 1
251
+ and "container" in yaml_dict
252
+ ):
253
+ assert (
254
+ provider_keys [0 ] == "SlurmProvider"
255
+ ), "multi-node blocks only supported for SLURM"
256
+ if _container_dict is not None :
257
+ launcher = ContainerizedSrunLauncher (
258
+ ** _container_dict , enable_gpu = _dict ["gpu" ]
259
+ )
260
+ else :
261
+ launcher = SrunLauncher ()
262
+ provider_cls = getattr (sys .modules [__name__ ], provider_keys [0 ])
263
+ provider = provider_cls (launcher = launcher , ** provider_dict )
264
+ else :
265
+ raise ValueError ("Can only have one provider per executor" )
245
266
246
267
# initialize definition
247
268
definition_cls = getattr (sys .modules [__name__ ], name )
@@ -259,7 +280,6 @@ def parse_config(yaml_dict: dict):
259
280
"default_threads" : 1 ,
260
281
"mode" : "htex" ,
261
282
"htex_address" : address_by_hostname (),
262
- "workqueue_use_coprocess" : False , # CP2K doesn't like this
263
283
}
264
284
forced = {
265
285
"initialize_logging" : False , # manual; to move parsl.log one level up
@@ -319,18 +339,16 @@ def load(
319
339
path .iterdir ()
320
340
), "internal directory {} should be empty" .format (path )
321
341
path .mkdir (parents = True , exist_ok = True )
322
- set_logger (psiflow_config .pop ("psiflow_log_level" ))
323
342
parsl .set_file_logger (
324
343
str (path / "parsl.log" ),
325
344
"parsl" ,
326
345
getattr (logging , psiflow_config .pop ("parsl_log_level" )),
327
- # format_string="%(levelname)s - %(name)s - %(message)s",
328
346
)
347
+ set_logger (psiflow_config .pop ("psiflow_log_level" ))
329
348
330
349
# create main parsl executors
331
350
executors = []
332
351
mode = psiflow_config .pop ("mode" )
333
- use_coprocess = psiflow_config .pop ("workqueue_use_coprocess" )
334
352
htex_address = psiflow_config .pop ("htex_address" )
335
353
for definition in definitions :
336
354
if definition .use_threadpool :
@@ -362,61 +380,8 @@ def load(
362
380
provider = definition .parsl_provider ,
363
381
cpu_affinity = definition .cpu_affinity ,
364
382
)
365
- elif mode == "workqueue" :
366
- worker_options = []
367
- if hasattr (definition .parsl_provider , "cores_per_node" ):
368
- worker_options .append (
369
- "--cores={}" .format (definition .parsl_provider .cores_per_node ),
370
- )
371
- else :
372
- worker_options .append (
373
- "--cores={}" .format (psutil .cpu_count (logical = False )),
374
- )
375
- if hasattr (definition .parsl_provider , "walltime" ):
376
- walltime_hhmmss = definition .parsl_provider .walltime .split (":" )
377
- assert len (walltime_hhmmss ) == 3
378
- walltime = 0
379
- walltime += 60 * float (walltime_hhmmss [0 ])
380
- walltime += float (walltime_hhmmss [1 ])
381
- walltime += 1 # whatever seconds are present
382
- walltime -= (
383
- 5 # add 5 minutes of slack, e.g. for container downloading
384
- )
385
- worker_options .append ("--wall-time={}" .format (walltime * 60 ))
386
- worker_options .append ("--parent-death" )
387
- worker_options .append (
388
- "--timeout={}" .format (psiflow_config ["max_idletime" ])
389
- )
390
- # manager_config = TaskVineManagerConfig(
391
- # shared_fs=True,
392
- # max_retries=1,
393
- # autocategory=False,
394
- # enable_peer_transfers=False,
395
- # port=0,
396
- # )
397
- # factory_config = TaskVineFactoryConfig(
398
- # factory_timeout=20,
399
- # worker_options=' '.join(worker_options),
400
- # )
401
- executor = MyWorkQueueExecutor (
402
- label = definition .name (),
403
- working_dir = str (path / definition .name ()),
404
- provider = definition .parsl_provider ,
405
- shared_fs = True ,
406
- autocategory = False ,
407
- port = 0 ,
408
- max_retries = 0 ,
409
- coprocess = use_coprocess ,
410
- worker_options = " " .join (worker_options ),
411
- )
412
383
else :
413
384
raise ValueError ("Unknown mode {}" .format (mode ))
414
- # executor = TaskVineExecutor(
415
- # label=definition.name(),
416
- # provider=definition.parsl_provider,
417
- # manager_config=manager_config,
418
- # factory_config=factory_config,
419
- # )
420
385
executors .append (executor )
421
386
422
387
# create default executors
0 commit comments