24
24
25
25
import xml .etree .ElementTree as ET
26
26
27
+ from enum import Enum
28
+
27
29
from jinja2 import Template
28
30
29
31
from tornado import gen
@@ -55,6 +57,11 @@ def format_template(template, *args, **kwargs):
55
57
return Template (template ).render (* args , ** kwargs )
56
58
return template .format (* args , ** kwargs )
57
59
60
class JobStatus(Enum):
    """Coarse classification of a batch job, as returned by ``query_job_status``."""

    # No job id is known, or the job status matched none of the other checks.
    NOTFOUND = 0
    # The job status matched the "running" check.
    RUNNING = 1
    # The job status matched the "pending" check.
    PENDING = 2
    # The job status matched the "unknown" check.
    UNKNOWN = 3
58
65
59
66
class BatchSpawnerBase (Spawner ):
60
67
"""Base class for spawners using resource manager batch job submission mechanisms
@@ -256,30 +263,39 @@ async def submit_batch_script(self):
256
263
self .job_id = ''
257
264
return self .job_id
258
265
259
# Override if your batch system needs something more elaborate to query the job status
batch_query_cmd = Unicode(
    '',
    help=(
        "Command to run to query job status. Formatted using req_xyz traits as {xyz} "
        "and self.job_id as {job_id}."
    ),
).tag(config=True)
264
271
265
async def query_job_status(self):
    """Check job status, return JobStatus object.

    Runs ``batch_query_cmd`` (prefixed by ``exec_prefix``) for ``self.job_id``
    and stores the command's output in ``self.job_status``. The ``state_is*``
    predicates are then consulted, in order, to classify the job.

    Returns:
        JobStatus: ``NOTFOUND`` when there is no job id or no predicate
        matches; otherwise ``RUNNING``, ``PENDING`` or ``UNKNOWN``.
    """
    if not self.job_id:
        # No job id (None or empty): nothing to query.
        self.job_status = ''
        return JobStatus.NOTFOUND

    subvars = self.get_req_subvars()
    subvars['job_id'] = self.job_id
    cmd = ' '.join((format_template(self.exec_prefix, **subvars),
                    format_template(self.batch_query_cmd, **subvars)))
    self.log.debug('Spawner querying job: ' + cmd)
    try:
        self.job_status = await self.run_command(cmd)
    except RuntimeError as e:
        # e.args[0] is stderr from the process; keep it as the status so the
        # state_is* predicates can inspect it. Guard against a RuntimeError
        # raised without arguments, which would otherwise raise IndexError.
        self.job_status = e.args[0] if e.args else ''
    except Exception:
        # Best effort: record the failure (with traceback) and fall through
        # with an empty status so the predicates below decide the outcome.
        self.log.error('Error querying job ' + self.job_id, exc_info=True)
        self.job_status = ''

    if self.state_isrunning():
        return JobStatus.RUNNING
    elif self.state_ispending():
        return JobStatus.PENDING
    elif self.state_isunknown():
        return JobStatus.UNKNOWN
    else:
        return JobStatus.NOTFOUND
283
299
284
300
batch_cancel_cmd = Unicode ('' ,
285
301
help = "Command to stop/cancel a previously submitted job. Formatted like batch_query_cmd."
@@ -326,22 +342,20 @@ def state_isrunning(self):
326
342
"Return boolean indicating if job is running, likely by parsing self.job_status"
327
343
raise NotImplementedError ("Subclass must provide implementation" )
328
344
345
def state_isunknown(self):
    """Return boolean indicating if job state retrieval failed because of the resource manager.

    This base implementation never reports an unknown state and always
    returns False.
    """
    return False
348
+
329
349
def state_gethost(self):
    """Return string, hostname or addr of running job, likely by parsing self.job_status.

    Raises:
        NotImplementedError: always, in this implementation.
    """
    message = "Subclass must provide implementation"
    raise NotImplementedError(message)
332
352
333
353
async def poll(self):
    """Poll the batch job.

    Returns None while the job status is PENDING, RUNNING or UNKNOWN.
    For any other status the spawner state is cleared and 1 is returned.
    """
    status = await self.query_job_status()
    alive_states = (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN)
    if status not in alive_states:
        self.clear_state()
        return 1
    return None
347
361
@@ -366,18 +380,20 @@ async def start(self):
366
380
if len (self .job_id ) == 0 :
367
381
raise RuntimeError ("Jupyter batch job submission failure (no jobid in output)" )
368
382
while True :
369
- await self .poll ()
370
- if self . state_isrunning () :
383
+ status = await self .query_job_status ()
384
+ if status == JobStatus . RUNNING :
371
385
break
386
+ elif status == JobStatus .PENDING :
387
+ self .log .debug ('Job ' + self .job_id + ' still pending' )
388
+ elif status == JobStatus .UNKNOWN :
389
+ self .log .debug ('Job ' + self .job_id + ' still unknown' )
372
390
else :
373
- if self .state_ispending ():
374
- self .log .debug ('Job ' + self .job_id + ' still pending' )
375
- else :
376
- self .log .warning ('Job ' + self .job_id + ' neither pending nor running.\n ' +
377
- self .job_status )
378
- raise RuntimeError ('The Jupyter batch job has disappeared'
379
- ' while pending in the queue or died immediately'
380
- ' after starting.' )
391
+ self .log .warning ('Job ' + self .job_id + ' neither pending nor running.\n ' +
392
+ self .job_status )
393
+ self .clear_state ()
394
+ raise RuntimeError ('The Jupyter batch job has disappeared'
395
+ ' while pending in the queue or died immediately'
396
+ ' after starting.' )
381
397
await gen .sleep (self .startup_poll_interval )
382
398
383
399
self .ip = self .state_gethost ()
@@ -410,8 +426,8 @@ async def stop(self, now=False):
410
426
if now :
411
427
return
412
428
for i in range (10 ):
413
- await self .poll ()
414
- if not self . state_isrunning ( ):
429
+ status = await self .query_job_status ()
430
+ if status not in ( JobStatus . RUNNING , JobStatus . UNKNOWN ):
415
431
return
416
432
await gen .sleep (1.0 )
417
433
if self .job_id :
@@ -467,20 +483,22 @@ class BatchSpawnerRegexStates(BatchSpawnerBase):
467
483
If this variable is set, the match object will be expanded using this string
468
484
to obtain the notebook IP.
469
485
See Python docs: re.match.expand""" ).tag (config = True )
486
# Optional regex; a blank value disables the "unknown" check.
# The trailing space in the first literal keeps the two sentences separated
# once the adjacent string literals are concatenated.
state_unknown_re = Unicode(
    '',
    help="Regex that matches job_status if the resource manager is not answering. "
         "Blank indicates not used.",
).tag(config=True)
470
489
471
490
def state_ispending(self):
    """Return True if ``self.job_status`` matches ``state_pending_re``, else False.

    Raises:
        AssertionError: if ``state_pending_re`` is blank.
    """
    assert self.state_pending_re, "Misconfigured: define state_pending_re"
    # bool() so callers get a real boolean rather than a Match object, None or ''.
    return bool(self.job_status and re.search(self.state_pending_re, self.job_status))
477
493
478
494
def state_isrunning(self):
    """Return True if ``self.job_status`` matches ``state_running_re``, else False.

    Raises:
        AssertionError: if ``state_running_re`` is blank.
    """
    assert self.state_running_re, "Misconfigured: define state_running_re"
    # bool() so callers get a real boolean rather than a Match object, None or ''.
    return bool(self.job_status and re.search(self.state_running_re, self.job_status))
497
+
498
def state_isunknown(self):
    """Return True if ``self.job_status`` matches ``state_unknown_re``, else False.

    A blank ``state_unknown_re`` means the check is not used, so the result
    is always False in that case.
    """
    if not self.state_unknown_re:
        return False
    # bool() so callers get a real boolean rather than a Match object, None or ''.
    return bool(self.job_status and re.search(self.state_unknown_re, self.job_status))
484
502
485
503
def state_gethost (self ):
486
504
assert self .state_exechost_re , "Misconfigured: define state_exechost_re"
@@ -645,6 +663,7 @@ class SlurmSpawner(UserEnvMixin,BatchSpawnerRegexStates):
645
663
# RUNNING, COMPLETING = running
646
664
state_pending_re = Unicode (r'^(?:PENDING|CONFIGURING)' ).tag (config = True )
647
665
state_running_re = Unicode (r'^(?:RUNNING|COMPLETING)' ).tag (config = True )
666
+ state_unknown_re = Unicode (r'^slurm_load_jobs error: (?:Socket timed out on send/recv|Unable to contact slurm controller)' ).tag (config = True )
648
667
state_exechost_re = Unicode (r'\s+((?:[\w_-]+\.?)+)$' ).tag (config = True )
649
668
650
669
def parse_job_id (self , output ):
0 commit comments