From 2a276d56db3cca0eddb9de7a2368972bcefb7337 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 09:35:28 +0200 Subject: [PATCH 01/10] make limit configurable --- bin/sync_slurm_acct.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/sync_slurm_acct.py b/bin/sync_slurm_acct.py index 33a76ea8..af00714e 100644 --- a/bin/sync_slurm_acct.py +++ b/bin/sync_slurm_acct.py @@ -40,8 +40,6 @@ SYNC_TIMESTAMP_FILENAME = "/var/cache/%s.timestamp" % (NAGIOS_HEADER) SYNC_SLURM_ACCT_LOGFILE = "/var/log/%s.log" % (NAGIOS_HEADER) -MAX_USERS_JOB_CANCEL = 10 - class SyncSanityError(Exception): pass @@ -75,6 +73,7 @@ def main(): 'store', [PRODUCTION, PILOT] ), + 'limit_cancel': ('Limit of scancel commands allowed', int, 'store', 10), 'force': ( 'Force the sync instead of bailing if too many scancel commands would be issues', None, @@ -154,7 +153,7 @@ def main(): sacctmgr_commands = [] # safety to avoid emptying the cluster due to some error upstream - if not opts.options.force and len(job_cancel_commands) > MAX_USERS_JOB_CANCEL: + if not opts.options.force and len(job_cancel_commands) > opts.options.limit_cancel: logging.warning("Would add commands to cancel jobs for %d users", len(job_cancel_commands)) logging.debug("Would execute the following cancel commands:") for jc in job_cancel_commands.values(): From e054019f6d5c1d851bd6a6adb61044c0ce405a82 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 10:17:17 +0200 Subject: [PATCH 02/10] add function to get active jobs from clusters --- lib/vsc/administration/slurm/sacctmgr.py | 27 ++++++++++++++++++++++++ test/slurm_sacctmgr.py | 15 ++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 4d207bea..515d1f92 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -44,6 +44,7 @@ class SacctMgrTypes(Enum): users = "users" qos = "qos" resource = "resource" + activejobs = "activejobs" # Fields for Slurm 20.11. @@ -74,6 +75,10 @@ class SacctMgrTypes(Enum): "Name", "Server", "Type", "Count", "PCT__Allocated", "ServerType", ] +SacctActivejobsFields = [ + "JobID", "JobName", "Partition", "Account", "AllocCPUS", "State", "ExitCode", +] + IGNORE_USERS = ["root"] IGNORE_ACCOUNTS = ["root"] IGNORE_QOS = ["normal"] @@ -82,6 +87,7 @@ class SacctMgrTypes(Enum): SlurmUser = namedtuple_with_defaults('SlurmUser', SacctUserFields) SlurmQos = namedtuple_with_defaults('SlurmQos', SacctQosFields) SlurmResource = namedtuple_with_defaults('SlurmResource', SacctResourceFields) +SlurmActivejobs = namedtuple_with_defaults('SlurmActivejobs', SacctActivejobsFields) def mkSlurmAccount(fields): @@ -106,6 +112,12 @@ def mkSlurmQos(fields): return qos +def mkSlurmActivejobs(fields): + """Make a named tuple from the given fields""" + activejobs = mkNamedTupleInstance(fields, SlurmActivejobs) + return activejobs + + def mkSlurmResource(fields): """Make a named tuple from the given fields""" fields['Count'] = int(fields['Count']) @@ -140,6 +152,8 @@ def parse_slurm_sacct_line(header, line, info_type, user_field_number, account_f creator = mkSlurmQos elif info_type == SacctMgrTypes.resource: creator = mkSlurmResource + elif info_type == SacctMgrTypes.activejobs: + creator = mkSlurmActivejobs else: return None @@ -514,3 +528,16 @@ def create_modify_resource_license_command(name, server, stype, clusters, count) ] return command + + +def get_slurm_sacct_active_jobs_for_user(user): + """ + Get running and queued jobs for user. + """ + (exitcode, contents) = asyncloop([SLURM_SACCT_MGR,"-L", "-P", "-s", "r", "-u", user]) + if exitcode != 0: + raise SacctMgrException("Cannot run sacctmgr") + + info = parse_slurm_sacct_dump(contents.splitlines(), SacctActivejobsFields) + return info + diff --git a/test/slurm_sacctmgr.py b/test/slurm_sacctmgr.py index e4b6290f..314cae50 100644 --- a/test/slurm_sacctmgr.py +++ b/test/slurm_sacctmgr.py @@ -21,7 +21,7 @@ from vsc.install.testing import TestCase from vsc.administration.slurm.sacctmgr import ( - parse_slurm_sacct_dump, + parse_slurm_sacct_dump, SlurmActivejobs, SacctMgrTypes, SlurmAccount, SlurmUser, ) @@ -69,3 +69,16 @@ def test_parse_slurmm_sacct_dump(self): ])) + + sacctmgr_active_jobs_output = [ + "JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode", + "14367800|normal|part1|acc1|1|RUNNING|0:0", + "14367800.batch|batch||acc1|1|RUNNING|0:0", + "14367800.extern|extern||acc1|1|RUNNING|0:0", + ] + info = parse_slurm_sacct_dump(sacctmgr_active_jobs_output, SacctMgrTypes.activejobs) + self.assertEqual(set(info), set([ + SlurmActivejobs(JobID='14367800', JobName='normal', Partition='part1', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), + SlurmActivejobs(JobID='14367800.batch', JobName='batch', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), + SlurmActivejobs(JobID='14367800.extern', JobName='extern', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0') + ])) From 238b92fd933363fe71caeccecd4f142e22a806d3 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 10:35:59 +0200 Subject: [PATCH 03/10] don't fail silently --- lib/vsc/administration/slurm/sacctmgr.py | 5 ++++- test/slurm_sacctmgr.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 515d1f92..15e9a0d2 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -38,6 +38,9 @@ class SacctMgrException(Exception): pass +class SacctParseException(Exception): + pass + class SacctMgrTypes(Enum): accounts = "accounts" @@ -155,7 +158,7 @@ def parse_slurm_sacct_line(header, line, info_type, user_field_number, account_f elif info_type == SacctMgrTypes.activejobs: creator = mkSlurmActivejobs else: - return None + raise SacctParseException("info_type %s does not exist.", info_type) return creator(dict(zip(header, fields))) diff --git a/test/slurm_sacctmgr.py b/test/slurm_sacctmgr.py index 314cae50..214abb11 100644 --- a/test/slurm_sacctmgr.py +++ b/test/slurm_sacctmgr.py @@ -23,6 +23,7 @@ from vsc.administration.slurm.sacctmgr import ( parse_slurm_sacct_dump, SlurmActivejobs, SacctMgrTypes, SlurmAccount, SlurmUser, + SacctParseException ) @@ -82,3 +83,7 @@ def test_parse_slurmm_sacct_dump(self): SlurmActivejobs(JobID='14367800.batch', JobName='batch', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), SlurmActivejobs(JobID='14367800.extern', JobName='extern', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0') ])) + + + with self.assertRaises(SacctParseException): + parse_slurm_sacct_dump("sacctmgr_active_jobs_output", "doesnotexist") From 5df7002fb6dbbafbdc87134418b6c06588e3a57d Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 10:48:32 +0200 Subject: [PATCH 04/10] only add scancel jobs if a user actually has jobs --- lib/vsc/administration/slurm/sync.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 01c15689..e7de499c 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -373,7 +373,9 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, ]) for user in remove_users: - job_cancel_commands[user].append(create_remove_user_jobs_command(user=user, cluster=cluster)) + active_jobs = get_slurm_sacct_active_jobs_for_user(user) + if active_jobs: + job_cancel_commands[user].append(create_remove_user_jobs_command(user=user, cluster=cluster)) # Remove users from the clusters (in all accounts) association_remove_commands.extend([ From dacb031430d77825005be828d892cad96d0d3cc6 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 11:49:00 +0200 Subject: [PATCH 05/10] add limits --- bin/sync_slurm_acct.py | 28 +++++++++++++++++++----- lib/vsc/administration/slurm/sacctmgr.py | 11 ++++++++-- lib/vsc/administration/slurm/sync.py | 2 +- 3 files changed, 32 insertions(+), 9 deletions(-) mode change 100644 => 100755 bin/sync_slurm_acct.py diff --git a/bin/sync_slurm_acct.py b/bin/sync_slurm_acct.py old mode 100644 new mode 100755 index af00714e..55f53bc5 --- a/bin/sync_slurm_acct.py +++ b/bin/sync_slurm_acct.py @@ -73,9 +73,16 @@ def main(): 'store', [PRODUCTION, PILOT] ), - 'limit_cancel': ('Limit of scancel commands allowed', int, 'store', 10), - 'force': ( - 'Force the sync instead of bailing if too many scancel commands would be issues', + 'max_scancel': ('Limit of scancel commands allowed', int, 'store', 20), + 'max_user_del': ('Limit of user delete commands allowed', int, 'store', 10), + 'force_scancel': ( + 'Force the sync instead of bailing if too many scancel commands would be issued', + None, + 'store_true', + False + ), + 'force_user_del': ( + 'Force the sync instead of bailing if too many user delete commands would be issued', None, 'store_true', False @@ -153,15 +160,24 @@ def main(): sacctmgr_commands = [] # safety to avoid emptying the cluster due to some error upstream - if not opts.options.force and len(job_cancel_commands) > opts.options.limit_cancel: - logging.warning("Would add commands to cancel jobs for %d users", len(job_cancel_commands)) + if not opts.options.force_scancel and len(job_cancel_commands) > (opts.options.max_scancel / len(clusters)): + logging.warning("Would add %s scancel commands per cluster", len(job_cancel_commands) / len(clusters)) logging.debug("Would execute the following cancel commands:") for jc in job_cancel_commands.values(): logging.debug("%s", jc) - raise SyncSanityError("Would cancel jobs for %d users" % len(job_cancel_commands)) + raise SyncSanityError("Would run %d scancel jobs per cluster" % len(job_cancel_commands) / len(clusters)) sacctmgr_commands += [c for cl in job_cancel_commands.values() for c in cl] + # safety to avoid removing too many users due to some error upstream + max_user_del = opts.options.max_user_del / len(clusters) + if not opts.options.force_user_del and len(association_remove_commands) > max_user_del: + logging.warning("Would add commands to remove %d users", len(association_remove_commands) / len(clusters)) + logging.debug("Would execute the following remove commands:") + for jc in association_remove_commands: + logging.debug("%s", jc) + raise SyncSanityError("Would run %d user delete commands per cluster" % (len(association_remove_commands) / len(clusters))) + # removing users may fail, so should be done last sacctmgr_commands += association_remove_commands diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 15e9a0d2..dc18c8f1 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -16,6 +16,7 @@ sacctmgr commands """ import logging +import re from enum import Enum from vsc.accountpage.wrappers import mkNamedTupleInstance @@ -26,6 +27,7 @@ from vsc.administration.slurm.scancel import create_remove_user_jobs_command SLURM_SACCT_MGR = "/usr/bin/sacctmgr" +SLURM_SACCT = "/usr/bin/sacct" SLURM_ORGANISATIONS = { ANTWERPEN: 'uantwerpen', @@ -38,6 +40,7 @@ class SacctMgrException(Exception): pass + class SacctParseException(Exception): pass @@ -537,9 +540,13 @@ def get_slurm_sacct_active_jobs_for_user(user): """ Get running and queued jobs for user. """ - (exitcode, contents) = asyncloop([SLURM_SACCT_MGR,"-L", "-P", "-s", "r", "-u", user]) + (exitcode, contents) = asyncloop([SLURM_SACCT, "-L", "-P", "-s", "r", "-u", user]) if exitcode != 0: - raise SacctMgrException("Cannot run sacctmgr") + if re.search("sacct: error: Invalid user id: %s" % user, contents): + logging.warning("User %s does not exist, assuming no active jobs.", user) + return None + else: + raise SacctMgrException("Cannot run sacct") info = parse_slurm_sacct_dump(contents.splitlines(), SacctActivejobsFields) return info diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index e7de499c..de5380a7 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -25,7 +25,7 @@ create_add_account_command, create_remove_account_command, create_change_account_fairshare_command, create_add_user_command, create_change_user_command, create_remove_user_command, create_remove_user_account_command, - create_add_qos_command, create_remove_qos_command, create_modify_qos_command + create_add_qos_command, create_remove_qos_command, create_modify_qos_command, get_slurm_sacct_active_jobs_for_user, ) from vsc.administration.slurm.scancel import ( create_remove_user_jobs_command, create_remove_jobs_for_account_command, From 7b0a145f601eff3170ffba11275d548570e75575 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 13:24:17 +0200 Subject: [PATCH 06/10] fix line too long --- bin/sync_slurm_acct.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/sync_slurm_acct.py b/bin/sync_slurm_acct.py index 55f53bc5..d1c244bb 100755 --- a/bin/sync_slurm_acct.py +++ b/bin/sync_slurm_acct.py @@ -176,7 +176,8 @@ def main(): logging.debug("Would execute the following remove commands:") for jc in association_remove_commands: logging.debug("%s", jc) - raise SyncSanityError("Would run %d user delete commands per cluster" % (len(association_remove_commands) / len(clusters))) + raise SyncSanityError("Would run %d user delete commands per cluster" % + (len(association_remove_commands) / len(clusters))) # removing users may fail, so should be done last sacctmgr_commands += association_remove_commands From 7c892574e95c5eeb62ed4d256db93375c642e972 Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 13:37:15 +0200 Subject: [PATCH 07/10] add mock test --- test/slurm_sync.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 78d1a9fc..b0095512 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -31,6 +31,7 @@ slurm_project_qos, ) +from mock import patch VO = namedtuple("VO", ["vsc_id", "institute", "fairshare", "qos"]) VO.__new__.__defaults__ = (None,) * len(VO._fields) @@ -207,7 +208,8 @@ def test_slurm_user_accounts(self): SlurmUser(User='user5', Def_Acct='vo2', Admin='None', Cluster='banette', Account='vo2', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), ] - (job_cancel_commands, commands, remove_user_commands) = slurm_user_accounts(vo_members, active_accounts, slurm_user_info, ["banette"]) + with patch("vsc.administration.slurm.sacctmgr.get_slurm_sacct_active_jobs_for_user", return_value="testing"): + (job_cancel_commands, commands, remove_user_commands) = slurm_user_accounts(vo_members, active_accounts, slurm_user_info, ["banette"]) self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [ shlex.split("/usr/bin/sacctmgr -i add user user6 Account=vo2 Cluster=banette DefaultAccount=vo2"), From ffbdca6f27f4b4c3c46105d68e39fcc6789346bc Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 14:13:53 +0200 Subject: [PATCH 08/10] reshuffle --- lib/vsc/administration/slurm/sacct.py | 110 +++++++++++++++++++++++ lib/vsc/administration/slurm/sacctmgr.py | 37 +------- lib/vsc/administration/slurm/sync.py | 4 +- test/slurm_sacct.py | 45 ++++++++++ test/slurm_sacctmgr.py | 20 +---- test/slurm_sync.py | 2 +- 6 files changed, 163 insertions(+), 55 deletions(-) create mode 100644 lib/vsc/administration/slurm/sacct.py create mode 100644 test/slurm_sacct.py diff --git a/lib/vsc/administration/slurm/sacct.py b/lib/vsc/administration/slurm/sacct.py new file mode 100644 index 00000000..09e5bb4c --- /dev/null +++ b/lib/vsc/administration/slurm/sacct.py @@ -0,0 +1,110 @@ +# +# Copyright 2013-2022 Ghent University +# +# This file is part of vsc-administration, +# originally created by the HPC team of Ghent University (http://ugent.be/hpc/en), +# with support of Ghent University (http://ugent.be/hpc), +# the Flemish Supercomputer Centre (VSC) (https://www.vscentrum.be), +# the Flemish Research Foundation (FWO) (http://www.fwo.be/en) +# and the Department of Economy, Science and Innovation (EWI) (http://www.ewi-vlaanderen.be/en). +# +# https://github.com/hpcugent/vsc-administration +# +# All rights reserved. +# +""" +sacct commands +""" +import logging +import re +from enum import Enum + +from vsc.accountpage.wrappers import mkNamedTupleInstance +from vsc.config.base import ANTWERPEN, BRUSSEL, GENT, LEUVEN +from vsc.utils.missing import namedtuple_with_defaults +from vsc.utils.run import asyncloop + +SLURM_SACCT = "/usr/bin/sacct" + +SLURM_ORGANISATIONS = { + ANTWERPEN: 'uantwerpen', + BRUSSEL: 'vub', + GENT: 'ugent', + LEUVEN: 'kuleuven', +} + + +class SacctParseException(Exception): + pass + +class SacctException(Exception): + pass + + +class SacctTypes(Enum): + jobs = "jobs" + + +# Fields for Slurm 20.11. +# FIXME: at some point this should be versioned + +SacctJobsFields = [ + "JobID", "JobName", "Partition", "Account", "AllocCPUS", "State", "ExitCode", +] + +SlurmJobs = namedtuple_with_defaults('SlurmJobs', SacctJobsFields) + +def mkSlurmJobs(fields): + """Make a named tuple from the given fields""" + activejobs = mkNamedTupleInstance(fields, SlurmJobs) + return activejobs + +def parse_slurm_sacct_line(header, line, info_type): + """Parse the line into the correct data type.""" + fields = line.split("|") + + if info_type == SacctTypes.jobs: + creator = mkSlurmJobs + else: + raise SacctParseException("info_type %s does not exist.", info_type) + + return creator(dict(zip(header, fields))) + + +def parse_slurm_sacct_dump(lines, info_type): + """Parse the sacctmgr dump from the listing.""" + acct_info = set() + + header = [w.replace(' ', '_').replace('%', 'PCT_') for w in lines[0].rstrip().split("|")] + + for line in lines[1:]: + logging.debug("line %s", line) + line = line.rstrip() + try: + info = parse_slurm_sacct_line(header, line, info_type) + except Exception as err: + logging.exception("Slurm sacct parse dump: could not process line %s [%s]", line, err) + raise + # This fails when we get e.g., the users and look at the account lines. + # We should them just skip that line instead of raising an exception + if info: + acct_info.add(info) + + return acct_info + + +def get_slurm_sacct_active_jobs_for_user(user): + """ + Get running and queued jobs for user. + """ + (exitcode, contents) = asyncloop([ + SLURM_SACCT, "--allclusters", "--parsable2", "--state", "RUNNING,PENDING", "--user", user]) + if exitcode != 0: + if re.search("sacct: error: Invalid user id: %s" % user, contents): + logging.warning("User %s does not exist, assuming no active jobs.", user) + return None + else: + raise SacctException("Cannot run sacct") + + info = parse_slurm_sacct_dump(contents.splitlines(), SacctJobsFields) + return info diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index dc18c8f1..3f2cde59 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -16,14 +16,12 @@ sacctmgr commands """ import logging -import re from enum import Enum from vsc.accountpage.wrappers import mkNamedTupleInstance from vsc.config.base import ANTWERPEN, BRUSSEL, GENT, LEUVEN from vsc.utils.missing import namedtuple_with_defaults from vsc.utils.run import asyncloop - from vsc.administration.slurm.scancel import create_remove_user_jobs_command SLURM_SACCT_MGR = "/usr/bin/sacctmgr" @@ -41,7 +39,7 @@ class SacctMgrException(Exception): pass -class SacctParseException(Exception): +class SacctMgrParseException(Exception): pass @@ -50,7 +48,6 @@ class SacctMgrTypes(Enum): users = "users" qos = "qos" resource = "resource" - activejobs = "activejobs" # Fields for Slurm 20.11. @@ -81,10 +78,6 @@ class SacctMgrTypes(Enum): "Name", "Server", "Type", "Count", "PCT__Allocated", "ServerType", ] -SacctActivejobsFields = [ - "JobID", "JobName", "Partition", "Account", "AllocCPUS", "State", "ExitCode", -] - IGNORE_USERS = ["root"] IGNORE_ACCOUNTS = ["root"] IGNORE_QOS = ["normal"] @@ -93,7 +86,6 @@ class SacctMgrTypes(Enum): SlurmUser = namedtuple_with_defaults('SlurmUser', SacctUserFields) SlurmQos = namedtuple_with_defaults('SlurmQos', SacctQosFields) SlurmResource = namedtuple_with_defaults('SlurmResource', SacctResourceFields) -SlurmActivejobs = namedtuple_with_defaults('SlurmActivejobs', SacctActivejobsFields) def mkSlurmAccount(fields): @@ -118,12 +110,6 @@ def mkSlurmQos(fields): return qos -def mkSlurmActivejobs(fields): - """Make a named tuple from the given fields""" - activejobs = mkNamedTupleInstance(fields, SlurmActivejobs) - return activejobs - - def mkSlurmResource(fields): """Make a named tuple from the given fields""" fields['Count'] = int(fields['Count']) @@ -158,10 +144,8 @@ def parse_slurm_sacct_line(header, line, info_type, user_field_number, account_f creator = mkSlurmQos elif info_type == SacctMgrTypes.resource: creator = mkSlurmResource - elif info_type == SacctMgrTypes.activejobs: - creator = mkSlurmActivejobs else: - raise SacctParseException("info_type %s does not exist.", info_type) + raise SacctMgrParseException("info_type %s does not exist.", info_type) return creator(dict(zip(header, fields))) @@ -534,20 +518,3 @@ def create_modify_resource_license_command(name, server, stype, clusters, count) ] return command - - -def get_slurm_sacct_active_jobs_for_user(user): - """ - Get running and queued jobs for user. - """ - (exitcode, contents) = asyncloop([SLURM_SACCT, "-L", "-P", "-s", "r", "-u", user]) - if exitcode != 0: - if re.search("sacct: error: Invalid user id: %s" % user, contents): - logging.warning("User %s does not exist, assuming no active jobs.", user) - return None - else: - raise SacctMgrException("Cannot run sacct") - - info = parse_slurm_sacct_dump(contents.splitlines(), SacctActivejobsFields) - return info - diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index de5380a7..51bb953d 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -25,12 +25,12 @@ create_add_account_command, create_remove_account_command, create_change_account_fairshare_command, create_add_user_command, create_change_user_command, create_remove_user_command, create_remove_user_account_command, - create_add_qos_command, create_remove_qos_command, create_modify_qos_command, get_slurm_sacct_active_jobs_for_user, + create_add_qos_command, create_remove_qos_command, create_modify_qos_command, ) from vsc.administration.slurm.scancel import ( create_remove_user_jobs_command, create_remove_jobs_for_account_command, ) - +from vsc.administration.slurm.sacct import get_slurm_sacct_active_jobs_for_user class SlurmSyncException(Exception): pass diff --git a/test/slurm_sacct.py b/test/slurm_sacct.py new file mode 100644 index 00000000..c7a7159f --- /dev/null +++ b/test/slurm_sacct.py @@ -0,0 +1,45 @@ +# +# Copyright 2015-2022 Ghent University +# +# This file is part of vsc-administration, +# originally created by the HPC team of Ghent University (http://ugent.be/hpc/en), +# with support of Ghent University (http://ugent.be/hpc), +# the Flemish Supercomputer Centre (VSC) (https://www.vscentrum.be), +# the Flemish Research Foundation (FWO) (http://www.fwo.be/en) +# and the Department of Economy, Science and Innovation (EWI) (http://www.ewi-vlaanderen.be/en). +# +# https://github.com/hpcugent/vsc-administration +# +# All rights reserved. +# +""" +Tests for vsc.administration.slurm.* + +@author: Andy Georges (Ghent University) +""" + +from vsc.install.testing import TestCase + +from vsc.administration.slurm.sacct import parse_slurm_sacct_dump, SlurmJobs, SacctTypes, SacctParseException + + +class SlurmSacctmgrTest(TestCase): + def test_parse_slurmm_sacct_dump(self): + """Test that the sacct output is correctly processed.""" + + sacct_active_jobs_output = [ + "JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode", + "14367800|normal|part1|acc1|1|RUNNING|0:0", + "14367800.batch|batch||acc1|1|RUNNING|0:0", + "14367800.extern|extern||acc1|1|RUNNING|0:0", + ] + info = parse_slurm_sacct_dump(sacct_active_jobs_output, SacctTypes.jobs) + self.assertEqual(set(info), set([ + SlurmJobs(JobID='14367800', JobName='normal', Partition='part1', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), + SlurmJobs(JobID='14367800.batch', JobName='batch', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), + SlurmJobs(JobID='14367800.extern', JobName='extern', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0') + ])) + + + with self.assertRaises(SacctParseException): + parse_slurm_sacct_dump("sacct_active_jobs_output", "doesnotexist") diff --git a/test/slurm_sacctmgr.py b/test/slurm_sacctmgr.py index 214abb11..d182ed70 100644 --- a/test/slurm_sacctmgr.py +++ b/test/slurm_sacctmgr.py @@ -21,9 +21,9 @@ from vsc.install.testing import TestCase from vsc.administration.slurm.sacctmgr import ( - parse_slurm_sacct_dump, SlurmActivejobs, + parse_slurm_sacct_dump, SacctMgrTypes, SlurmAccount, SlurmUser, - SacctParseException + SacctMgrParseException ) @@ -71,19 +71,5 @@ def test_parse_slurmm_sacct_dump(self): - sacctmgr_active_jobs_output = [ - "JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode", - "14367800|normal|part1|acc1|1|RUNNING|0:0", - "14367800.batch|batch||acc1|1|RUNNING|0:0", - "14367800.extern|extern||acc1|1|RUNNING|0:0", - ] - info = parse_slurm_sacct_dump(sacctmgr_active_jobs_output, SacctMgrTypes.activejobs) - self.assertEqual(set(info), set([ - SlurmActivejobs(JobID='14367800', JobName='normal', Partition='part1', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), - SlurmActivejobs(JobID='14367800.batch', JobName='batch', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0'), - SlurmActivejobs(JobID='14367800.extern', JobName='extern', Partition='', Account='acc1', AllocCPUS='1', State='RUNNING', ExitCode='0:0') - ])) - - - with self.assertRaises(SacctParseException): + with self.assertRaises(SacctMgrParseException): parse_slurm_sacct_dump("sacctmgr_active_jobs_output", "doesnotexist") diff --git a/test/slurm_sync.py b/test/slurm_sync.py index b0095512..520868e1 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -208,7 +208,7 @@ def test_slurm_user_accounts(self): SlurmUser(User='user5', Def_Acct='vo2', Admin='None', Cluster='banette', Account='vo2', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), ] - with patch("vsc.administration.slurm.sacctmgr.get_slurm_sacct_active_jobs_for_user", return_value="testing"): + with patch("vsc.administration.slurm.sacct.get_slurm_sacct_active_jobs_for_user", return_value="testing"): (job_cancel_commands, commands, remove_user_commands) = slurm_user_accounts(vo_members, active_accounts, slurm_user_info, ["banette"]) self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [ From 8e85fef2f876b511053e0883099d89d4c9b069fe Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 14:20:11 +0200 Subject: [PATCH 09/10] remove unused global --- lib/vsc/administration/slurm/sacctmgr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 3f2cde59..f127e00a 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -25,7 +25,6 @@ from vsc.administration.slurm.scancel import create_remove_user_jobs_command SLURM_SACCT_MGR = "/usr/bin/sacctmgr" -SLURM_SACCT = "/usr/bin/sacct" SLURM_ORGANISATIONS = { ANTWERPEN: 'uantwerpen', From da1e11827fb4b650881728901fea05780a89641b Mon Sep 17 00:00:00 2001 From: Wouter Depypere Date: Thu, 30 Jun 2022 14:23:13 +0200 Subject: [PATCH 10/10] fix context manager --- test/slurm_sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 520868e1..010ca642 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -208,7 +208,7 @@ def test_slurm_user_accounts(self): SlurmUser(User='user5', Def_Acct='vo2', Admin='None', Cluster='banette', Account='vo2', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), ] - with patch("vsc.administration.slurm.sacct.get_slurm_sacct_active_jobs_for_user", return_value="testing"): + with patch("vsc.administration.slurm.sync.get_slurm_sacct_active_jobs_for_user", return_value="testing"): (job_cancel_commands, commands, remove_user_commands) = slurm_user_accounts(vo_members, active_accounts, slurm_user_info, ["banette"]) self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [