From 4ea72d23b2114091b55c9c4614bf6c321caee044 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 17 Apr 2023 11:02:33 +0200 Subject: [PATCH 01/32] fix: fstrings and optional arguments --- lib/vsc/administration/slurm/sacctmgr.py | 66 ++++++++++++------------ lib/vsc/administration/slurm/scancel.py | 6 ++- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 9e4cc6a7..2b598451 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -244,10 +244,10 @@ def create_default_account_command(user, account, cluster): """ command = [ "user", - "Name={0}".format(user), - "Cluster={0}".format(cluster), + f"Name={user}", + f"Cluster={cluster}", "set", - "DefaultAccount={0}".format(account), + f"DefaultAccount={account}", ] logging.debug( "Creating command to set default account to %s for %s on cluster %s", @@ -262,10 +262,10 @@ def create_default_account_command(user, account, cluster): def create_change_account_fairshare_command(account, cluster, fairshare): command = [ "account", - "name={0}".format(account), - "cluster={0}".format(cluster), + f"name={account}", + f"cluster={cluster}", "set", - "fairshare={0}".format(fairshare), + f"fairshare={fairshare}", ] logging.debug( "Adding command to change fairshare for account %s on cluster %s to %d", @@ -292,12 +292,12 @@ def create_add_user_command(user, account, cluster, default_account=None): command = [ "user", user, - "Account={0}".format(account), - "Cluster={0}".format(cluster) + f"Account={account}", + f"Cluster={cluster}", ] if default_account is not None: command.append( - "DefaultAccount={0}".format(account), + f"DefaultAccount={account}", ) logging.debug( "Adding command to add user %s with Account=%s Cluster=%s", @@ -340,16 +340,18 @@ def create_change_user_command(user, current_vo_id, new_vo_id, cluster): @mksacctmgr('remove') -def create_remove_user_command(user, cluster): +def create_remove_user_command(user, cluster=None): """Create the command to remove a user. @returns: list comprising the command """ command = [ "user", - "Name={user}".format(user=user), - "Cluster={cluster}".format(cluster=cluster) + f"Name={user}", ] + + if cluster is not None: + command.append(f"Cluster={cluster}") logging.debug( "Adding command to remove user %s from Cluster=%s", user, @@ -367,8 +369,8 @@ def create_remove_account_command(account, cluster): """ command = [ "account", - "Name={account}".format(account=account), - "Cluster={cluster}".format(cluster=cluster), + f"Name={account}", + f"Cluster={cluster}", ] logging.debug( @@ -388,9 +390,9 @@ def create_remove_user_account_command(user, account, cluster): """ command = [ "user", - "Name={user}".format(user=user), - "Account={account}".format(account=account), - "Cluster={cluster}".format(cluster=cluster) + f"Name={user}", + f"Account={account}", + f"Cluster={cluster}", ] logging.debug( @@ -411,7 +413,7 @@ def create_add_qos_command(name): """ command = [ "qos", - "Name={0}".format(name) + f"Name={name}", ] return command @@ -428,7 +430,7 @@ def create_remove_qos_command(name): command = [ "qos", "where", - "Name={0}".format(name), + f"Name={name}", ] return command @@ -451,7 +453,7 @@ def create_modify_qos_command(name, settings): ] for k, v in settings.items(): - command.append("{0}={1}".format(k, v)) + command.append(f"{k}={v}") return command @@ -465,11 +467,11 @@ def create_add_resource_license_command(name, server, stype, clusters, count): command = [ "resource", "Type=license", - "Name={0}".format(name), - "Server={0}".format(server), - "ServerType={0}".format(stype), - "Cluster={0}".format(",".join(clusters)), - "Count={0}".format(count), + f"Name={name}", + f"Server={server}", + f"ServerType={stype}", + "Cluster={}".format(",".join(clusters)), + f"Count={count}", "PercentAllowed=100", ] @@ -486,9 +488,9 @@ def create_remove_resource_license_command(name, server, stype): "resource", "where", "Type=license", - "Name={0}".format(name), - "Server={0}".format(server), - "ServerType={0}".format(stype), + f"Name={name}", + f"Server={server}", + f"ServerType={stype}", ] return command @@ -504,12 +506,12 @@ def create_modify_resource_license_command(name, server, stype, clusters, count) "resource", "where" "Type=license", - "Name={0}".format(name), - "Server={0}".format(server), - "ServerType={0}".format(stype), + f"Name={name}", + f"Server={server}", + f"ServerType={stype}", "set", "Cluster={0}".format(",".join(clusters)), - "Count={0}".format(count), + f"Count={count}", "PercentAllowed=100", ] diff --git a/lib/vsc/administration/slurm/scancel.py b/lib/vsc/administration/slurm/scancel.py index 463b5fa8..bc8f0355 100644 --- a/lib/vsc/administration/slurm/scancel.py +++ b/lib/vsc/administration/slurm/scancel.py @@ -18,17 +18,19 @@ SLURM_SCANCEL = "/usr/bin/scancel" -def create_remove_user_jobs_command(user, cluster, state=None, account=None): +def create_remove_user_jobs_command(user, cluster=None, state=None, account=None): """Create the command to remove a user's jobs in the given state. @returns: a list comprising the command """ remove_user_jobs_command = [ SLURM_SCANCEL, - "--cluster={cluster}".format(cluster=cluster), "--user={user}".format(user=user), ] + if cluster is not None: + remove_user_jobs_command.append("--cluster={cluster}".format(cluster=cluster)) + if state is not None: remove_user_jobs_command.append("--state={state}".format(state=state)) From 2bab5e941c6d5ca829254f96540510b6d1de8f76 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 09:04:06 +0200 Subject: [PATCH 02/32] fix: remove users from all clusters at once when they are inactive --- lib/vsc/administration/slurm/sync.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index a1b5af06..b01b9aa7 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -317,15 +317,17 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, for m in members: reverse_vo_mapping[m] = (vo.vsc_id, vo.institute['name']) + remove_users = set() # we do not make a disctinction between clusters, i.e., inactive users should be removed everywhere + for cluster in clusters: cluster_users_acct = [ (user.User, user.Def_Acct) for user in slurm_user_info if user and user.Cluster == cluster ] cluster_users = set([u[0] for u in cluster_users_acct]) - # these are the users that need to be removed as they are no longer an active user in any + # these are the users that need to be completely removed as they are no longer an active user in any # (including the institute default) VO - remove_users = cluster_users - active_vo_members + remove_users |= cluster_users - active_vo_members new_users = set() changed_users = set() @@ -371,14 +373,6 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, default_account=vo_id) for (user, vo_id, _) in new_users ]) - for user in remove_users: - job_cancel_commands[user].append(create_remove_user_jobs_command(user=user, cluster=cluster)) - - # Remove users from the clusters (in all accounts) - association_remove_commands.extend([ - create_remove_user_command(user=user, cluster=cluster) for user in remove_users - ]) - for (user, current_vo_id, (new_vo_id, _)) in moved_users: [add, default_account, remove_jobs, remove_association_user] = create_change_user_command( user=user, @@ -390,4 +384,13 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, association_remove_commands.append(remove_association_user) job_cancel_commands[user].append(remove_jobs) + # Remove users from the clusters (in all accounts) + for user in remove_users: + job_cancel_commands[user].append(create_remove_user_jobs_command(user=user)) + + association_remove_commands.extend([ + create_remove_user_command(user=user) for user in remove_users + ]) + + return (job_cancel_commands, commands, association_remove_commands) From 8b6f68aa50bd920878481ef34db6afa49b9655f2 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 10:51:37 +0200 Subject: [PATCH 03/32] fix: split off scancel command and allow them to fail --- bin/sync_slurm_acct.py | 8 +++++--- lib/vsc/administration/slurm/scancel.py | 2 +- lib/vsc/administration/slurm/sync.py | 15 ++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/bin/sync_slurm_acct.py b/bin/sync_slurm_acct.py index 33a76ea8..f94a7598 100644 --- a/bin/sync_slurm_acct.py +++ b/bin/sync_slurm_acct.py @@ -150,8 +150,6 @@ def main(): logging.info("Executing %d commands", len(sacctmgr_commands)) execute_commands(sacctmgr_commands) - # reset to go on with the remainder of the commands - sacctmgr_commands = [] # safety to avoid emptying the cluster due to some error upstream if not opts.options.force and len(job_cancel_commands) > MAX_USERS_JOB_CANCEL: @@ -161,7 +159,10 @@ def main(): logging.debug("%s", jc) raise SyncSanityError("Would cancel jobs for %d users" % len(job_cancel_commands)) - sacctmgr_commands += [c for cl in job_cancel_commands.values() for c in cl] + scancel_commands = [c for cl in job_cancel_commands.values() for c in cl] + + # reset to go on with the remainder of the commands + sacctmgr_commands = [] # removing users may fail, so should be done last sacctmgr_commands += association_remove_commands @@ -171,6 +172,7 @@ def main(): print("\n".join([" ".join(c) for c in sacctmgr_commands])) else: logging.info("Executing %d commands", len(sacctmgr_commands)) + execute_commands(scancel_commands, disallow_failure=False) execute_commands(sacctmgr_commands) if not opts.options.dry_run: diff --git a/lib/vsc/administration/slurm/scancel.py b/lib/vsc/administration/slurm/scancel.py index bc8f0355..894de3a5 100644 --- a/lib/vsc/administration/slurm/scancel.py +++ b/lib/vsc/administration/slurm/scancel.py @@ -29,7 +29,7 @@ def create_remove_user_jobs_command(user, cluster=None, state=None, account=None ] if cluster is not None: - remove_user_jobs_command.append("--cluster={cluster}".format(cluster=cluster)) + remove_user_jobs_command.append("--clusters={cluster}".format(cluster=cluster)) if state is not None: remove_user_jobs_command.append("--state={state}".format(state=state)) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index b01b9aa7..54f4f4d9 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -40,7 +40,7 @@ class SCommandException(Exception): pass -def execute_commands(commands): +def execute_commands(commands, disallow_failure=True): """Run the specified commands""" for command in commands: @@ -48,7 +48,7 @@ def execute_commands(commands): # if one fails, we simply fail the script and should get notified (ec, _) = RunNoShell.run(command) - if ec != 0: + if ec != 0 and disallow_failure: raise SCommandException("Command failed: {0}".format(command)) @@ -327,7 +327,12 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, # these are the users that need to be completely removed as they are no longer an active user in any # (including the institute default) VO - remove_users |= cluster_users - active_vo_members + remove_users_cluster = cluster_users - active_vo_members + remove_users |= remove_users_cluster + + # Removed users should no longer have running jobs. + for user in remove_users_cluster: + job_cancel_commands[user].append(create_remove_user_jobs_command(user=user, cluster=cluster)) new_users = set() changed_users = set() @@ -384,10 +389,6 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, association_remove_commands.append(remove_association_user) job_cancel_commands[user].append(remove_jobs) - # Remove users from the clusters (in all accounts) - for user in remove_users: - job_cancel_commands[user].append(create_remove_user_jobs_command(user=user)) - association_remove_commands.extend([ create_remove_user_command(user=user) for user in remove_users ]) From 5ccd92326c48b05715dbbcb718acc08fd177cde4 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 11:20:24 +0200 Subject: [PATCH 04/32] fix: remove unused lines --- lib/vsc/administration/slurm/sacctmgr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 0a2d843b..0e074267 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -505,12 +505,10 @@ def create_modify_resource_license_command(name, server, stype, count): command = [ "resource", "where" - "Type=license", f"Name={name}", f"Server={server}", f"ServerType={stype}", "set", - "Cluster={0}".format(",".join(clusters)), f"Count={count}", "PercentAllowed=100", ] From e30d5df83eef59a2cd5522e0d391a949fa1e3cb9 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 11:20:32 +0200 Subject: [PATCH 05/32] lint: line length --- lib/vsc/administration/slurm/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 48a5ef76..671e413a 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -317,7 +317,7 @@ def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, for m in members: reverse_vo_mapping[m] = (vo.vsc_id, vo.institute['name']) - remove_users = set() # we do not make a disctinction between clusters, i.e., inactive users should be removed everywhere + remove_users = set() # we do not make a disctinction between clusters for cluster in clusters: cluster_users_acct = [ From 2121eb6dd6b8068649e49f61698cadc7425801ae Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 11:21:07 +0200 Subject: [PATCH 06/32] fix: forgotten comma --- lib/vsc/administration/slurm/sacctmgr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 0e074267..7c843646 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -504,7 +504,7 @@ def create_modify_resource_license_command(name, server, stype, count): """ command = [ "resource", - "where" + "where", f"Name={name}", f"Server={server}", f"ServerType={stype}", From d8b41e5149423bdb941676fe866a2d0546e539a4 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 11:40:52 +0200 Subject: [PATCH 07/32] test: fix results --- test/slurm_sync.py | 4 ++-- test/sync_licenses.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 5c9b950a..4ff3a14e 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -218,8 +218,8 @@ def test_slurm_user_accounts(self): ]])) self.assertEqual(set([tuple(x) for x in remove_user_commands]), set([tuple(x) for x in [ shlex.split("/usr/bin/sacctmgr -i remove user Name=user2 Cluster=banette"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=vo2 Cluster=banette"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=vo1 Cluster=banette"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=vo2"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=vo1"), ]])) self.assertEqual(set([tuple(x) for c in job_cancel_commands.values() for x in c]), set([tuple(x) for x in [ diff --git a/test/sync_licenses.py b/test/sync_licenses.py index 147a02d7..a35f24bc 100644 --- a/test/sync_licenses.py +++ b/test/sync_licenses.py @@ -169,7 +169,7 @@ def test_update_licenses(self, masync): logging.debug("new_update %s remove %s", nw_up, rem) self.assertEqual(nw_up, [ ['/usr/bin/sacctmgr', '-i', 'add', 'resource', 'Type=license', 'Name=ano-1', 'Server=ano-comp1', 'ServerType=strange', 'Cluster=clust1,clust2', 'Count=100', 'PercentAllowed=100'], - ['/usr/bin/sacctmgr', '-i', 'modify', 'resource', 'where', 'Name=an-5', 'Server=ano-comp2', 'ServerType=flexlm', 'set', 'Count=7'], + ['/usr/bin/sacctmgr', '-i', 'modify', 'resource', 'where', 'Name=an-5', 'Server=ano-comp2', 'ServerType=flexlm', 'set', 'Count=7', 'PercentAllowed=100'], ]) self.assertEqual(rem, [ ['/usr/bin/sacctmgr', '-i', 'remove', 'resource', 'where', 'Type=license', 'Name=comsol', 'Server=bogus', 'ServerType=flexlm'], From fe8b255af6af82151618fcd7449b8f3156322633 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 12:07:11 +0200 Subject: [PATCH 08/32] test: fix removal commands --- test/slurm_sync.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 4ff3a14e..e6c48377 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -217,9 +217,9 @@ def test_slurm_user_accounts(self): shlex.split("/usr/bin/sacctmgr -i modify user Name=user4 Cluster=banette set DefaultAccount=vo2"), ]])) self.assertEqual(set([tuple(x) for x in remove_user_commands]), set([tuple(x) for x in [ - shlex.split("/usr/bin/sacctmgr -i remove user Name=user2 Cluster=banette"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=vo2"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=vo1"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user2"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=vo2 Cluster=banette"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=vo1 Cluster=banette"), ]])) self.assertEqual(set([tuple(x) for c in job_cancel_commands.values() for x in c]), set([tuple(x) for x in [ From 3c9bc8667607ed35866b1458ea673d2bc062f3fe Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 12:12:35 +0200 Subject: [PATCH 09/32] test: fix scancel commands --- test/slurm_sync.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index e6c48377..ec054630 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -223,9 +223,9 @@ def test_slurm_user_accounts(self): ]])) self.assertEqual(set([tuple(x) for c in job_cancel_commands.values() for x in c]), set([tuple(x) for x in [ - shlex.split("/usr/bin/scancel --cluster=banette --user=user2"), - shlex.split("/usr/bin/scancel --cluster=banette --user=user3 --account=vo2"), - shlex.split("/usr/bin/scancel --cluster=banette --user=user4 --account=vo1"), + shlex.split("/usr/bin/scancel --user=user2 --clusters=banette"), + shlex.split("/usr/bin/scancel --user=user3 --clusters=banette --account=vo2"), + shlex.split("/usr/bin/scancel --user=user4 --clusters=banette --account=vo1"), ]])) From 3e777a6230fbc1bf79c5fc8e2ea0b5b896dda92b Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 19 Apr 2023 12:20:51 +0200 Subject: [PATCH 10/32] bump: version to 4.2.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 740325d4..a6bd2391 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ ] PACKAGE = { - 'version': '4.1.7', + 'version': '4.2.0', 'author': [ag, jt], 'maintainer': [ag, jt], 'tests_require': ['mock'], From fdedc71151abf7cf6155a466532bd9159343aff1 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 26 Apr 2023 16:50:56 +0200 Subject: [PATCH 11/32] feat: add a partition for projects in the slurm acct sync --- lib/vsc/administration/slurm/sacctmgr.py | 6 ++- lib/vsc/administration/slurm/sync.py | 47 +++++++++++++----------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index db30cf7e..1cfc833d 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -278,7 +278,7 @@ def create_change_account_fairshare_command(account, cluster, fairshare): @mksacctmgr('add') -def create_add_user_command(user, account, cluster, default_account=None): +def create_add_user_command(user, account, cluster, partition=None, default_account=None): """ Creates the command to add the given account. @@ -295,6 +295,10 @@ def create_add_user_command(user, account, cluster, default_account=None): "Account={0}".format(account), "Cluster={0}".format(cluster) ] + if partition is not None: + command.append( + f"Partition={partition}" + ) if default_account is not None: command.append( "DefaultAccount={0}".format(account), diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index c4ef3072..03b5c975 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -225,7 +225,7 @@ def slurm_project_users_accounts( - If the user does not exist on the system: - a new association in the default account is created. The associated QoS does not allow jobs. - - a new association in the project account is created. + - a new association in the project account is created in combination with the allowed partitions if any are specified - For users who left the project: - The user's association in the project is removed. @@ -236,25 +236,26 @@ def slurm_project_users_accounts( for cluster in clusters: cluster_users_acct = [ - (user.User, user.Account) for user in slurm_user_info if user and user.Cluster == cluster + (user.User, user.Account, user.Partition) for user in slurm_user_info if user and user.Cluster == cluster ] - protected_users = [u for (u, a) in cluster_users_acct if a in protected_accounts] + protected_users = [u for (u, a, _) in cluster_users_acct if a in protected_accounts] new_users = set() remove_project_users = set() all_project_users = set() - for (members, project_name) in project_members: + for (members, project_name, project_partitions) in project_members: # these are the current Slurm users for this project - slurm_project_users = set([user for (user, acct) in cluster_users_acct if acct == project_name]) + slurm_project_users = set([user for (user, acct, part) in cluster_users_acct if acct == project_name and part in project_partitions]) all_project_users |= slurm_project_users # these users are not yet in the Slurm DBD for this project - new_users |= set([(user, project_name) for user in (members & active_accounts) - slurm_project_users]) + new_users |= set([(user, project_name, project_partitions) for user in (members & active_accounts) - slurm_project_users]) # these are the Slurm users that should no longer be associated with the project + # XXX: what to do with the partitions here? if we first remove, then add, this should be sufficient? remove_project_users |= set([(user, project_name) for user in slurm_project_users - members]) logging.info("%d new users", len(new_users)) @@ -268,35 +269,37 @@ def slurm_project_users_accounts( "Number of slurm users not in projects: %d > 0: %s", len(remove_slurm_users), remove_slurm_users ) - # create associations in the default account for users that do not already have one + # kick out users no longer in the project or whose partition changed + commands.extend([ + create_remove_user_account_command(user=user, account=project_name, cluster=cluster) + for (user, project_name) in remove_project_users + ]) + + # remove associations in the default account for users no longer in any project + commands.extend([ + create_remove_user_account_command(user=user, account=default_account, cluster=cluster) + for user in cluster_users_with_default_account - all_project_users if user not in protected_users + ]) + + # create associations in the default account for users that do not already have one cluster_users_with_default_account = set([u for (u, a) in cluster_users_acct if a == default_account]) commands.extend([create_add_user_command( user=user, account=default_account, default_account=default_account, - cluster=cluster) for (user, _) in new_users if user not in cluster_users_with_default_account + cluster=cluster, + partition=project_partition) for (user, _, project_partition) in new_users if user not in cluster_users_with_default_account ]) # create associations for the actual project's new users commands.extend([create_add_user_command( user=user, account=project_name, - cluster=cluster) for (user, project_name) in new_users - ]) - - # kick out users no longer in the project - commands.extend([ - create_remove_user_account_command(user=user, account=project_name, cluster=cluster) - for (user, project_name) in remove_project_users - ]) - - # remove associations in the default account for users no longer in any project - commands.extend([ - create_remove_user_account_command(user=user, account=default_account, cluster=cluster) - for user in cluster_users_with_default_account - all_project_users if user not in protected_users + cluster=cluster, + partition=project_partition) for (user, project_name, project_partition) in new_users ]) - return commands + return commands def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, dry_run=False): From d4e627ec1bebdadd02656c079797941362a7f213 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 8 May 2023 15:08:33 +0200 Subject: [PATCH 12/32] fix: tests + remove users from partitions --- lib/vsc/administration/slurm/sacctmgr.py | 5 ++- lib/vsc/administration/slurm/sync.py | 48 +++++++++++++++++++----- test/slurm_sync.py | 39 ++++++++++--------- 3 files changed, 65 insertions(+), 27 deletions(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 1cfc833d..0e851338 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -385,7 +385,7 @@ def create_remove_account_command(account, cluster): @mksacctmgr('remove') -def create_remove_user_account_command(user, account, cluster): +def create_remove_user_account_command(user, account, cluster, partition=None): """Create the command to remove a user. @returns: list comprising the command @@ -397,6 +397,9 @@ def create_remove_user_account_command(user, account, cluster): "Cluster={cluster}".format(cluster=cluster) ] + if partition is not None: + command.append(f"Partition={partition}") + logging.debug( "Adding command to remove user %s with account %s from Cluster=%s", user, diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 03b5c975..ed5e4111 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -225,7 +225,8 @@ def slurm_project_users_accounts( - If the user does not exist on the system: - a new association in the default account is created. The associated QoS does not allow jobs. - - a new association in the project account is created in combination with the allowed partitions if any are specified + - a new association in the project account is created in combination with the allowed partitions + if any are specified - For users who left the project: - The user's association in the project is removed. @@ -239,27 +240,48 @@ def slurm_project_users_accounts( (user.User, user.Account, user.Partition) for user in slurm_user_info if user and user.Cluster == cluster ] + print(f"cluster_users_acct: {cluster_users_acct}") + protected_users = [u for (u, a, _) in cluster_users_acct if a in protected_accounts] new_users = set() remove_project_users = set() + obsolete_slurm_project_users = set() all_project_users = set() for (members, project_name, project_partitions) in project_members: # these are the current Slurm users for this project - slurm_project_users = set([user for (user, acct, part) in cluster_users_acct if acct == project_name and part in project_partitions]) + slurm_project_users = set([ + user for (user, acct, part) in cluster_users_acct + if acct == project_name and part in project_partitions + ]) + obsolete_slurm_project_users |= set([ + (user, acct, part) for (user, acct, part) in cluster_users_acct + if acct == project_name and part not in project_partitions + ]) all_project_users |= slurm_project_users # these users are not yet in the Slurm DBD for this project - new_users |= set([(user, project_name, project_partitions) for user in (members & active_accounts) - slurm_project_users]) + new_users |= set([ + (user, project_name, tuple(project_partitions)) + for user in (members & active_accounts) - slurm_project_users + ]) # these are the Slurm users that should no longer be associated with the project # XXX: what to do with the partitions here? if we first remove, then add, this should be sufficient? remove_project_users |= set([(user, project_name) for user in slurm_project_users - members]) - logging.info("%d new users", len(new_users)) - logging.info("%d removed project users", len(remove_project_users)) + logging.info("%d new users", len(new_users)) + logging.info("%d removed project users", len(remove_project_users)) + logging.info("===============================================================") + logging.info(f"Project {project_name} members: {members}") + logging.info(f"Project {project_name} project_partitions: {project_partitions}") + logging.info(f"Project {project_name} slurm_project_users: {slurm_project_users}") + logging.info(f"Project {project_name} obsolete_slurm_project_users: {obsolete_slurm_project_users}") + logging.info(f"Project {project_name} new_users: {new_users}") + logging.info(f"Project {project_name} removed users: {remove_project_users}") + logging.info("===============================================================") # these are the users not in any project, we should decide if we want any of those remove_slurm_users = set([u[0] for u in cluster_users_acct if u not in protected_users]) - all_project_users @@ -269,26 +291,34 @@ def slurm_project_users_accounts( "Number of slurm users not in projects: %d > 0: %s", len(remove_slurm_users), remove_slurm_users ) + cluster_users_with_default_account = set([u for (u, a, _) in cluster_users_acct if a == default_account]) + # kick out users no longer in the project or whose partition changed commands.extend([ create_remove_user_account_command(user=user, account=project_name, cluster=cluster) for (user, project_name) in remove_project_users ]) + commands.extend([ + create_remove_user_account_command(user=user, account=project_name, cluster=cluster, partition=partition) + for (user, project_name, partition) in obsolete_slurm_project_users + ]) + # remove associations in the default account for users no longer in any project commands.extend([ create_remove_user_account_command(user=user, account=default_account, cluster=cluster) for user in cluster_users_with_default_account - all_project_users if user not in protected_users ]) - # create associations in the default account for users that do not already have one - cluster_users_with_default_account = set([u for (u, a) in cluster_users_acct if a == default_account]) + # create associations in the default account for users that do not already have one commands.extend([create_add_user_command( user=user, account=default_account, default_account=default_account, cluster=cluster, - partition=project_partition) for (user, _, project_partition) in new_users if user not in cluster_users_with_default_account + partition=p) + for (user, _, project_partition) in new_users for p in project_partition + if user not in cluster_users_with_default_account ]) # create associations for the actual project's new users @@ -296,7 +326,7 @@ def slurm_project_users_accounts( user=user, account=project_name, cluster=cluster, - partition=project_partition) for (user, project_name, project_partition) in new_users + partition=p) for (user, project_name, project_partition) in new_users for p in project_partition ]) return commands diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 5c9b950a..bd794bf8 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -132,19 +132,19 @@ def test_slurm_project_qos(self): def test_slurm_project_users_accounts(self): project_members = [ - (set(["user1", "user2", "user3"]), "gpr_compute_project1"), - (set(["user4", "user5", "user6"]), "gpr_compute_project2"), + (set(["user1", "user2", "user3"]), "gpr_compute_project1", ["part1", "part3"]), + (set(["user4", "user5", "user6"]), "gpr_compute_project2", ["part2"]), ] active_accounts = set(["user1", "user3", "user4", "user5", "user6", "user7"]) slurm_user_info = [ - SlurmUser(User='user1', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user1', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='default_account', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user2', Def_Acct='gpr_compute_project1', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user2', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='default_account', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user4', Def_Acct='gpr_compute_project1', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user3', Def_Acct='gpr_compute_project2', Admin='None', Cluster='mycluster', Account='gpr_compute_project2', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), - SlurmUser(User='user5', Def_Acct='gpr_compute_project2', Admin='None', Cluster='mycluster', Account='gpr_compute_project2', Partition='', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user1', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='part1', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user1', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='default_account', Partition='part1', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user2', Def_Acct='gpr_compute_project1', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='part1', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user2', Def_Acct='default_account', Admin='None', Cluster='mycluster', Account='default_account', Partition='part1', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user4', Def_Acct='gpr_compute_project1', Admin='None', Cluster='mycluster', Account='gpr_compute_project1', Partition='part2', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user3', Def_Acct='gpr_compute_project2', Admin='None', Cluster='mycluster', Account='gpr_compute_project2', Partition='part2', Share='2', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), + SlurmUser(User='user5', Def_Acct='gpr_compute_project2', Admin='None', Cluster='mycluster', Account='gpr_compute_project2', Partition='part1', Share='1', MaxJobs='', MaxNodes='', MaxCPUs='', MaxSubmit='', MaxWall='', MaxCPUMins='', QOS='normal', Def_QOS=''), ] commands = slurm_project_users_accounts( @@ -153,18 +153,23 @@ def test_slurm_project_users_accounts(self): slurm_user_info, ["mycluster"], default_account="default_account", - protected_accounts=("protected_account1", "protected_acocunt2") + protected_accounts=("protected_account1", "protected_account2") ) self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [ - shlex.split("/usr/bin/sacctmgr -i add user user4 Account=default_account Cluster=mycluster DefaultAccount=default_account"), - shlex.split("/usr/bin/sacctmgr -i add user user6 Account=default_account Cluster=mycluster DefaultAccount=default_account"), - shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster DefaultAccount=default_account"), - shlex.split("/usr/bin/sacctmgr -i add user user4 Account=gpr_compute_project2 Cluster=mycluster"), - shlex.split("/usr/bin/sacctmgr -i add user user6 Account=gpr_compute_project2 Cluster=mycluster"), - shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster"), + shlex.split("/usr/bin/sacctmgr -i add user user4 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user6 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster Partition=part1 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster Partition=part3 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user5 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user4 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), + shlex.split("/usr/bin/sacctmgr -i add user user6 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part1"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part3"), + shlex.split("/usr/bin/sacctmgr -i add user user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=gpr_compute_project2 Cluster=mycluster"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=gpr_compute_project1 Cluster=mycluster"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part1"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=gpr_compute_project1 Cluster=mycluster Partition=part2"), ]])) From 5d14ba16c52026690d88dd4f318daa879f634a23 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 8 May 2023 15:17:50 +0200 Subject: [PATCH 13/32] fix: covert to debug logging level --- lib/vsc/administration/slurm/sync.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 11fafd83..d097f86f 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -272,16 +272,16 @@ def slurm_project_users_accounts( # XXX: what to do with the partitions here? if we first remove, then add, this should be sufficient? remove_project_users |= set([(user, project_name) for user in slurm_project_users - members]) - logging.info("%d new users", len(new_users)) - logging.info("%d removed project users", len(remove_project_users)) - logging.info("===============================================================") - logging.info(f"Project {project_name} members: {members}") - logging.info(f"Project {project_name} project_partitions: {project_partitions}") - logging.info(f"Project {project_name} slurm_project_users: {slurm_project_users}") - logging.info(f"Project {project_name} obsolete_slurm_project_users: {obsolete_slurm_project_users}") - logging.info(f"Project {project_name} new_users: {new_users}") - logging.info(f"Project {project_name} removed users: {remove_project_users}") - logging.info("===============================================================") + logging.debug("%d new users", len(new_users)) + logging.debug("%d removed project users", len(remove_project_users)) + logging.debug("===============================================================") + logging.debug(f"Project {project_name} members: {members}") + logging.debug(f"Project {project_name} project_partitions: {project_partitions}") + logging.debug(f"Project {project_name} slurm_project_users: {slurm_project_users}") + logging.debug(f"Project {project_name} obsolete_slurm_project_users: {obsolete_slurm_project_users}") + logging.debug(f"Project {project_name} new_users: {new_users}") + logging.debug(f"Project {project_name} removed users: {remove_project_users}") + logging.debug("===============================================================") # these are the users not in any project, we should decide if we want any of those remove_slurm_users = set([u[0] for u in cluster_users_acct if u not in protected_users]) - all_project_users From 0ad27362bc99bea4cd56b3c4b76498de6d5bfa65 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 9 May 2023 11:43:16 +0200 Subject: [PATCH 14/32] fix: reverse logic, see #143 --- bin/sync_slurm_acct.py | 2 +- lib/vsc/administration/slurm/sync.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/sync_slurm_acct.py b/bin/sync_slurm_acct.py index d9849957..ae1349da 100644 --- a/bin/sync_slurm_acct.py +++ b/bin/sync_slurm_acct.py @@ -171,7 +171,7 @@ def main(): print("\n".join([" ".join(c) for c in sacctmgr_commands])) else: logging.info("Executing %d commands", len(sacctmgr_commands)) - execute_commands(scancel_commands, disallow_failure=False) + execute_commands(scancel_commands, allow_failure=True) execute_commands(sacctmgr_commands) if not opts.options.dry_run: diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index d097f86f..b4549912 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -40,7 +40,7 @@ class SCommandException(Exception): pass -def execute_commands(commands, disallow_failure=True): +def execute_commands(commands, allow_failure=False): """Run the specified commands""" for command in commands: @@ -48,7 +48,7 @@ def execute_commands(commands, disallow_failure=True): # if one fails, we simply fail the script and should get notified (ec, _) = RunNoShell.run(command) - if ec != 0 and disallow_failure: + if ec != 0 and not allow_failure: raise SCommandException("Command failed: {0}".format(command)) From 565b4c212afbadb98449d1a4b0e0d7ed4229f11e Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 9 May 2023 13:43:35 +0200 Subject: [PATCH 15/32] fix: remove print statement --- lib/vsc/administration/slurm/sync.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index b4549912..57a7edcb 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -240,8 +240,6 @@ def slurm_project_users_accounts( (user.User, user.Account, user.Partition) for user in slurm_user_info if user and user.Cluster == cluster ] - print(f"cluster_users_acct: {cluster_users_acct}") - protected_users = [u for (u, a, _) in cluster_users_acct if a in protected_accounts] new_users = set() From 3bebcf98499bc4e3b7b0f3eefa205a8f7ffa3fe6 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 10 May 2023 10:35:34 +0200 Subject: [PATCH 16/32] bump: version to 4.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a6bd2391..8ecfbf93 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ ] PACKAGE = { - 'version': '4.2.0', + 'version': '4.3.0', 'author': [ag, jt], 'maintainer': [ag, jt], 'tests_require': ['mock'], From 222f41368384dae01ff6fd8d4be6ffc6bb7e4799 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Wed, 10 May 2023 17:29:46 +0200 Subject: [PATCH 17/32] fix: first add associations, before we remove any to avoid users being deleted with jobs in the queue --- lib/vsc/administration/slurm/sync.py | 41 ++++++++++++++-------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 57a7edcb..86d3a2d3 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -281,6 +281,27 @@ def slurm_project_users_accounts( logging.debug(f"Project {project_name} removed users: {remove_project_users}") logging.debug("===============================================================") + cluster_users_with_default_account = set([u for (u, a, _) in cluster_users_acct if a == default_account]) + + # create associations in the default account for users that do not already have one + commands.extend([create_add_user_command( + user=user, + account=default_account, + default_account=default_account, + cluster=cluster, + partition=p) + for (user, _, project_partition) in new_users for p in project_partition + if user not in cluster_users_with_default_account + ]) + + # create associations for the actual project's new users + commands.extend([create_add_user_command( + user=user, + account=project_name, + cluster=cluster, + partition=p) for (user, project_name, project_partition) in new_users for p in project_partition + ]) + # these are the users not in any project, we should decide if we want any of those remove_slurm_users = set([u[0] for u in cluster_users_acct if u not in protected_users]) - all_project_users @@ -289,7 +310,6 @@ def slurm_project_users_accounts( "Number of slurm users not in projects: %d > 0: %s", len(remove_slurm_users), remove_slurm_users ) - cluster_users_with_default_account = set([u for (u, a, _) in cluster_users_acct if a == default_account]) # kick out users no longer in the project or whose partition changed commands.extend([ @@ -308,25 +328,6 @@ def slurm_project_users_accounts( for user in cluster_users_with_default_account - all_project_users if user not in protected_users ]) - # create associations in the default account for users that do not already have one - commands.extend([create_add_user_command( - user=user, - account=default_account, - default_account=default_account, - cluster=cluster, - partition=p) - for (user, _, project_partition) in new_users for p in project_partition - if user not in cluster_users_with_default_account - ]) - - # create associations for the actual project's new users - commands.extend([create_add_user_command( - user=user, - account=project_name, - cluster=cluster, - partition=p) for (user, project_name, project_partition) in new_users for p in project_partition - ]) - return commands From e5717478b752b798507b1febec2034af324f949a Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Thu, 11 May 2023 12:59:02 +0200 Subject: [PATCH 18/32] fix: indentation --- lib/vsc/administration/slurm/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 86d3a2d3..b90f365f 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -328,7 +328,7 @@ def slurm_project_users_accounts( for user in cluster_users_with_default_account - all_project_users if user not in protected_users ]) - return commands + return commands def slurm_user_accounts(vo_members, active_accounts, slurm_user_info, clusters, dry_run=False): From f22f5347dec96f4c5bf9d5f4c25d58c6b3c5ee30 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 15 May 2023 11:38:58 +0200 Subject: [PATCH 19/32] fix: rename variable to reflect multiple --- lib/vsc/administration/slurm/sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index b90f365f..3335b4e4 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -290,7 +290,7 @@ def slurm_project_users_accounts( default_account=default_account, cluster=cluster, partition=p) - for (user, _, project_partition) in new_users for p in project_partition + for (user, _, project_partitions) in new_users for p in project_partitions if user not in cluster_users_with_default_account ]) @@ -299,7 +299,7 @@ def slurm_project_users_accounts( user=user, account=project_name, cluster=cluster, - partition=p) for (user, project_name, project_partition) in new_users for p in project_partition + partition=p) for (user, project_name, project_partitions) in new_users for p in project_partitions ]) # these are the users not in any project, we should decide if we want any of those From c5361fc050c9c70fe99f548bb214f7d6bb33df64 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 15:49:52 +0200 Subject: [PATCH 20/32] fix: also create associations for user, partition combo's not yet in slurm --- lib/vsc/administration/slurm/sync.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 3335b4e4..6aa9f5f0 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -251,19 +251,19 @@ def slurm_project_users_accounts( # these are the current Slurm users for this project slurm_project_users = set([ - user for (user, acct, part) in cluster_users_acct + (user, part) for (user, acct, part) in cluster_users_acct if acct == project_name and part in project_partitions ]) obsolete_slurm_project_users |= set([ (user, acct, part) for (user, acct, part) in cluster_users_acct if acct == project_name and part not in project_partitions ]) - all_project_users |= slurm_project_users + all_project_users |= set([u for (u, _) in slurm_project_users]) # these users are not yet in the Slurm DBD for this project new_users |= set([ - (user, project_name, tuple(project_partitions)) - for user in (members & active_accounts) - slurm_project_users + (user, project_name, part) + for (user, part) in set([(u, p) for u in (members & active_accounts) for p in project_partitions]) - slurm_project_users ]) # these are the Slurm users that should no longer be associated with the project From 09d9a0441825164fa6584024c74be7657fc69164 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 15:57:35 +0200 Subject: [PATCH 21/32] fix: partition is now already in the new_users set --- lib/vsc/administration/slurm/sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 6aa9f5f0..b9215a4c 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -290,7 +290,7 @@ def slurm_project_users_accounts( default_account=default_account, cluster=cluster, partition=p) - for (user, _, project_partitions) in new_users for p in project_partitions + for (user, _, project_partition) in new_users if user not in cluster_users_with_default_account ]) @@ -299,7 +299,7 @@ def slurm_project_users_accounts( user=user, account=project_name, cluster=cluster, - partition=p) for (user, project_name, project_partitions) in new_users for p in project_partitions + partition=p) for (user, project_name, project_partition) in new_users ]) # these are the users not in any project, we should decide if we want any of those From a9712a8f7039238e29df0d2d2b6e370e767727e7 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 16:15:10 +0200 Subject: [PATCH 22/32] fix: use the correct variable name --- lib/vsc/administration/slurm/sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index b9215a4c..6526aa46 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -289,7 +289,7 @@ def slurm_project_users_accounts( account=default_account, default_account=default_account, cluster=cluster, - partition=p) + partition=project_partition) for (user, _, project_partition) in new_users if user not in cluster_users_with_default_account ]) @@ -299,7 +299,7 @@ def slurm_project_users_accounts( user=user, account=project_name, cluster=cluster, - partition=p) for (user, project_name, project_partition) in new_users + partition=project_partition) for (user, project_name, project_partition) in new_users ]) # these are the users not in any project, we should decide if we want any of those From a71342b3345eb7e39f88e6e135c2631aa8209204 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 16:30:46 +0200 Subject: [PATCH 23/32] bump: version to 4.4.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8ecfbf93..8dce491d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ ] PACKAGE = { - 'version': '4.3.0', + 'version': '4.4.0', 'author': [ag, jt], 'maintainer': [ag, jt], 'tests_require': ['mock'], From 81fde7fcb39b875d9cf544e10e35450091ee1b2d Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 16:51:47 +0200 Subject: [PATCH 24/32] fix: also remove users if the partition no longer matches --- lib/vsc/administration/slurm/sync.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 6526aa46..70f04831 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -263,12 +263,15 @@ def slurm_project_users_accounts( # these users are not yet in the Slurm DBD for this project new_users |= set([ (user, project_name, part) - for (user, part) in set([(u, p) for u in (members & active_accounts) for p in project_partitions]) - slurm_project_users + for (user, part) in + set([(u, p) for u in (members & active_accounts) for p in project_partitions]) - slurm_project_users ]) # these are the Slurm users that should no longer be associated with the project - # XXX: what to do with the partitions here? if we first remove, then add, this should be sufficient? - remove_project_users |= set([(user, project_name) for user in slurm_project_users - members]) + remove_project_users |= set([ + (user, project_name, part) for (user, acct, part) in cluster_users_acct + if acct == project_name and (user not in members or part not in project_partitions) + ]) logging.debug("%d new users", len(new_users)) logging.debug("%d removed project users", len(remove_project_users)) @@ -313,14 +316,14 @@ def slurm_project_users_accounts( # kick out users no longer in the project or whose partition changed commands.extend([ - create_remove_user_account_command(user=user, account=project_name, cluster=cluster) - for (user, project_name) in remove_project_users + create_remove_user_account_command(user=user, account=project_name, cluster=cluster, partition=partition) + for (user, project_name, partition) in remove_project_users ]) commands.extend([ create_remove_user_account_command(user=user, account=project_name, cluster=cluster, partition=partition) for (user, project_name, partition) in obsolete_slurm_project_users - ]) + ]) # remove associations in the default account for users no longer in any project commands.extend([ From 2b50a842427d7090496c53320cfd28bc96d2d646 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 16:54:56 +0200 Subject: [PATCH 25/32] fix: remove trailing whitespace --- lib/vsc/administration/slurm/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 70f04831..8c16b643 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -263,7 +263,7 @@ def slurm_project_users_accounts( # these users are not yet in the Slurm DBD for this project new_users |= set([ (user, project_name, part) - for (user, part) in + for (user, part) in set([(u, p) for u in (members & active_accounts) for p in project_partitions]) - slurm_project_users ]) From 6b4de0eed0d0bc5f933ac1118b3f22cc1015ea7a Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 16:57:15 +0200 Subject: [PATCH 26/32] test: reflect new partition addition and removal strategy --- test/slurm_sync.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 622e8694..8632e860 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -157,6 +157,7 @@ def test_slurm_project_users_accounts(self): ) self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [ + shlex.split("/usr/bin/sacctmgr -i add user user1 Account=gpr_compute_project1 Cluster=mycluster Partition=part3 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i add user user4 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i add user user6 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster Partition=part1 DefaultAccount=default_account"), @@ -167,7 +168,7 @@ def test_slurm_project_users_accounts(self): shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part1"), shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part3"), shlex.split("/usr/bin/sacctmgr -i add user user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), - shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=gpr_compute_project2 Cluster=mycluster"), + shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part1"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=gpr_compute_project1 Cluster=mycluster Partition=part2"), ]])) From 1ab2168ec0489484283561b40c6fd21e385da9f2 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 17:09:51 +0200 Subject: [PATCH 27/32] fix: also list the default account --- lib/vsc/administration/slurm/sync.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 8c16b643..23487ddb 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -302,6 +302,7 @@ def slurm_project_users_accounts( user=user, account=project_name, cluster=cluster, + default_account=default_account, partition=project_partition) for (user, project_name, project_partition) in new_users ]) From adce868c6df7fb6404f6d214402d26065d06f8bf Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 17:15:21 +0200 Subject: [PATCH 28/32] fix: use the right default account --- lib/vsc/administration/slurm/sacctmgr.py | 2 +- lib/vsc/administration/slurm/sync.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsc/administration/slurm/sacctmgr.py b/lib/vsc/administration/slurm/sacctmgr.py index 813b0592..0d902caf 100644 --- a/lib/vsc/administration/slurm/sacctmgr.py +++ b/lib/vsc/administration/slurm/sacctmgr.py @@ -301,7 +301,7 @@ def create_add_user_command(user, account, cluster, partition=None, default_acco ) if default_account is not None: command.append( - f"DefaultAccount={account}", + f"DefaultAccount={default_account}", ) logging.debug( "Adding command to add user %s with Account=%s Cluster=%s", diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index 23487ddb..e0874156 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -301,8 +301,8 @@ def slurm_project_users_accounts( commands.extend([create_add_user_command( user=user, account=project_name, - cluster=cluster, default_account=default_account, + cluster=cluster, partition=project_partition) for (user, project_name, project_partition) in new_users ]) From 46d28fad638c54be47abbed811eee48867e33da7 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Tue, 16 May 2023 17:17:46 +0200 Subject: [PATCH 29/32] test: more default accounts --- test/slurm_sync.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 8632e860..22190a7d 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -163,11 +163,11 @@ def test_slurm_project_users_accounts(self): shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster Partition=part1 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i add user user3 Account=default_account Cluster=mycluster Partition=part3 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i add user user5 Account=default_account Cluster=mycluster Partition=part2 DefaultAccount=default_account"), - shlex.split("/usr/bin/sacctmgr -i add user user4 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), - shlex.split("/usr/bin/sacctmgr -i add user user6 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), - shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part1"), - shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part3"), - shlex.split("/usr/bin/sacctmgr -i add user user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), + shlex.split("/usr/bin/sacctmgr -i add user user4 Account=gpr_compute_project2 Cluster=mycluster Partition=part2 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user6 Account=gpr_compute_project2 Cluster=mycluster Partition=part2 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part1 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user3 Account=gpr_compute_project1 Cluster=mycluster Partition=part3 DefaultAccount=default_account"), + shlex.split("/usr/bin/sacctmgr -i add user user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part2 DefaultAccount=default_account"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user3 Account=gpr_compute_project2 Cluster=mycluster Partition=part2"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user5 Account=gpr_compute_project2 Cluster=mycluster Partition=part1"), shlex.split("/usr/bin/sacctmgr -i remove user Name=user4 Account=gpr_compute_project1 Cluster=mycluster Partition=part2"), From edc80aad16fa3f02e52d9be0ad20b4152334a026 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 22 May 2023 11:11:33 +0200 Subject: [PATCH 30/32] fix: only set billing minutes, not cpu minutes for QoS --- lib/vsc/administration/slurm/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsc/administration/slurm/sync.py b/lib/vsc/administration/slurm/sync.py index e0874156..ba4840a2 100644 --- a/lib/vsc/administration/slurm/sync.py +++ b/lib/vsc/administration/slurm/sync.py @@ -117,7 +117,7 @@ def slurm_project_qos(projects, slurm_qos_info, clusters, protected_qos, qos_cle if qos_name not in cluster_qos_names: commands.append(create_add_qos_command(qos_name)) commands.append(create_modify_qos_command(qos_name, { - "GRPTRESMins": "billing={cpuminutes},cpu={cpuminutes},gres/gpu={gpuminutes}".format( + "GRPTRESMins": "billing={cpuminutes},gres/gpu={gpuminutes}".format( cpuminutes=60*int(project.cpu_hours) + TIER1_GPU_TO_CPU_HOURS_RATE * 60 * int(project.gpu_hours), gpuminutes=max(1, 60*int(project.gpu_hours))) From 7ff39eeeb2b42cd22064da26121676f3c08ede0f Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 22 May 2023 11:12:47 +0200 Subject: [PATCH 31/32] bump: version to 4.5.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8dce491d..beb38fd3 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ ] PACKAGE = { - 'version': '4.4.0', + 'version': '4.5.0', 'author': [ag, jt], 'maintainer': [ag, jt], 'tests_require': ['mock'], From 278a171299082e7f1afaa7986272d5854d5906f3 Mon Sep 17 00:00:00 2001 From: Andy Georges Date: Mon, 22 May 2023 14:44:24 +0200 Subject: [PATCH 32/32] test: remove the cpu minutes assignment --- test/slurm_sync.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/slurm_sync.py b/test/slurm_sync.py index 22190a7d..2e7a2222 100644 --- a/test/slurm_sync.py +++ b/test/slurm_sync.py @@ -123,9 +123,9 @@ def test_slurm_project_qos(self): self.assertEqual(set([tuple(x) for x in commands]), set([tuple(x) for x in [ shlex.split("/usr/bin/sacctmgr -i add qos Name=mycluster-gpr_compute_project1"), shlex.split("/usr/bin/sacctmgr -i add qos Name=mycluster-gpr_compute_project2"), - shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project1 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=2280,cpu=2280,gres/gpu=180"), - shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project2 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=300,cpu=300,gres/gpu=1"), - shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project3 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=240,cpu=240,gres/gpu=1"), + shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project1 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=2280,gres/gpu=180"), + shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project2 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=300,gres/gpu=1"), + shlex.split("/usr/bin/sacctmgr -i modify qos mycluster-gpr_compute_project3 set flags=NoDecay,DenyOnLimit GRPTRESMins=billing=240,gres/gpu=1"), shlex.split("/usr/bin/sacctmgr -i remove qos where Name=mycluster-gpr_compute_project4"), ]]))