From ed75b85aff42c6a42fd9bde7229a28b275eb0b8d Mon Sep 17 00:00:00 2001
From: Andy Georges
+If you wish to accommodate multiple runtimes in your environment,
+it is possible to do so with a bit of extra setup. This section outlines one
+possible way to do so:
+
+IgnoreFileConfigJson=true
+RunTimeRun="/opt/slurm-oci/run %b %m %u %U %n %j %s %t %@"
+RunTimeKill="kill -s SIGTERM %p"
+RunTimeDelete="kill -s SIGKILL %p"
+
+#!/bin/bash
+if [[ -e ~/.slurm-oci-run ]]; then
+    ~/.slurm-oci-run "$@"
+else
+    /opt/slurm-oci/slurm-oci-run-default "$@"
+fi
+
+#!/bin/bash --login
+# Parse
+CONTAINER="$1"
+SPOOL_DIR="$2"
+USER_NAME="$3"
+USER_ID="$4"
+NODE_NAME="$5"
+JOB_ID="$6"
+STEP_ID="$7"
+TASK_ID="$8"
+shift 8 # subsequent arguments are the command to run in the container
+# Run
+apptainer run --bind /var/spool --containall "$CONTAINER" "$@"
+
+chmod +x /opt/slurm-oci/run /opt/slurm-oci/slurm-oci-run-default
+Once this is done, users may create a script at '~/.slurm-oci-run' if
+they wish to customize the container run process, such as using a different
+container runtime. Users should model this file after the default
+'/opt/slurm-oci/slurm-oci-run-default'.
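
For illustration, a user's '~/.slurm-oci-run' might keep the same eight
positional arguments but customize the runtime invocation. A minimal sketch
(the extra GPU flag and the '/scratch' bind are illustrative assumptions, not
part of this patch):

    #!/bin/bash
    # Same calling convention as the default script: eight metadata
    # arguments, then the command to run in the container.
    CONTAINER="$1"
    shift 8
    # Hypothetical customization: enable GPU support and an extra bind.
    apptainer run --nv --bind /var/spool --bind /scratch --containall \
        "$CONTAINER" "$@"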
-Last modified 08 October 2024
+Last modified 19 November 2024
From 57e3a1241844f82dcedfc7b2f5e000599d6779a1 Mon Sep 17 00:00:00 2001
From: Stephen Kendall
Do I need to maintain synchronized
clocks on the cluster?
-In general, yes. Having inconsistent clocks may cause nodes to
-be unusable. Slurm log files should contain references to
-expired credentials. For example:
error: Munge decode failed: Expired credential
ENCODED: Wed May 12 12:34:56 2008
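
A quick way to spot skew is to compare each node's clock against the submit
host. A sketch (the node count and reliance on 'srun' reaching every node are
assumptions):

    # Print the local epoch time, then each node's (one line per task)
    date +%s; srun -N4 -l date +%s
    # Nodes whose timestamps drift beyond the munge credential lifetime
    # will produce "Expired credential" errors like the one above.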
@@ -2438,6 +2438,6 @@ dset TV::parallel_configs {
}
!-->
-Last modified 07 November 2024
+Last modified 19 November 2024
From a6bbb70aaf77e86034597948fbd10036a30f7bf0 Mon Sep 17 00:00:00 2001
From: Stephen Kendall
Date: Wed, 9 Oct 2024 11:03:25 -0600
Subject: [PATCH 46/90] Docs - update v0.0.41 descriptions indicating format
lists are ignored
Ticket 20302
---
src/plugins/data_parser/v0.0.41/parsers.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/plugins/data_parser/v0.0.41/parsers.c b/src/plugins/data_parser/v0.0.41/parsers.c
index 9d37e8d7dbe..ee8edb9ba21 100644
--- a/src/plugins/data_parser/v0.0.41/parsers.c
+++ b/src/plugins/data_parser/v0.0.41/parsers.c
@@ -8423,7 +8423,7 @@ static const parser_t PARSER_ARRAY(OPENAPI_WARNING)[] = {
static const parser_t PARSER_ARRAY(INSTANCE_CONDITION)[] = {
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV clusters list"),
add_parse(CSV_STRING_LIST, extra_list, "extra", "CSV extra list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(CSV_STRING_LIST, instance_id_list, "instance_id", "CSV instance_id list"),
add_parse(CSV_STRING_LIST, instance_type_list, "instance_type", "CSV instance_type list"),
add_parse(STRING, node_list, "node_list", "Ranged node string"),
@@ -8501,7 +8501,7 @@ static const parser_t PARSER_ARRAY(JOB_CONDITION)[] = {
add_flags(JOB_CONDITION_DB_FLAGS, db_flags),
add_parse(INT32, exitcode, "exit_code", "Job exit code (numeric)"),
add_flags(JOB_CONDITION_FLAGS, flags),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(GROUP_ID_STRING_LIST, groupid_list, "groups", "CSV group list"),
add_parse(CSV_STRING_LIST, jobname_list, "job_name", "CSV job name list"),
add_parse(UINT32_NO_VAL, nodes_max, "nodes_max", "Maximum number of nodes"),
@@ -8531,7 +8531,7 @@ static const parser_t PARSER_ARRAY(JOB_CONDITION)[] = {
static const parser_t PARSER_ARRAY(QOS_CONDITION)[] = {
add_parse(CSV_STRING_LIST, description_list, "description", "CSV description list"),
add_parse(QOS_ID_STRING_CSV_LIST, id_list, "id", "CSV QOS id list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(QOS_NAME_CSV_LIST, name_list, "name", "CSV QOS name list"),
add_parse_bit_flag_array(slurmdb_qos_cond_t, QOS_PREEMPT_MODES, false, preempt_mode, "preempt_mode", "PreemptMode used when jobs in this QOS are preempted"),
add_parse(BOOL16, with_deleted, "with_deleted", "Include deleted QOS"),
@@ -8590,7 +8590,7 @@ static const parser_t PARSER_ARRAY(ASSOC_CONDITION)[] = {
add_parse(CSV_STRING_LIST, acct_list, "account", "CSV accounts list"),
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV clusters list"),
add_parse(QOS_ID_STRING_CSV_LIST, def_qos_id_list, "default_qos", "CSV QOS list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(ASSOC_ID_STRING_CSV_LIST, id_list, "id", "CSV id list"),
add_parse(BOOL16, only_defs, "only_defaults", "Filter to only defaults"),
add_parse(CSV_STRING_LIST, parent_acct_list, "parent_account", "CSV names of parent account"),
@@ -8658,7 +8658,7 @@ static const parser_t PARSER_ARRAY(OPENAPI_WCKEY_PARAM)[] = {
add_parser(slurmdb_wckey_cond_t, mtype, false, field, 0, path, desc)
static const parser_t PARSER_ARRAY(WCKEY_CONDITION)[] = {
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV cluster name list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format name list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(CSV_STRING_LIST, id_list, "id", "CSV id list"),
add_parse(CSV_STRING_LIST, name_list, "name", "CSV name list"),
add_parse(BOOL16, only_defs, "only_defaults", "Only query defaults"),
@@ -8716,7 +8716,7 @@ static const parser_t PARSER_ARRAY(CLUSTER_CONDITION)[] = {
add_parse(STRING_LIST, cluster_list, "cluster", "CSV cluster list"),
add_parse(STRING_LIST, federation_list, "federation", "CSV federation list"),
add_parse_bit_flag_array(slurmdb_cluster_cond_t, CLUSTER_REC_FLAGS, false, flags, "flags", "Query flags"),
- add_parse(STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(STRING_LIST, rpc_version_list, "rpc_version", "CSV RPC version list"),
add_parse(TIMESTAMP, usage_end, "usage_end", "Usage end (UNIX timestamp)"),
add_parse(TIMESTAMP, usage_start, "usage_start", "Usage start (UNIX timestamp)"),
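
With the format lists ignored, output shaping happens client-side. A sketch of
post-processing the JSON instead (assumes slurmrestd listening on
localhost:6820, a valid $SLURM_JWT token, and jq installed; the selected field
names are illustrative):

    curl -s -H "X-SLURM-USER-NAME: $USER" \
         -H "X-SLURM-USER-TOKEN: $SLURM_JWT" \
         'http://localhost:6820/slurmdb/v0.0.41/jobs' \
        | jq '.jobs[] | {job_id, account, state}'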
From c8f00b6298aedb7c6074e502732a4093c1c1cff2 Mon Sep 17 00:00:00 2001
From: Stephen Kendall
Date: Wed, 9 Oct 2024 11:04:42 -0600
Subject: [PATCH 47/90] Docs - update v0.0.40 descriptions indicating format
lists are ignored
Ticket 20302
---
src/plugins/data_parser/v0.0.40/parsers.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/plugins/data_parser/v0.0.40/parsers.c b/src/plugins/data_parser/v0.0.40/parsers.c
index 54f1ef33636..fe0437de58b 100644
--- a/src/plugins/data_parser/v0.0.40/parsers.c
+++ b/src/plugins/data_parser/v0.0.40/parsers.c
@@ -8230,7 +8230,7 @@ static const parser_t PARSER_ARRAY(OPENAPI_WARNING)[] = {
static const parser_t PARSER_ARRAY(INSTANCE_CONDITION)[] = {
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV clusters list"),
add_parse(CSV_STRING_LIST, extra_list, "extra", "CSV extra list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(CSV_STRING_LIST, instance_id_list, "instance_id", "CSV instance_id list"),
add_parse(CSV_STRING_LIST, instance_type_list, "instance_type", "CSV instance_type list"),
add_parse(STRING, node_list, "node_list", "Ranged node string"),
@@ -8305,7 +8305,7 @@ static const parser_t PARSER_ARRAY(JOB_CONDITION)[] = {
add_flags(JOB_CONDITION_DB_FLAGS, db_flags),
add_parse(INT32, exitcode, "exit_code", "Job exit code (numeric)"),
add_flags(JOB_CONDITION_FLAGS, flags),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(GROUP_ID_STRING_LIST, groupid_list, "groups", "CSV group list"),
add_parse(CSV_STRING_LIST, jobname_list, "job_name", "CSV job name list"),
add_parse(UINT32_NO_VAL, nodes_max, "nodes_max", "Maximum number of nodes"),
@@ -8335,7 +8335,7 @@ static const parser_t PARSER_ARRAY(JOB_CONDITION)[] = {
static const parser_t PARSER_ARRAY(QOS_CONDITION)[] = {
add_parse(CSV_STRING_LIST, description_list, "description", "CSV description list"),
add_parse(QOS_ID_STRING_CSV_LIST, id_list, "id", "CSV QOS id list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(QOS_NAME_CSV_LIST, name_list, "name", "CSV QOS name list"),
add_parse_bit_flag_array(slurmdb_qos_cond_t, QOS_PREEMPT_MODES, false, preempt_mode, "preempt_mode", "PreemptMode used when jobs in this QOS are preempted"),
add_parse(BOOL16, with_deleted, "with_deleted", "Include deleted QOS"),
@@ -8394,7 +8394,7 @@ static const parser_t PARSER_ARRAY(ASSOC_CONDITION)[] = {
add_parse(CSV_STRING_LIST, acct_list, "account", "CSV accounts list"),
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV clusters list"),
add_parse(QOS_ID_STRING_CSV_LIST, def_qos_id_list, "default_qos", "CSV QOS list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(ASSOC_ID_STRING_CSV_LIST, id_list, "id", "CSV id list"),
add_parse(BOOL16, only_defs, "only_defaults", "Filter to only defaults"),
add_parse(CSV_STRING_LIST, parent_acct_list, "parent_account", "CSV names of parent account"),
@@ -8462,7 +8462,7 @@ static const parser_t PARSER_ARRAY(OPENAPI_WCKEY_PARAM)[] = {
add_parser(slurmdb_wckey_cond_t, mtype, false, field, 0, path, desc)
static const parser_t PARSER_ARRAY(WCKEY_CONDITION)[] = {
add_parse(CSV_STRING_LIST, cluster_list, "cluster", "CSV cluster name list"),
- add_parse(CSV_STRING_LIST, format_list, "format", "CSV format name list"),
+ add_parse(CSV_STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(CSV_STRING_LIST, id_list, "id", "CSV id list"),
add_parse(CSV_STRING_LIST, name_list, "name", "CSV name list"),
add_parse(BOOL16, only_defs, "only_defaults", "Only query defaults"),
@@ -8525,7 +8525,7 @@ static const parser_t PARSER_ARRAY(CLUSTER_CONDITION)[] = {
add_parse(STRING_LIST, cluster_list, "cluster", "CSV cluster list"),
add_parse(STRING_LIST, federation_list, "federation", "CSV federation list"),
add_parse_bit_flag_array(slurmdb_cluster_cond_t, CLUSTER_REC_FLAGS, false, flags, "flags", "Query flags"),
- add_parse(STRING_LIST, format_list, "format", "CSV format list"),
+ add_parse(STRING_LIST, format_list, "format", "Ignored; process JSON manually to control output format"),
add_parse(STRING_LIST, rpc_version_list, "rpc_version", "CSV RPC version list"),
add_parse(TIMESTAMP, usage_end, "usage_end", "Usage end (UNIX timestamp)"),
add_parse(TIMESTAMP, usage_start, "usage_start", "Usage start (UNIX timestamp)"),
From bfcb9e3959ce140a59d7b9f0adc7aedb31318df2 Mon Sep 17 00:00:00 2001
From: Ben Roberts
Date: Mon, 21 Oct 2024 11:59:52 -0500
Subject: [PATCH 48/90] Docs - Add srun example about exclusive access with
number of tasks
Ticket 21193
Signed-off-by: Stephen Kendall
---
doc/man/man1/srun.1 | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 3f6b128bc9f..fc2ff9d03f3 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1,4 +1,4 @@
-.TH srun "1" "Slurm Commands" "October 2024" "Slurm Commands"
+.TH srun "1" "Slurm Commands" "November 2024" "Slurm Commands"
.SH "NAME"
srun \- Run parallel jobs
@@ -4647,6 +4647,35 @@ This behavior can be changed by adding \fBSelectTypeParameters=CR_Pack_Nodes\fR
to your slurm.conf. The logic to pack nodes will allow job steps to start on
a single node without having to explicitly request a single node.
+.TP
+\fBExample 11:\fR
+This example demonstrates that adding the \fB\-\-exclusive\fR flag to job
+allocation requests can give different results based on whether you also
+request a certain number of tasks.
+
+Requesting exclusive access with no additional requirements will allow the
+process to access all the CPUs on the allocated node.
+.nf
+$ srun \-l \-\-exclusive bash \-c 'grep Cpus_allowed_list /proc/self/status'
+0: Cpus_allowed_list: 0\-23
+.fi
+
+Adding a request for a certain number of tasks will cause each task to only
+have access to a single CPU.
+.nf
+$ srun \-l \-\-exclusive \-n2 bash \-c 'grep Cpus_allowed_list /proc/self/status'
+0: Cpus_allowed_list: 0
+1: Cpus_allowed_list: 12
+.fi
+
+You can define the number of CPUs per task if you want to give them access to
+more than one CPU.
+.nf
+$ srun \-l \-\-exclusive \-n2 \-\-cpus\-per\-task=12 bash \-c 'grep Cpus_allowed_list /proc/self/status'
+0: Cpus_allowed_list: 0\-5,12\-17
+1: Cpus_allowed_list: 6\-11,18\-23
+.fi
+
.SH "COPYING"
Copyright (C) 2006\-2007 The Regents of the University of California.
Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
From d34898f2c38995c49190c6bfb7e4c37a00caea62 Mon Sep 17 00:00:00 2001
From: Stephen Kendall
Date: Thu, 14 Nov 2024 10:20:16 -0700
Subject: [PATCH 49/90] Docs - standardize strict_order and reorder_count
descriptions
Ticket 21311
---
doc/man/man5/slurm.conf.5 | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 165c7e2a6e4..981b790d750 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2719,8 +2719,10 @@ plugin.
.TP
\fBreorder_count=#\fR
Specify how many attempts should be made in reordering preemptable jobs to
-minimize the count of jobs preempted.
+minimize the total number of jobs that will be preempted.
The default value is 1. High values may adversely impact performance.
+Changes to the order of jobs on these attempts can be enabled with
+\fBstrict_order\fR.
The logic to support this option is only available in the select/cons_tres
plugin.
.IP
@@ -2735,10 +2737,11 @@ otherwise a SIGTERM will be sent to the tasks.
.TP
\fBstrict_order\fR
-If set, then execute extra logic in an attempt to preempt only the lowest
-priority jobs.
-It may be desirable to set this configuration parameter when there are multiple
-priorities of preemptable jobs.
+When reordering preemptable jobs, place the most recently tested job at the
+front of the list since we are certain that it actually added resources needed
+by the new job. This ensures that with enough reorder attempts, the minimum
+possible number of jobs will be preempted.
+See also \fBreorder_count=#\fR.
The logic to support this option is only available in the select/cons_tres
plugin.
.IP
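
For context, a slurm.conf excerpt wiring these options together; a minimal
sketch with illustrative values (placement under PreemptParameters is assumed
for this Slurm version):

    SelectType=select/cons_tres
    PreemptType=preempt/qos
    PreemptMode=REQUEUE
    PreemptParameters=reorder_count=3,strict_order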
From 51a1b5b707959deca01b85b816e3ba3075285e18 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Wed, 6 Nov 2024 13:18:35 -0700
Subject: [PATCH 50/90] data_parser/v0.0.41 - Add field_parser to dump()
When dumping PARSER_MODEL_ARRAY, track the parser for the field and the
final parser for the field's contents.
Ticket 21341
---
src/plugins/data_parser/v0.0.41/api.c | 2 +-
src/plugins/data_parser/v0.0.41/parsing.c | 19 +++++++++++--------
src/plugins/data_parser/v0.0.41/parsing.h | 11 ++++++-----
3 files changed, 18 insertions(+), 14 deletions(-)
diff --git a/src/plugins/data_parser/v0.0.41/api.c b/src/plugins/data_parser/v0.0.41/api.c
index 5b213dcd49f..7e665ecb43a 100644
--- a/src/plugins/data_parser/v0.0.41/api.c
+++ b/src/plugins/data_parser/v0.0.41/api.c
@@ -95,7 +95,7 @@ extern int data_parser_p_dump(args_t *args, data_parser_type_t type, void *src,
return ESLURM_NOT_SUPPORTED;
}
- return dump(src, src_bytes, parser, dst, args);
+ return dump(src, src_bytes, NULL, parser, dst, args);
}
extern int data_parser_p_parse(args_t *args, data_parser_type_t type, void *dst,
diff --git a/src/plugins/data_parser/v0.0.41/parsing.c b/src/plugins/data_parser/v0.0.41/parsing.c
index c6db4704e18..ea342fc39d3 100644
--- a/src/plugins/data_parser/v0.0.41/parsing.c
+++ b/src/plugins/data_parser/v0.0.41/parsing.c
@@ -1244,8 +1244,9 @@ static int _foreach_dump_list(void *obj, void *arg)
xassert(args->parser->ptr_offset == NO_VAL);
/* we don't know the size of the items in the list */
- if (dump(&obj, NO_VAL, find_parser_by_type(args->parser->list_type),
- item, args->args))
+ if (dump(&obj, NO_VAL, NULL,
+ find_parser_by_type(args->parser->list_type), item,
+ args->args))
return -1;
return 0;
@@ -1327,7 +1328,7 @@ static int _dump_pointer(const parser_t *const parser, void *src, data_t *dst,
return SLURM_SUCCESS;
}
- return dump(*ptr, NO_VAL, pt, dst, args);
+ return dump(*ptr, NO_VAL, NULL, pt, dst, args);
}
static int _dump_nt_array(const parser_t *const parser, void *src, data_t *dst,
@@ -1345,7 +1346,7 @@ static int _dump_nt_array(const parser_t *const parser, void *src, data_t *dst,
return SLURM_SUCCESS;
for (int i = 0; !rc && array[i]; i++) {
- rc = dump(array[i], NO_VAL,
+ rc = dump(array[i], NO_VAL, NULL,
find_parser_by_type(parser->array_type),
data_list_append(dst), args);
}
@@ -1369,7 +1370,7 @@ static int _dump_nt_array(const parser_t *const parser, void *src, data_t *dst,
if (done)
break;
- rc = dump(ptr, NO_VAL,
+ rc = dump(ptr, NO_VAL, NULL,
find_parser_by_type(parser->array_type),
data_list_append(dst), args);
}
@@ -1518,7 +1519,8 @@ static int _dump_linked(args_t *args, const parser_t *const array,
array->ptr_offset, (uintptr_t) dst, array->key,
(uintptr_t) dst);
- rc = dump(src, NO_VAL, find_parser_by_type(parser->type), dst, args);
+ rc = dump(src, NO_VAL, parser, find_parser_by_type(parser->type), dst,
+ args);
log_flag(DATA, "END: dumping %s parser %s->%s(0x%" PRIxPTR ") for %s(0x%" PRIxPTR ")->%s(+%zd) for data(0x%" PRIxPTR ")/%s(0x%" PRIxPTR ")",
parser->obj_type_string, array->type_string,
@@ -1545,8 +1547,9 @@ static void _check_dump(const parser_t *const parser, data_t *dst, args_t *args)
}
}
-extern int dump(void *src, ssize_t src_bytes, const parser_t *const parser,
- data_t *dst, args_t *args)
+extern int dump(void *src, ssize_t src_bytes,
+ const parser_t *const field_parser,
+ const parser_t *const parser, data_t *dst, args_t *args)
{
int rc;
diff --git a/src/plugins/data_parser/v0.0.41/parsing.h b/src/plugins/data_parser/v0.0.41/parsing.h
index 94e325dde1a..f1b34a1554e 100644
--- a/src/plugins/data_parser/v0.0.41/parsing.h
+++ b/src/plugins/data_parser/v0.0.41/parsing.h
@@ -59,11 +59,12 @@
#undef DATA_DUMP
#undef DATA_PARSE
-extern int dump(void *src, ssize_t src_bytes, const parser_t *const parser,
- data_t *dst, args_t *args);
-#define DUMP(type, src, dst, args) \
- dump(&src, sizeof(src), find_parser_by_type(DATA_PARSER_##type), dst, \
- args)
+extern int dump(void *src, ssize_t src_bytes,
+ const parser_t *const field_parser,
+ const parser_t *const parser, data_t *dst, args_t *args);
+#define DUMP(type, src, dst, args) \
+ dump(&src, sizeof(src), NULL, find_parser_by_type(DATA_PARSER_##type), \
+ dst, args)
extern int parse(void *dst, ssize_t dst_bytes, const parser_t *const parser,
data_t *src, args_t *args, data_t *parent_path);
From ab82257bdd267b385543b32c9eef08d14ca320ea Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Wed, 6 Nov 2024 13:23:41 -0700
Subject: [PATCH 51/90] data_parser/v0.0.41 - Pass field_parser to
_dump_pointer()
Ticket 21341
---
src/plugins/data_parser/v0.0.41/parsing.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/plugins/data_parser/v0.0.41/parsing.c b/src/plugins/data_parser/v0.0.41/parsing.c
index ea342fc39d3..a71c62c90de 100644
--- a/src/plugins/data_parser/v0.0.41/parsing.c
+++ b/src/plugins/data_parser/v0.0.41/parsing.c
@@ -1287,7 +1287,8 @@ static int _dump_list(const parser_t *const parser, void *src, data_t *dst,
return SLURM_SUCCESS;
}
-static int _dump_pointer(const parser_t *const parser, void *src, data_t *dst,
+static int _dump_pointer(const parser_t *const field_parser,
+ const parser_t *const parser, void *src, data_t *dst,
args_t *args)
{
const parser_t *pt = find_parser_by_type(parser->pointer_type);
@@ -1620,7 +1621,7 @@ extern int dump(void *src, ssize_t src_bytes,
verify_parser_not_sliced(parser);
xassert(data_get_type(dst) == DATA_TYPE_NULL);
- rc = _dump_pointer(parser, src, dst, args);
+ rc = _dump_pointer(field_parser, parser, src, dst, args);
break;
case PARSER_MODEL_NT_PTR_ARRAY:
case PARSER_MODEL_NT_ARRAY:
From 7c9cc563adfa2008b35a584eb213363015725339 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Wed, 6 Nov 2024 13:25:39 -0700
Subject: [PATCH 52/90] data_parser/v0.0.41 - Dump non-required fields as null
Avoid breaking generated clients by dumping 'null' instead of '{}' for
NULL pointer fields of PARSER_MODEL_ARRAY structs when the field is
marked required=false. Clients consider 'null' a valid value but will
error on an empty dictionary '{}'.
Ticket 21341
---
src/plugins/data_parser/v0.0.41/parsing.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/plugins/data_parser/v0.0.41/parsing.c b/src/plugins/data_parser/v0.0.41/parsing.c
index a71c62c90de..ff5bae5b924 100644
--- a/src/plugins/data_parser/v0.0.41/parsing.c
+++ b/src/plugins/data_parser/v0.0.41/parsing.c
@@ -1304,10 +1304,11 @@ static int _dump_pointer(const parser_t *const field_parser,
while (pt->pointer_type)
pt = find_parser_by_type(pt->pointer_type);
- if (parser->allow_null_pointer) {
+ if (parser->allow_null_pointer ||
+ (field_parser && !field_parser->required)) {
xassert(data_get_type(dst) == DATA_TYPE_NULL);
} else if ((pt->model == PARSER_MODEL_ARRAY) ||
- (pt->obj_openapi == OPENAPI_FORMAT_OBJECT)) {
+ (pt->obj_openapi == OPENAPI_FORMAT_OBJECT)) {
/*
* OpenAPI clients can't handle a null instead of an
* object. Work around by placing an empty dictionary
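
The practical effect on a serialized response, sketched with the field name
taken from the NEWS entry below (values are hypothetical):

    # Before: a NULL, non-required pointer field dumped as an empty object,
    # which generated OpenAPI clients rejected during validation:
    #   "job_resources": {}
    # After: the same field dumps as null, which those clients accept:
    #   "job_resources": null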
From 637f251eac8c9139bd4986fc0cc369da005a09f0 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Wed, 13 Nov 2024 10:34:41 -0700
Subject: [PATCH 53/90] Add NEWS entry for last 3 commits
Ticket 21341
---
NEWS | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/NEWS b/NEWS
index a6152e0a41a..1cade4d5f2f 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,10 @@ documents those changes that are of interest to users and administrators.
removal of a dynamic node.
-- gpu/nvml - Attempt loading libnvidia-ml.so.1 as a fallback for failure in
loading libnvidia-ml.so.
+ -- slurmrestd - Fix populating non-required object fields of objects as '{}' in
+ JSON/YAML instead of 'null' causing compiled OpenAPI clients to reject
+ the response to 'GET /slurm/v0.0.40/jobs' due to validation failure of
+ '.jobs[].job_resources'.
* Changes in Slurm 24.05.4
==========================
From 2ff6059881800694c3331c3905b2619e2af42e80 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Thu, 14 Nov 2024 12:08:00 -0700
Subject: [PATCH 54/90] Testsuite - Avoid false failure in test_112_41
Ticket 21341
---
testsuite/python/tests/test_112_41.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/testsuite/python/tests/test_112_41.py b/testsuite/python/tests/test_112_41.py
index 1d529b1fc7c..f65b39ff7e7 100644
--- a/testsuite/python/tests/test_112_41.py
+++ b/testsuite/python/tests/test_112_41.py
@@ -1001,9 +1001,11 @@ def test_jobs(slurm, slurmdb):
resp = slurm.slurm_v0041_post_job(str(jobid), v0041_job_desc_msg=job)
assert not len(resp.warnings)
assert not len(resp.errors)
- for result in resp.results:
- assert result.job_id == jobid
- assert result.error_code == 0
+ # Not all changes populate "results" field
+ if resp.results is not None:
+ for result in resp.results:
+ assert result.job_id == jobid
+ assert result.error_code == 0
resp = slurm.slurm_v0041_get_job(str(jobid))
assert len(resp.warnings) == 0
From 7efb6bb443327c89f695d027d1f61a538c82bd83 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Thu, 14 Nov 2024 12:08:41 -0700
Subject: [PATCH 55/90] Testsuite - Avoid xfail in test_112_41
Ticket 21341
---
testsuite/python/tests/test_112_41.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/testsuite/python/tests/test_112_41.py b/testsuite/python/tests/test_112_41.py
index f65b39ff7e7..cfdf67726af 100644
--- a/testsuite/python/tests/test_112_41.py
+++ b/testsuite/python/tests/test_112_41.py
@@ -928,8 +928,6 @@ def test_db_config(slurmdb):
assert len(resp.errors) == 0
-# TODO: Remove xfail once bug 21341 is fixed
-@pytest.mark.xfail
def test_jobs(slurm, slurmdb):
from openapi_client.models.v0041_job_submit_req import V0041JobSubmitReq
from openapi_client.models.v0041_job_desc_msg import V0041JobDescMsg
From a200dc8c397a2ae59b6ceb2494638d663d0b3478 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Thu, 14 Nov 2024 12:14:46 -0700
Subject: [PATCH 56/90] Testsuite - Wait for slurmdbd to settle in test_112_41
Wait until slurmdbd gets the new job_name to continue.
Ticket 21341
---
testsuite/python/tests/test_112_41.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/testsuite/python/tests/test_112_41.py b/testsuite/python/tests/test_112_41.py
index cfdf67726af..8d197dfa399 100644
--- a/testsuite/python/tests/test_112_41.py
+++ b/testsuite/python/tests/test_112_41.py
@@ -1043,8 +1043,8 @@ def test_jobs(slurm, slurmdb):
assert len(resp.errors) == 0
assert resp.jobs
for job in resp.jobs:
- if job.name == "allocation":
- # job hasn't settled at slurmdbd yet
+ if job.name != "updated test job":
+ # job change hasn't settled at slurmdbd yet
requery = True
else:
requery = False
From 4d0680564edc1e9316f9fddc04f417bf2f694d68 Mon Sep 17 00:00:00 2001
From: Alejandro Sanchez
Date: Wed, 20 Nov 2024 21:45:20 +0100
Subject: [PATCH 57/90] Fix sstat/sattach regressions for steps in higher
version slurmds.
The sstat regression was introduced in 20.11.0rc1 commit 906b3cc346 with
respect to functionality introduced in 17.11.10 commit dd7e9b1113.
The sattach regression was introduced in 16.05.1rc1 commit a1ea1e2a2f9.
Ticket 21465
---
NEWS | 2 ++
src/api/job_step_info.c | 3 ++-
src/sattach/sattach.c | 3 ++-
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index 1cade4d5f2f..29ca7d6f585 100644
--- a/NEWS
+++ b/NEWS
@@ -18,6 +18,8 @@ documents those changes that are of interest to users and administrators.
JSON/YAML instead of 'null' causing compiled OpenAPI clients to reject
the response to 'GET /slurm/v0.0.40/jobs' due to validation failure of
'.jobs[].job_resources'.
 -- Fix sstat/sattach protocol errors for steps on higher version slurmds
+ (regressions since 20.11.0rc1 and 16.05.1rc1 respectively).
* Changes in Slurm 24.05.4
==========================
diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c
index 373f389145c..00458feb5b9 100644
--- a/src/api/job_step_info.c
+++ b/src/api/job_step_info.c
@@ -738,7 +738,8 @@ extern int slurm_job_step_stat(slurm_step_id_t *step_id,
memcpy(&req, step_id, sizeof(req));
memcpy(&resp_out->step_id, step_id, sizeof(resp_out->step_id));
- req_msg.protocol_version = use_protocol_ver;
+ req_msg.protocol_version = MIN(SLURM_PROTOCOL_VERSION,
+ use_protocol_ver);
req_msg.msg_type = REQUEST_JOB_STEP_STAT;
req_msg.data = &req;
diff --git a/src/sattach/sattach.c b/src/sattach/sattach.c
index 3c8f7b10e5d..921b6c7bcdd 100644
--- a/src/sattach/sattach.c
+++ b/src/sattach/sattach.c
@@ -408,7 +408,8 @@ static int _attach_to_tasks(slurm_step_id_t stepid,
slurm_msg_set_r_uid(&msg, SLURM_AUTH_UID_ANY);
msg.msg_type = REQUEST_REATTACH_TASKS;
msg.data = &reattach_msg;
- msg.protocol_version = layout->start_protocol_ver;
+ msg.protocol_version = MIN(SLURM_PROTOCOL_VERSION,
+ layout->start_protocol_ver);
if (layout->front_end)
hosts = layout->front_end;
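
With the request now packed at the lower of the client's and the step's
protocol versions, these commands work against steps hosted on a
newer-version slurmd. A sketch (job and step IDs are hypothetical):

    # Previously both failed with protocol errors in this mixed-version case
    sstat --jobs=1234.0 --format=JobID,MaxRSS
    sattach 1234.0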
From 4625559cd4b72df45de6b0345aa0594162a828cd Mon Sep 17 00:00:00 2001
From: Albert Gil
Date: Thu, 14 Nov 2024 18:10:50 +0100
Subject: [PATCH 58/90] Testsuite - Add start_slurmdbd() and stop_slurmdbd()
Ticket 21393
Signed-off-by: Nathan Prisbrey
---
testsuite/python/lib/atf.py | 112 ++++++++++++++++++++++++------------
1 file changed, 74 insertions(+), 38 deletions(-)
diff --git a/testsuite/python/lib/atf.py b/testsuite/python/lib/atf.py
index 334e127226d..05e58df3aa5 100644
--- a/testsuite/python/lib/atf.py
+++ b/testsuite/python/lib/atf.py
@@ -546,6 +546,45 @@ def start_slurmctld(clean=False, quiet=False):
pytest.fail(f"Slurmctld is not running")
+def start_slurmdbd(clean=False, quiet=False):
+ """Starts the Slurm DB daemon (slurmdbd).
+
+ This function may only be used in auto-config mode.
+
+ Args:
+ clean (boolean): If True, clears previous slurmdbd state.
+ quiet (boolean): If True, logging is performed at the TRACE log level.
+
+ Returns:
+ None
+ """
+ if not properties["auto-config"]:
+ require_auto_config("wants to start slurmdbd")
+
+ if (
+ run_command_exit(
+ "sacctmgr show cluster", user=properties["slurm-user"], quiet=quiet
+ )
+ != 0
+ ):
+ # Start slurmdbd
+ results = run_command(
+ f"{properties['slurm-sbin-dir']}/slurmdbd",
+ user=properties["slurm-user"],
+ quiet=quiet,
+ )
+ if results["exit_code"] != 0:
+ pytest.fail(
+ f"Unable to start slurmdbd (rc={results['exit_code']}): {results['stderr']}"
+ )
+
+ # Verify that slurmdbd is running
+ if not repeat_command_until(
+ "sacctmgr show cluster", lambda results: results["exit_code"] == 0
+ ):
+ pytest.fail(f"Slurmdbd is not running")
+
+
def start_slurm(clean=False, quiet=False):
"""Starts all applicable Slurm daemons.
@@ -574,28 +613,7 @@ def start_slurm(clean=False, quiet=False):
get_config_parameter("AccountingStorageType", live=False, quiet=quiet)
== "accounting_storage/slurmdbd"
):
- if (
- run_command_exit(
- "sacctmgr show cluster", user=properties["slurm-user"], quiet=quiet
- )
- != 0
- ):
- # Start slurmdbd
- results = run_command(
- f"{properties['slurm-sbin-dir']}/slurmdbd",
- user=properties["slurm-user"],
- quiet=quiet,
- )
- if results["exit_code"] != 0:
- pytest.fail(
- f"Unable to start slurmdbd (rc={results['exit_code']}): {results['stderr']}"
- )
-
- # Verify that slurmdbd is running
- if not repeat_command_until(
- "sacctmgr show cluster", lambda results: results["exit_code"] == 0
- ):
- pytest.fail(f"Slurmdbd is not running")
+ start_slurmdbd(clean, quiet)
# Remove unnecessary default node0 from config to avoid being used or reserved
output = run_command_output(
@@ -694,6 +712,39 @@ def stop_slurmctld(quiet=False):
pytest.fail("Slurmctld is still running")
+def stop_slurmdbd(quiet=False):
+ """Stops the Slurm DB daemon (slurmdbd).
+
+ This function may only be used in auto-config mode.
+
+ Args:
+ quiet (boolean): If True, logging is performed at the TRACE log level.
+
+ Returns:
+ None
+ """
+
+ if not properties["auto-config"]:
+ require_auto_config("wants to stop slurmdbd")
+
+ # Stop slurmdbd
+ results = run_command(
+ "sacctmgr shutdown", user=properties["slurm-user"], quiet=quiet
+ )
+    if results["exit_code"] != 0:
+        pytest.fail(
+            f"Command \"sacctmgr shutdown\" failed with rc={results['exit_code']}"
+        )
+
+ # Verify that slurmdbd is not running (we might have to wait for rollups to complete)
+ if not repeat_until(
+ lambda: pids_from_exe(f"{properties['slurm-sbin-dir']}/slurmdbd"),
+ lambda pids: len(pids) == 0,
+ timeout=60,
+ ):
+ failures.append("Slurmdbd is still running")
+
+
def stop_slurm(fatal=True, quiet=False):
"""Stops all applicable Slurm daemons.
@@ -728,22 +779,7 @@ def stop_slurm(fatal=True, quiet=False):
get_config_parameter("AccountingStorageType", live=False, quiet=quiet)
== "accounting_storage/slurmdbd"
):
- # Stop slurmdbd
- results = run_command(
- "sacctmgr shutdown", user=properties["slurm-user"], quiet=quiet
- )
- if results["exit_code"] != 0:
- failures.append(
- f"Command \"sacctmgr shutdown\" failed with rc={results['exit_code']}"
- )
-
- # Verify that slurmdbd is not running (we might have to wait for rollups to complete)
- if not repeat_until(
- lambda: pids_from_exe(f"{properties['slurm-sbin-dir']}/slurmdbd"),
- lambda pids: len(pids) == 0,
- timeout=60,
- ):
- failures.append("Slurmdbd is still running")
+ stop_slurmdbd(quiet)
# Stop slurmctld and slurmds
results = run_command(
From 9da3a8273b55c1a3f38cbe8a2c5550459be5a2ad Mon Sep 17 00:00:00 2001
From: Nathan Prisbrey
Date: Thu, 14 Nov 2024 08:51:06 +0000
Subject: [PATCH 59/90] Testsuite - Rewrite test21.39 as test_102_5
Also changed how edge case is tested, eliminating the need to rely on
race conditions naturally because we manually create the condition by
stopping and starting slurmctld and slurmdbd ourselves.
Ticket 21393
---
testsuite/README | 2 +-
testsuite/expect/test21.39 | 439 ---------------------------
testsuite/python/tests/test_102_5.py | 354 +++++++++++++++++++++
3 files changed, 355 insertions(+), 440 deletions(-)
delete mode 100755 testsuite/expect/test21.39
create mode 100644 testsuite/python/tests/test_102_5.py
diff --git a/testsuite/README b/testsuite/README
index db15b3f8873..ce9d2e9d553 100644
--- a/testsuite/README
+++ b/testsuite/README
@@ -592,7 +592,6 @@ test21.35 Validate DenyOnLimit QoS flag is enforced on QoS and Associations.
test21.36 Validate that sacctmgr lost jobs fixes lost jobs.
test21.37 sacctmgr show and clear stats
test21.38 sacctmgr modify limits for nested accounts with multiple users
-test21.39 sacctmgr create qos/account job and then delete account/qos
test21.40 Test association plus partition/job QoS unique node limits enforced
test21.41 sacctmgr update job set newwckey=
test21.42 Test if headers returned by sacctmgr show can be used as format= specifiers
@@ -793,6 +792,7 @@ test_102_# Testing of sacctmgr options.
test_102_1 /commands/sacctmgr/test_federation.py
test_102_2 /commands/sacctmgr/test_--usage.py
test_102_3 /commands/sacctmgr/test_--json.py
+test_102_5 sacctmgr create qos/account job and then delete account/qos
test_103_# Testing of salloc options.
=======================================
diff --git a/testsuite/expect/test21.39 b/testsuite/expect/test21.39
deleted file mode 100755
index 884d1623aad..00000000000
--- a/testsuite/expect/test21.39
+++ /dev/null
@@ -1,439 +0,0 @@
-#!/usr/bin/env expect
-############################################################################
-# Purpose: Test of Slurm functionality
-# sacctmgr create qos/account job and then delete account/qos
-############################################################################
-# Copyright (C) 2019 SchedMD LLC.
-# Written by Nathan Rini
-# All rights reserved.
-#
-# This file is part of Slurm, a resource management program.
-# For details, see .
-# Please also read the included file: DISCLAIMER.
-#
-# Slurm is free software; you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free
-# Software Foundation; either version 2 of the License, or (at your option)
-# any later version.
-#
-# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along
-# with Slurm; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-############################################################################
-source ./globals
-source ./globals_accounting
-
-set part_name "${test_name}_part"
-set ta1 "${test_name}-account.1"
-set ta2 "${test_name}-account.2"
-set tu1 [get_my_user_name]
-set tq1 "${test_name}-qos.1"
-set tq2 "${test_name}-qos.2"
-set job_id_1 0
-
-# account options
-array set acct_1 {}
-array set acct_2 {}
-
-# user options
-array set user_req_1 {}
-set user_req_1(Account) $ta1
-set user_req_1(Qos) "$tq1,$tq2"
-array set user_req_2 {}
-set user_req_2(Account) $ta2
-set user_req_2(Qos) "$tq1,$tq2"
-
-# qos options
-array set qos_1 {}
-array set qos_2 {}
-
-set access_err 0
-
-# Get the location of the slurm.conf file
-set config_dir [get_conf_path]
-set config_file $config_dir/slurm.conf
-
-# TODO: This test is just too error-prone to be useful (t21393)
-skip "This test is temporally disabled to avoid testsuite noise (t21393)"
-
-#
-# Verify preconditions
-#
-if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
- skip "This test can't be run without a usable AccountStorageType"
-}
-
-if {![param_contains [get_config_param "AccountingStorageEnforce"] "associations"]} {
- skip "This test can't be run without AccountingStorageEnforce=associations"
-}
-
-if {![param_contains [get_config_param "AccountingStorageEnforce"] "qos"]} {
- skip "This test can't be run without AccountingStorageEnforce=qos"
-}
-
-if {[param_contains [get_config_param "JobContainerType"] "*tmpfs"]} {
- skip "This test can't be run with JobContainerType=job_container/tmpfs"
-}
-
-if {[get_admin_level] ne "Administrator"} {
- skip "This test can't be run without being an Accounting administrator.\nUse: sacctmgr mod user \$USER set admin=admin"
-}
-
-proc check_rc { rc } {
- if {$rc != 0} {
- fail "Subcommand failed with return code $rc"
- }
-}
-
-# Create test assoc and accounts
-proc create_accounts {} {
- global ta1 ta2 tq1 tq2 tu1 user_req_1 user_req_2
- global qos_1 qos_2
- log_debug "Create account and QOS"
-
-
- # Create test assoc and accounts
- check_rc [add_qos $tq1 [array get qos_1]]
- check_rc [add_qos $tq2 [array get qos_2]]
- check_rc [add_acct $ta1 [array get acct_1]]
- check_rc [add_acct $ta2 [array get acct_2]]
- check_rc [add_user $tu1 [array get user_req_1]]
- check_rc [add_user $tu1 [array get user_req_2]]
-}
-
-# Cleanup test assoc and accounts
-proc cleanup_accounts {} {
- global ta1 ta2 tq1 tq2
-
- wait_for_account_done $ta1,$ta2
-
- log_debug "Remove QOS: $tq1,$tq2"
- remove_qos $tq1,$tq2
-
- log_debug "Remove account: $ta1,$ta2"
- remove_acct "" $ta1,$ta2
-}
-
-proc cleanup { } {
- global job_id_1 config_file part_name
-
- cancel_job $job_id_1
- wait_for_part_done $part_name
-
- cleanup_accounts
-
- restore_conf $config_file
- reconfigure
-}
-
-proc test_salloc { qos account num_nodes } {
- global salloc number part_name bin_bash
-
- set rc -12345
- set job_id 0
-
- spawn $salloc -p$part_name --exclusive -q$qos -A$account -N$num_nodes $bin_bash
- expect {
- -re "allocation ($number)" {
- set job_id $expect_out(1,string)
- }
- -re "error" {
- fail "salloc job was not submitted"
- }
- timeout {
- fail "salloc not responding"
- }
- }
-
- if { $job_id == 0 } {
- fail "Submit failure"
- }
-
- return $job_id
-}
-
-proc check_job_reason { job_id state why } {
- global scontrol re_word_str re_word_str
-
- set found_why ""
- set state_found ""
-
- log_user 0
- spawn $scontrol show job $job_id
- expect {
- -re "State=($re_word_str)" {
- set state_found $expect_out(1,string)
- exp_continue
- }
- -re "Reason=($re_word_str)" {
- set found_why $expect_out(1,string)
- exp_continue
- }
- timeout {
- fail "scontrol not responding"
- }
- eof {
- lassign [wait] pid spawnid os_error_flag rc
- }
- }
-
- log_user 1
- if { $state_found != $state } {
- fail "Job ($job_id) state found was $state_found, expected $state"
- }
-
- set found_reason 0
- foreach iwhy $why {
- if { $found_why == $iwhy } {
- set found_reason 1
- break
- }
- }
-
- if { !$found_reason } {
- fail "Job ($job_id) scontrol returned Reason=$found_why instead of Reason='$why'"
- }
-
- log_debug "Found jobid $job_id with correct state '$state' and reason '$found_why'"
- return 0
-}
-
-proc run_test { run_qos } {
- global scontrol tq1 tq2 ta1 ta2 part_name
- # TODO: Temporary globals for the extra debug info for bug10604
- global squeue sinfo sacctmgr
-
- if { $run_qos } {
- set qos $tq1
- set acct $ta2
- set update_line "qos=$tq2"
- set reason "InvalidQOS"
- } else {
- set qos $tq1
- set acct $ta1
- set update_line "account=$ta2"
- set reason "InvalidAccount"
- }
-
- set job_id_1 [test_salloc $qos $acct 1]
- set job_id_2 [test_salloc $qos $acct 1]
- # TODO Temporary debug to troubleshoot bug 10604 (revert once fixed)
- # check_rc [wait_for_job $job_id_1 "RUNNING"]
- set rc [wait_for_job $job_id_1 "RUNNING"]
- if {$rc} {
- log_warn "Job never started, extra debug information for bug10604 below before the actual fail:"
- run_command "$squeue"
- run_command "$sinfo"
- run_command "$scontrol show job"
- run_command "$scontrol show node"
- run_command "$scontrol show partition"
- run_command "$sacctmgr show qos -p"
- fail "Subcommand failed with exit code $rc"
- }
-
- if { $run_qos } {
- # Remove test qos
- if [remove_qos $qos] {
- log_debug "We hit the race trying to get the job running before it hits the database before we removed the qos. This can be expected, trying again."
- wait_for_part_done $part_name
- return 1
- }
- } else {
- # Remove test acct
- if [remove_acct "" $acct] {
- log_debug "We hit the race trying to get the job running before it hits the database before we removed the account. This can be expected, trying again."
- wait_for_part_done $part_name
- return 1
- }
- }
-
- # Verify jobs state and reason
- check_job_reason $job_id_1 "RUNNING" [list "None"]
- check_job_reason $job_id_2 "PENDING" [list "$reason"]
-
- # Update pending job to make it runnable, updating the running job isn't
- # possible but it tests other code if you are watching the log file
- spawn $scontrol update jobid=$job_id_1 $update_line
- spawn $scontrol update jobid=$job_id_2 $update_line
-
- sleep 5
- # Check reasons after account alter
- check_job_reason $job_id_1 "RUNNING" [list "None"]
- check_job_reason $job_id_2 "PENDING" [list "Resources" "None"]
-
- # Cleanup jobs
- wait_for_part_done $part_name
-
- return 0
-}
-
-cleanup_accounts
-create_accounts
-
-#
-# Copy slurm.conf file
-#
-save_conf $config_file
-
-# Comment out PrologFlags in the slurm.conf
-exec $bin_sed -i {s/^\(PrologFlags=\)/#\1/gI} $config_file
-
-# TODO: Temporarily increase logging to debug bug 10604 (remove once fixed)
-run_command -none "$bin_echo SlurmctldDebug=debug3 >> $config_file"
-# Allow the test's existing reconfigure call to establish these values
-
-reconfigure -fail
-
-delete_part $part_name
-
-if [create_part $part_name 1] {
- fail "Unable to create partition ($part_name)"
-}
-
-for {set i 0} {$i < 2} {incr i} {
- set cnt 0
- set rc 1
- # First lets test against removing an account and then qos
- # from a running and pending job since partition only has 1 so
- # one of these should run and the second should pend
- while { $rc && ($cnt < 10) } {
- set rc [run_test $i]
- incr cnt
- }
- if { $rc } {
- set credential [expr {$i == 0 ? "account" : "qos"}]
- fail "Too many ($cnt) failures trying to remove $credential from job"
- }
-}
-
-log_info "Testing that PD jobs that lost their QOS will not lunch until restored"
-
-# Return to clean slate
-cleanup_accounts
-
-# Add test qos
-run_command -fail "$sacctmgr -vi add qos $tq1"
-
-# Add test account
-run_command -fail "$sacctmgr -vi add account name=$ta1 set qos=normal,$tq1"
-
-# Add test user to account
-run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta1"
-
-# Check what we just added to make sure it is there
-if {![regexp -line "^\\S+\\|$ta1\\|$tu1\\|normal,$tq1" [run_command_output "$sacctmgr -n -P show assoc format=cluster,account,user,qos"]]} {
- fail "Association with the right account, user and qos was not found"
-}
-
-# Submit a delayed job that wants to run in the test qos
-regexp {Submitted batch job (\d+)} [run_command_output -fail "$sbatch --begin=now+2 --qos=$tq1 -A $ta1 --wrap \"$srun sleep 10\" -o none -e none"] {} job_id_1
-
-# The first job should get queued
-wait_for_job -fail $job_id_1 "PENDING"
-
-# Remove the test qos out from under the job
-run_command "$sacctmgr -vi modify account name=$ta1 user=$tu1 set qos=normal"
-
-# Submit a second delayed job requesting the test qos. It should be rejected
-if {![run_command_status -none "$sbatch --begin=now+2 --qos=$tq1 -A $ta1 --wrap \"$srun sleep 10\" -o none -e none"]} {
- fail "Job submitted with unassociated qos should have failed"
-} else {
- log_debug "The preceding job failure was expected"
-}
-
-# Wait for the first job to pend with Reason=InvalidQOS
-set condition_matched false
-wait_for {$condition_matched} {
- set scontrol_out [run_command_output -fail "$scontrol -o show job $job_id_1"]
- regexp {JobState=([^ ]+) Reason=([^ ]+)} $scontrol_out {} job_state reason
- if {$job_state eq "PENDING" && $reason eq "InvalidQOS"} {
- set condition_matched true
- }
-}
-subtest {$condition_matched} "Job ($job_id_1) should be PD with Reason=InvalidQOS" "JobState=$job_state, Reason=$reason"
-
-# Add back the test qos
-run_command -fail "$sacctmgr -vi modify account name=$ta1 user=$tu1 set qos=normal,$tq1"
-
-# Wait for the first job to begin running
-wait_for_job -fail $job_id_1 "RUNNING"
-
-log_info "Testing that PD jobs that lost their Account will not resume until restored"
-
-# Return to clean slate
-cleanup_accounts
-
-# Add test qos
-run_command -fail "$sacctmgr -vi add qos $tq1"
-
-# Add test accounts
-run_command -fail "$sacctmgr -vi add account name=$ta1 set qos=normal,$tq1"
-run_command -fail "$sacctmgr -vi add account name=$ta2 set qos=normal,$tq1"
-
-# Add test user to accounts
-run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta1"
-run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta2"
-
-# Check what we just added to make sure it is there
-if {![regexp -line "^\\S+\\|$ta1\\|$tu1\\|normal,$tq1" [run_command_output "$sacctmgr -n -P show assoc format=cluster,account,user,qos"]]} {
- fail "QOS ($tq1) has not been added to account ($ta1)"
-}
-if {![regexp -line "^\\S+\\|$ta2\\|$tu1\\|normal,$tq1" [run_command_output "$sacctmgr -n -P show assoc format=cluster,account,user,qos"]]} {
- fail "QOS ($tq1) has not been added to account ($ta2)"
-}
-
-# We may have to try this multiple times since we are relying on a race
-# condition. If the idle job gets written to the job accounting records
-# before the account is deleted, the account deletion will fail saying
-# that it cannot delete an account with active jobs.
-wait_for -fail {$delete_account_status == 0} {
- # Submit a delayed job that wants to run with the test qos and test account 2
- regexp {Submitted batch job (\d+)} [run_command_output -fail "$sbatch --begin=now+2 --qos=$tq1 -A $ta2 --wrap \"$srun sleep 10\" -o none -e none"] {} job_id_1
-
- # The first job should get queued
- wait_for_job -fail $job_id_1 "PENDING"
-
- # Remove the test account out from under the job
- set delete_account_results [run_command "$sacctmgr -vi delete account name=$ta2"]
- set delete_account_status [dict get $delete_account_results exit_code]
- set delete_account_output [dict get $delete_account_results output]
-
- # Mitigation if the account deletion fails
- if {$delete_account_status != 0} {
- if [regexp {Error with request: Job\(s\) active} $delete_account_output] {
- cancel_job $job_id_1
- } else {
- fail "Failure deleting account ($ta2): $delete_account_output"
- }
- }
-}
-
-# Submit a second delayed job requesting test account 2. It should be rejected
-if {![run_command_status -none "$sbatch --begin=now+2 --qos=$tq1 -A $ta2 --wrap \"$srun sleep 10\" -o none -e none"]} {
- fail "Job submitted with unassociated account should have failed"
-} else {
- log_debug "The preceding job failure was expected"
-}
-
-# Wait for the first job to pend with Reason=InvalidAccount
-set condition_matched false
-wait_for {$condition_matched} {
- set scontrol_out [run_command_output -fail "$scontrol -o show job $job_id_1"]
- regexp {JobState=([^ ]+) Reason=([^ ]+)} $scontrol_out {} job_state reason
- if {$job_state eq "PENDING" && $reason eq "InvalidAccount"} {
- set condition_matched true
- }
-}
-subtest {$condition_matched} "Job ($job_id_1) should be PD with Reason=InvalidAccount" "JobState=$job_state, Reason=$reason"
-
-# Add back the test account
-run_command -fail "$sacctmgr -vi add account name=$ta2 set qos=normal,$tq1"
-run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta2"
-
-# Wait for the first job to begin running
-wait_for_job -fail $job_id_1 "RUNNING"
diff --git a/testsuite/python/tests/test_102_5.py b/testsuite/python/tests/test_102_5.py
new file mode 100644
index 00000000000..9fa8bf91036
--- /dev/null
+++ b/testsuite/python/tests/test_102_5.py
@@ -0,0 +1,354 @@
+############################################################################
+# Copyright (C) SchedMD LLC.
+############################################################################
+import atf
+import pytest
+
+# Global variables
+qos1 = "qos1"
+qos2 = "qos2"
+acct1 = "acct1"
+acct2 = "acct2"
+acct = "acct"
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup():
+ """Test setup with required configurations."""
+ atf.require_auto_config("Manually creating and deleting qoses and accounts")
+ atf.require_config_parameter("AccountingStorageType", "accounting_storage/slurmdbd")
+ atf.require_config_parameter_includes("AccountingStorageEnforce", "associations")
+ atf.require_config_parameter_includes("AccountingStorageEnforce", "qos")
+ atf.require_slurm_running()
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_db():
+ # Create test QOS and account
+ atf.run_command(
+ f"sacctmgr -i add qos {qos1},{qos2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+ atf.run_command(
+ f"sacctmgr -i add account {acct},{acct1},{acct2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+ atf.run_command(
+ f"sacctmgr -i add user {atf.get_user_name()} DefaultAccount={acct} account={acct1},{acct2} qos=normal,{qos1},{qos2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+ yield
+
+ atf.cancel_all_jobs(fatal=True)
+
+ atf.run_command(
+ f"sacctmgr -i remove user {atf.get_user_name()} {acct1},{acct2}",
+ user=atf.properties["slurm-user"],
+ quiet=True,
+ )
+ atf.run_command(
+ f"sacctmgr -i remove account {acct1},{acct2}",
+ user=atf.properties["slurm-user"],
+ quiet=True,
+ )
+ atf.run_command(
+ f"sacctmgr -i remove qos {qos1},{qos2}",
+ user=atf.properties["slurm-user"],
+ quiet=True,
+ )
+
+
+def submit_job_with(extra_params, xfail=False, fatal=False):
+ """Submit a job with specified extra params."""
+ return atf.submit_job_sbatch(
+ f"{extra_params} -N1 --wrap='sleep 300'", xfail=xfail, fatal=fatal
+ )
+
+
+def test_qos_removal_single():
+ """Test that removal of a single QOS:
+ 1. Marks pending jobs with InvalidQOS
+ 2. Rejects new job submissions using removed QOS
+ """
+
+ # Stop slurmdbd and submit job
+ atf.stop_slurmdbd(quiet=True)
+ job_id = submit_job_with(f"--qos={qos1}", fatal=True)
+
+ # Stop slurmctld, start slurmdbd, remove QOS
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove qos {qos1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # TODO: Unnecessary for 24.05+ (Bug 21393)
+ atf.run_command(
+ f"sacctmgr -i modify user {atf.get_user_name()} set qos={qos2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify job state/reason
+ atf.start_slurmctld(quiet=True)
+
+ assert atf.wait_for_job_state(
+ job_id, "PENDING", desired_reason="InvalidQOS"
+ ), f"Job {job_id} not in PENDING state with InvalidQOS reason"
+
+ # Try to submit a new job with removed QOS - should be rejected
+ assert (
+ submit_job_with(f"--qos={qos1}", xfail=True) == 0
+ ), f"Job submission with removed QOS {qos1} should have failed but got job id {job_id2}"
+
+ # Submit a job with remaining QOS - should succeed
+ assert (
+ submit_job_with(f"--qos={qos2}") != 0
+ ), f"Job submission with valid QOS {qos2} should have succeeded"
+
+
+def test_qos_removal_multiple():
+ """Test QOS removal when user has multiple QOS access:
+ 1. Verifies jobs with removed QOS get marked InvalidQOS
+ 2. Verifies jobs with remaining QOS stay valid
+ 3. Verifies new job submissions with removed QOS are rejected
+ 4. Verifies new job submissions with remaining QOS succeed
+ """
+
+ # Stop slurmdbd and submit jobs with different QOSs
+ atf.stop_slurmdbd(quiet=True)
+ job_id1 = submit_job_with(f"--qos={qos1}", fatal=True)
+ job_id2 = submit_job_with(f"--qos={qos2}", fatal=True)
+
+ # Stop slurmctld, start slurmdbd, remove first QOS
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove qos {qos1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # TODO: Unnecessary for 24.05+ (Bug 21393)
+ atf.run_command(
+ f"sacctmgr -i modify user {atf.get_user_name()} set qos={qos2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify jobs state/reason
+ atf.start_slurmctld(quiet=True)
+
+ # First job should be PENDING with InvalidQOS
+ assert atf.wait_for_job_state(
+ job_id1, "PENDING", desired_reason="InvalidQOS"
+ ), f"Job {job_id1} not in PENDING state and InvalidQOS reason"
+
+ # Second job should stay PENDING with different reason
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING", timeout=10
+ ), f"Job {job_id2} should be in PENDING state"
+ assert (
+ atf.get_job_parameter(job_id2, "Reason") != "InvalidQOS"
+ ), "The second job whose QOS was not deleted should not be pending due to 'InvalidQOS'"
+
+ # Try to submit a new job with removed QOS - should be rejected
+ assert (
+ submit_job_with(f"--qos={qos1}", xfail=True) == 0
+ ), f"Job submission with removed QOS {qos1} should have failed"
+
+ # Submit a job with remaining QOS - should succeed
+ assert (
+ submit_job_with(f"--qos={qos2}") != 0
+ ), f"Job submission with valid QOS {qos2} should have succeeded"
+
+
+def test_qos_removal_running_vs_pending():
+ """Test QOS removal impact on running vs pending jobs:
+ 1. Submit two jobs with same QOS - one running, one pending
+ 2. Remove the QOS
+ 3. Verify running job continues running
+ 4. Verify pending job gets marked with InvalidQOS
+ """
+
+ # Stop slurmdbd and submit jobs - use exclusive to ensure only one can run
+ atf.stop_slurmdbd(quiet=True)
+ job_id1 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
+ job_id2 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
+
+ # Wait for first job to start running
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Job {job_id1} never started running"
+
+ # Verify second job is pending (due to exclusive)
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING"
+ ), f"Job {job_id2} should be pending"
+
+ # Stop slurmctld, start slurmdbd, remove QOS
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove qos {qos1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # TODO: Unnecessary for 24.05+ (Bug 21393)
+ atf.run_command(
+ f"sacctmgr -i modify user {atf.get_user_name()} set qos={qos2}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify jobs state/reason
+ atf.start_slurmctld(quiet=True)
+
+ # Running job should continue running
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Previously running job {job_id1} should stay RUNNING"
+ assert (
+ atf.get_job_parameter(job_id1, "Reason") == "None"
+ ), f"Running job {job_id1} should have 'None' as reason"
+
+ # Pending job should be marked with InvalidQOS
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING", desired_reason="InvalidQOS"
+ ), f"Pending job {job_id2} should be PENDING with InvalidQOS reason"
+
+
+def test_account_removal_single():
+ """Test that removal of a single account:
+ 1. Marks pending jobs with InvalidAccount
+ 2. Rejects new job submissions using removed account
+ """
+
+ # Stop slurmdbd and submit job
+ atf.stop_slurmdbd(quiet=True)
+ job_id = submit_job_with(f"--account={acct1}", fatal=True)
+
+ # Stop slurmctld, start slurmdbd, remove account
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove account {acct1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify job state/reason
+ atf.start_slurmctld(quiet=True)
+
+ assert atf.wait_for_job_state(
+ job_id, "PENDING", desired_reason="InvalidAccount"
+ ), f"Job {job_id} not in PENDING state and InvalidAccount reason"
+
+ # Try to submit a new job with removed account - should be rejected
+ assert (
+ submit_job_with(f"--account={acct1}", xfail=True) == 0
+ ), f"Job submission with removed account {acct1} should have failed"
+
+
+def test_account_removal_multiple():
+ """Test account removal when user has multiple account access:
+ 1. Verifies jobs with removed account get marked InvalidAccount
+ 2. Verifies jobs with remaining account stay valid
+ 3. Verifies new job submissions with removed account are rejected
+ 4. Verifies new job submissions with remaining account succeed
+ """
+
+ # Stop slurmdbd and submit jobs with different accounts
+ atf.stop_slurmdbd(quiet=True)
+ job_id1 = submit_job_with(f"--account={acct1}", fatal=True)
+ job_id2 = submit_job_with(f"--account={acct2}", fatal=True)
+
+ # Stop slurmctld, start slurmdbd, remove first account
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove account {acct1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify jobs state/reason
+ atf.start_slurmctld(quiet=True)
+
+ # First job should be PENDING with InvalidAccount
+ assert atf.wait_for_job_state(
+ job_id1, "PENDING", desired_reason="InvalidAccount"
+ ), f"Job {job_id1} not in PENDING state and InvalidAccount reason"
+
+ # Second job should stay PENDING with different reason
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING", timeout=10
+ ), f"Job {job_id2} should be in PENDING state"
+ assert (
+ atf.get_job_parameter(job_id2, "Reason") != "InvalidAccount"
+ ), "The second job whose account was not deleted should not be pending due to 'InvalidAccount'"
+
+ # Try to submit a new job with removed account - should be rejected
+ assert (
+ submit_job_with(f"--account={acct1}", xfail=True) == 0
+ ), f"Job submission with removed account {acct1} should have failed"
+
+ # Submit a job with remaining account - should succeed
+ assert (
+ submit_job_with(f"--account={acct2}") != 0
+ ), f"Job submission with valid account {acct2} should have succeeded"
+
+
+def test_account_removal_running_vs_pending():
+ """Test account removal impact on running vs pending jobs:
+ 1. Submit two jobs with same account - one running, one pending
+ 2. Remove the account
+ 3. Verify running job continues running
+ 4. Verify pending job gets marked with InvalidAccount
+ """
+
+ # Stop slurmdbd and submit jobs - use exclusive to ensure only one can run
+ atf.stop_slurmdbd(quiet=True)
+ job_id1 = submit_job_with(f"--account={acct1} --exclusive", fatal=True)
+ job_id2 = submit_job_with(f"--account={acct1} --exclusive", fatal=True)
+
+ # Wait for first job to start running
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Job {job_id1} never started running"
+
+ # Verify second job is pending (due to exclusive)
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING"
+ ), f"Job {job_id2} should be pending"
+
+ # Stop slurmctld, start slurmdbd, remove account
+ atf.stop_slurmctld(quiet=True)
+ atf.start_slurmdbd(quiet=True)
+ atf.run_command(
+ f"sacctmgr -i remove account {acct1}",
+ user=atf.properties["slurm-user"],
+ fatal=True,
+ )
+
+ # Start slurmctld and verify jobs state/reason
+ atf.start_slurmctld(quiet=True)
+
+ # Running job should continue running
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Previously running job {job_id1} should stay RUNNING"
+ assert (
+ atf.get_job_parameter(job_id1, "Reason") == "None"
+ ), f"Running job {job_id1} should have 'None' as reason"
+
+ # Pending job should be marked with InvalidAccount
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING", desired_reason="InvalidAccount"
+ ), f"Pending job {job_id2} should be PENDING with InvalidAccount reason"
From 949b550230d9756324aca714f85f63a45bff2214 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Wed, 13 Nov 2024 10:11:41 -0700
Subject: [PATCH 60/90] Avoid slurmd crashing after failing to unpack
SlurmdSpoolDir/cred_state
If a newer version of slurmd has been started and has upgraded the file
formats in SlurmdSpoolDir, trying to start an older version of slurmd can
result in an unpack failure. The cred_job_list and/or cred_state_list would
then be NULL. Calling list_delete_all() on a NULL list crashes. Instead of
crashing, log a warning and return.
Issue 50154, Ticket 21432
---
NEWS | 3 +++
src/slurmd/slurmd/cred_context.c | 18 ++++++++++++++++--
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index 29ca7d6f585..fc583d5587b 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ documents those changes that are of interest to users and administrators.
'.jobs[].job_resources'.
-- Fix sstat/sattach protocol errors for steps on higher version slurmd's
(regressions since 20.11.0rc1 and 16.05.1rc1 respectively).
+ -- slurmd - Avoid a crash when starting slurmd version 24.05 with
+ SlurmdSpoolDir files that have been upgraded to a newer major version of
+ Slurm. Log warnings instead.
* Changes in Slurm 24.05.4
==========================
diff --git a/src/slurmd/slurmd/cred_context.c b/src/slurmd/slurmd/cred_context.c
index 998c24cf838..8df077ffc65 100644
--- a/src/slurmd/slurmd/cred_context.c
+++ b/src/slurmd/slurmd/cred_context.c
@@ -124,7 +124,14 @@ static job_state_t *_find_job_state(uint32_t jobid)
static void _clear_expired_job_states(void)
{
- time_t now = time(NULL);
+ time_t now;
+
+ if (!cred_job_list) {
+ warning("No cred_job_list, unable to clear expired job states");
+ return;
+ }
+
+ now = time(NULL);
list_delete_all(cred_job_list, _list_find_expired_job_state, &now);
}
@@ -140,7 +147,14 @@ static int _list_find_expired_cred_state(void *x, void *key)
static void _clear_expired_credential_states(void)
{
- time_t now = time(NULL);
+ time_t now;
+
+ if (!cred_state_list) {
+ warning("No cred_state_list, unable to clear expired credential states");
+ return;
+ }
+
+ now = time(NULL);
list_delete_all(cred_state_list, _list_find_expired_cred_state, &now);
}
From c6eed2f29b11e4175c79e2f77c6ffc6fb4087af9 Mon Sep 17 00:00:00 2001
From: Brian Christiansen
Date: Mon, 25 Nov 2024 23:25:45 -0700
Subject: [PATCH 61/90] Fix race condition in stepmgr step completion handling
Ticket 21122
Signed-Off-By: Scott Hilton
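The heart of the change is a check-under-lock pattern. A minimal sketch using
the names from the patch below (a hypothetical helper, not the full handler):
job_step_ptr is shared with other stepmgr threads, so it must be tested and
dereferenced only while stepmgr_mutex is held, rather than tested before
taking the lock.

static int _complete_stepmgr_step(void)
{
	int rc;

	slurm_mutex_lock(&stepmgr_mutex);
	if (job_step_ptr) {
		/* safe: the step cannot be torn down while the lock is held */
		rc = SLURM_SUCCESS;
	} else {
		error("stepmgr step completion without a job_step_ptr");
		rc = SLURM_ERROR;
	}
	slurm_mutex_unlock(&stepmgr_mutex);
	return rc;
}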
---
NEWS | 1 +
src/slurmd/slurmstepd/req.c | 58 +++++++++++++++++++++----------------
2 files changed, 34 insertions(+), 25 deletions(-)
diff --git a/NEWS b/NEWS
index fc583d5587b..41825a3cd30 100644
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,7 @@ documents those changes that are of interest to users and administrators.
-- slurmd - Avoid a crash when starting slurmd version 24.05 with
SlurmdSpoolDir files that have been upgraded to a newer major version of
Slurm. Log warnings instead.
+ -- Fix race condition in stepmgr step completion handling.
* Changes in Slurm 24.05.4
==========================
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index e7da47c75dd..5c0e21cbfa1 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -2163,31 +2163,39 @@ _handle_completion(int fd, stepd_step_rec_t *step, uid_t uid)
goto rwfail;
FREE_NULL_BUFFER(buffer);
- if (job_step_ptr && do_stepmgr) {
- int rem = 0;
- uint32_t max_rc;
- slurm_step_id_t temp_id = {
- .job_id = job_step_ptr->job_id,
- .step_het_comp = NO_VAL,
- .step_id = step_id
- };
-
- step_complete_msg_t req = {
- .range_first = first,
- .range_last = last,
- .step_id = temp_id,
- .step_rc = step_rc,
- .jobacct = jobacct
- };
-
- step_partial_comp(&req, uid, true, &rem, &max_rc);
-
- safe_write(fd, &rc, sizeof(int));
- safe_write(fd, &errnum, sizeof(int));
-
- jobacctinfo_destroy(jobacct);
-
- return SLURM_SUCCESS;
+ if (do_stepmgr) {
+ slurm_mutex_lock(&stepmgr_mutex);
+ if (job_step_ptr) {
+ int rem = 0;
+ uint32_t max_rc;
+ slurm_step_id_t temp_id = {
+ .job_id = job_step_ptr->job_id,
+ .step_het_comp = NO_VAL,
+ .step_id = step_id
+ };
+
+ step_complete_msg_t req = {
+ .range_first = first,
+ .range_last = last,
+ .step_id = temp_id,
+ .step_rc = step_rc,
+ .jobacct = jobacct
+ };
+
+ step_partial_comp(&req, uid, true, &rem, &max_rc);
+
+ safe_write(fd, &rc, sizeof(int));
+ safe_write(fd, &errnum, sizeof(int));
+
+ jobacctinfo_destroy(jobacct);
+
+ rc = SLURM_SUCCESS;
+ } else {
+ error("Asked to complete a stepmgr step but we don't have a job_step_ptr. This should never happen.");
+ rc = SLURM_ERROR;
+ }
+ slurm_mutex_unlock(&stepmgr_mutex);
+ return rc;
}
/*
From 6c48f9d89312d24e5789c244807885f35fcbdc86 Mon Sep 17 00:00:00 2001
From: Megan Dahl
Date: Tue, 24 Sep 2024 18:41:44 +0200
Subject: [PATCH 62/90] Fix ctld segfault with job arrays and reserved ports
If MpiParams is set or the resv-ports option is used in combination with job
arrays, the controller will segfault: the node_bitmap being referenced has
already been freed. Additionally, the job record was copied from the
previous array task and thus referenced that task's resv_port_cnt,
resv_ports, and resv_port_array. Make sure that these copied values get
reset to default values before returning from job_array_split().
Ticket 21008.
---
NEWS | 1 +
src/common/port_mgr.c | 48 ++++++++++++++++++++++++++++++++++
src/slurmctld/job_mgr.c | 2 ++
src/slurmctld/node_scheduler.c | 8 ++++--
4 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index 41825a3cd30..a09a8e5b02b 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,7 @@ documents those changes that are of interest to users and administrators.
SlurmdSpoolDir files that have been upgraded to a newer major version of
Slurm. Log warnings instead.
-- Fix race condition in stepmgr step completion handling.
+ -- Fix slurmctld segfault with stepmgr and MpiParams when running a job array.
* Changes in Slurm 24.05.4
==========================
diff --git a/src/common/port_mgr.c b/src/common/port_mgr.c
index c78abc3956b..dc89241a7c9 100644
--- a/src/common/port_mgr.c
+++ b/src/common/port_mgr.c
@@ -342,6 +342,9 @@ static int _resv_port_alloc(uint16_t resv_port_cnt,
static int last_port_alloc = 0;
static int dims = -1;
+ xassert(!*resv_ports);
+ xassert(!*resv_port_array);
+
if (dims == -1)
dims = slurmdb_setup_cluster_dims();
@@ -389,6 +392,28 @@ extern int resv_port_step_alloc(step_record_t *step_ptr)
int rc;
int port_inx;
+ if (step_ptr->resv_port_array || step_ptr->resv_ports) {
+ /*
+ * Both resv_ports and resv_port_array need to be NULL.
+ * If they are not that could lead to resv_ports never being
+ * freed on nodes, eventually making those nodes unable to
+ * schedule jobs since their ports could have been allocated
+ * without being freed. By setting resv_ports and
+ * resv_port_array to NULL in job_array_split() guarantees that,
+ * but try to catch this issue if it happens in future.
+ */
+ error("%pS allocated reserved ports while it already had reserved ports %s",
+ step_ptr, step_ptr->resv_ports);
+
+ /*
+ * We can't just call _resv_port_free() because it is not
+ * guaranteed that the node_bitmap or resv_port_cnt is the same
+ * from when resv_port_array was allocated.
+ */
+ xfree(step_ptr->resv_port_array);
+ xfree(step_ptr->resv_ports);
+ }
+
rc = _resv_port_alloc(step_ptr->resv_port_cnt,
step_ptr->step_node_bitmap, &step_ptr->resv_ports,
&step_ptr->resv_port_array, &port_inx);
@@ -408,6 +433,29 @@ extern int resv_port_job_alloc(job_record_t *job_ptr)
int rc;
int port_inx;
+ if (job_ptr->resv_port_array || job_ptr->resv_ports) {
+ /*
+ * Both resv_ports and resv_port_array need to be NULL.
+ * If they are not that could lead to resv_ports never being
+ * freed on nodes, eventually making those nodes unable to
+ * schedule jobs since their ports could have been allocated
+ * without being freed. By setting resv_ports and
+ * resv_port_array to NULL in job_array_split() guarantees that,
+ * but try to catch this issue if it happens in future.
+ */
+ error("%pJ allocated reserved ports while it already had reserved ports %s. Ports may be lost, which will require a restart of the slurmctld daemon to resolve.",
+ job_ptr, job_ptr->resv_ports);
+
+ /*
+ * We can't just call _resv_port_free() because it is not
+ * guaranteed that the node_bitmap or resv_port_cnt is the same
+ * from when resv_port_array was allocated. A restart of the
+ * controller will restore any lost ports.
+ */
+ xfree(job_ptr->resv_port_array);
+ xfree(job_ptr->resv_ports);
+ }
+
rc = _resv_port_alloc(job_ptr->resv_port_cnt,
job_ptr->node_bitmap, &job_ptr->resv_ports,
&job_ptr->resv_port_array, &port_inx);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index d0c02a94447..a3a04aa2562 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3477,6 +3477,8 @@ extern job_record_t *job_array_split(job_record_t *job_ptr)
job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name);
if (job_ptr->resv_list)
job_ptr_pend->resv_list = list_shallow_copy(job_ptr->resv_list);
+ job_ptr_pend->resv_ports = NULL;
+ job_ptr_pend->resv_port_array = NULL;
job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host);
if (job_ptr->select_jobinfo) {
job_ptr_pend->select_jobinfo =
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 1f1794f1c1f..bda74ea0618 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -2538,6 +2538,7 @@ extern int select_nodes(job_record_t *job_ptr, bool test_only,
{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
List gres_list_pre = NULL;
bool gres_list_pre_set = false;
+ job_record_t *tmp_job;
xassert(job_ptr);
xassert(job_ptr->magic == JOB_MAGIC);
@@ -2942,7 +2943,10 @@ extern int select_nodes(job_record_t *job_ptr, bool test_only,
job_end_time_reset(job_ptr);
- (void) job_array_post_sched(job_ptr);
+ tmp_job = job_array_post_sched(job_ptr);
+ if (tmp_job && (tmp_job != job_ptr) && (orig_resv_port_cnt == NO_VAL16))
+ tmp_job->resv_port_cnt = orig_resv_port_cnt;
+
if (bb_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error_code = ESLURM_INVALID_BURST_BUFFER_REQUEST;
@@ -3101,7 +3105,6 @@ extern int select_nodes(job_record_t *job_ptr, bool test_only,
}
if (error_code != SLURM_SUCCESS) {
- FREE_NULL_BITMAP(job_ptr->node_bitmap);
if (gres_list_pre_set &&
(job_ptr->gres_list_req != gres_list_pre)) {
FREE_NULL_LIST(job_ptr->gres_list_req);
@@ -3114,6 +3117,7 @@ extern int select_nodes(job_record_t *job_ptr, bool test_only,
resv_port_job_free(job_ptr);
xfree(job_ptr->resv_ports);
}
+ FREE_NULL_BITMAP(job_ptr->node_bitmap);
} else
FREE_NULL_LIST(gres_list_pre);
From c90858ccbedf55832349b7d193324298ed4d64f4 Mon Sep 17 00:00:00 2001
From: Matt Ezell
Date: Wed, 30 Oct 2024 18:42:42 -0400
Subject: [PATCH 63/90] Reset a job's priority when it is requeued
When a job is requeued its begin_time is set to now+cred_expire+1, and
the job's stale priority remains until it is eligible to run again AND
the decay thread updates the priority. So recalculate the job's priority
right after it has been requeued.
Ticket 21312
---
NEWS | 1 +
src/slurmctld/job_mgr.c | 4 ++++
2 files changed, 5 insertions(+)
diff --git a/NEWS b/NEWS
index a09a8e5b02b..c248b2ca6a0 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,7 @@ documents those changes that are of interest to users and administrators.
Slurm. Log warnings instead.
-- Fix race condition in stepmgr step completion handling.
-- Fix slurmctld segfault with stepmgr and MpiParams when running a job array.
+ -- Fix requeued jobs keeping their priority until the decay thread happens.
* Changes in Slurm 24.05.4
==========================
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a3a04aa2562..e99e2d58f21 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -15676,6 +15676,10 @@ void batch_requeue_fini(job_record_t *job_ptr)
}
}
+ /* Reset the priority (begin and accrue times were reset) */
+ if (job_ptr->priority != 0)
+ set_job_prio(job_ptr);
+
/*
* If a reservation ended and was a repeated (e.g., daily, weekly)
* reservation, its ID will be different; make sure
From 7eee12524739cabd8c19588f411e0de2d0317517 Mon Sep 17 00:00:00 2001
From: Andy Georges
Date: Wed, 27 Nov 2024 17:28:30 +0100
Subject: [PATCH 64/90] fix: include 24.05.ug branch
---
.github/workflows/build.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 04ee9237cb8..3a69be7cd3c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,9 +2,9 @@ name: slurm C/C++ build
on:
push:
- branches: [ 20.11.ug, 22.05.ug ]
+ branches: [ 20.11.ug, 22.05.ug, 24.05.ug ]
pull_request:
- branches: [ 20.11.ug, 22.05.ug ]
+ branches: [ 20.11.ug, 22.05.ug, 24.05.ug ]
jobs:
build:
From 43572f8428ef4b9bf718208b71bf1794f1f704c2 Mon Sep 17 00:00:00 2001
From: Stephen Kendall
Date: Mon, 25 Nov 2024 12:01:24 -0700
Subject: [PATCH 65/90] Docs - move enable_job_state_cache to
SchedulerParameters
Ticket 21517
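For administrators tracking this relocation: the option is now set via
SchedulerParameters in slurm.conf, e.g. "SchedulerParameters=enable_job_state_cache"
(a minimal sketch; the man page hunk below is authoritative).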
---
doc/man/man5/slurm.conf.5 | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 981b790d750..98ba99e476f 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -4155,6 +4155,13 @@ Enable job steps that span heterogeneous job allocations.
The default value.
.IP
+.TP
+\fBenable_job_state_cache\fR
+Enables an independent cache of job state details within slurmctld. This allows
+processing of `\fBsqueue\fR \-\-only\-job\-state` and replaced RPCs with minimal
+impact on other slurmctld operations.
+.IP
+
.TP
\fBenable_user_top\fR
Enable use of the "scontrol top" command by non\-privileged users.
@@ -4846,13 +4853,6 @@ filenames have no path separators and are located adjacent to slurm.conf.
Glob patterns (See \fBglob\fR (7)) are not supported.
.IP
-.TP
-\fBenable_job_state_cache\fR
-Enables an independent cache of job state details within slurmctld. This allows
-processing of `\fBsqueue\fR \-\-only\-job\-state` and replaced RPCs with minimal
-impact on other slurmctld operations.
-.IP
-
.TP
\fBidle_on_node_suspend\fR
Mark nodes as idle, regardless of current state, when suspending nodes with
From e13128836cfb3d80e9face53a04cdb27df5efda6 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Tue, 26 Nov 2024 19:58:43 -0700
Subject: [PATCH 66/90] Docs - Add docker exceptions to containers guide
Exceptions for 'docker-compose', 'docker swarm',
and docker commands/api from inside containers.
Issue 50171
---
doc/html/containers.shtml | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/doc/html/containers.shtml b/doc/html/containers.shtml
index e8cea0253bd..17b96110074 100644
--- a/doc/html/containers.shtml
+++ b/doc/html/containers.shtml
@@ -515,10 +515,14 @@ the Slurm controller. The container is run by Slurm on the compute nodes which
makes having Docker setup a network isolation layer ineffective for the
container.
docker exec command is not supported.
-docker compose command is not supported.
+docker swarm command is not supported.
+docker compose/docker-compose command is not supported.
docker pause command is not supported.
docker unpause command is not supported.
docker swarm command is not supported.
+All docker commands are not supported inside of containers.
+Docker API is
+ not supported inside of containers.
Setup procedure
From 5502b1bb55e2d37cbf942bf66552d8df6872dbe1 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Tue, 26 Nov 2024 20:25:07 -0700
Subject: [PATCH 67/90] Docs - Add podman exceptions to containers guide
Exceptions for 'podman-compose', 'podman farm'
and podman commands/api from inside containers.
Issue 50171
---
doc/html/containers.shtml | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/doc/html/containers.shtml b/doc/html/containers.shtml
index 17b96110074..7b38b4138f8 100644
--- a/doc/html/containers.shtml
+++ b/doc/html/containers.shtml
@@ -641,8 +641,15 @@ configuration.
host networking
podman exec command is not supported.
+podman-compose command is not supported, due to only being
+ partially implemented. Some compositions may work but each container
+ may be run on different nodes. The network for all containers must be
+ the network_mode: host device.
podman kube command is not supported.
podman pod command is not supported.
+podman farm command is not supported.
+All podman commands are not supported inside of containers.
+Podman REST API is not supported inside of containers.
Setup procedure
From 038eb99a6b8d5be846bc3518faf6a9fca7bf75a6 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Tue, 26 Nov 2024 20:32:37 -0700
Subject: [PATCH 68/90] Docs - Swap docker and podman exceptions to use 'code'
blocks
Issue 50171
---
doc/html/containers.shtml | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/doc/html/containers.shtml b/doc/html/containers.shtml
index 7b38b4138f8..c1b9e75c4dc 100644
--- a/doc/html/containers.shtml
+++ b/doc/html/containers.shtml
@@ -514,13 +514,14 @@ scrun being isolated from the network and not being able to communicate with
the Slurm controller. The container is run by Slurm on the compute nodes which
makes having Docker setup a network isolation layer ineffective for the
container.
-docker exec command is not supported.
-docker swarm command is not supported.
-docker compose/docker-compose command is not supported.
-docker pause command is not supported.
-docker unpause command is not supported.
-docker swarm command is not supported.
-All docker commands are not supported inside of containers.
+docker exec command is not supported.
+docker swarm command is not supported.
+docker compose/docker-compose command is not
+ supported.
+docker pause command is not supported.
+docker unpause command is not supported.
+docker swarm command is not supported.
+All docker commands are not supported inside of containers.
Docker API is
not supported inside of containers.
@@ -640,15 +641,15 @@ configuration.
All containers must use
host networking
-podman exec command is not supported.
-podman-compose command is not supported, due to only being
+podman exec command is not supported.
+podman-compose command is not supported, due to only being
partially implemented. Some compositions may work but each container
may be run on different nodes. The network for all containers must be
the network_mode: host device.
-podman kube command is not supported.
-podman pod command is not supported.
-podman farm command is not supported.
-All podman commands are not supported inside of containers.
+podman kube command is not supported.
+podman pod command is not supported.
+podman farm command is not supported.
+All podman commands are not supported inside of containers.
Podman REST API is not supported inside of containers.
@@ -942,6 +943,6 @@ Overview slides of Sarus are
-Last modified 19 November 2024
+Last modified 27 November 2024
From cc982bc73909f29c4347112bb895804d311d1334 Mon Sep 17 00:00:00 2001
From: Nathan Rini
Date: Tue, 26 Nov 2024 16:28:53 -0700
Subject: [PATCH 69/90] Docs - Add allow userns via apparmor in containers
guide
Ubuntu kernels 23.10+ restrict unprivileged user namespaces, which breaks
rootless podman and docker.
Issue 50169
---
doc/html/containers.shtml | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/doc/html/containers.shtml b/doc/html/containers.shtml
index c1b9e75c4dc..0f5ee7aef6c 100644
--- a/doc/html/containers.shtml
+++ b/doc/html/containers.shtml
@@ -78,7 +78,11 @@ job or any given plugin).
Prerequisites
The host kernel must be configured to allow user land containers:
-$ sudo sysctl -w kernel.unprivileged_userns_clone=1
+
+sudo sysctl -w kernel.unprivileged_userns_clone=1
+sudo sysctl -w kernel.apparmor_restrict_unprivileged_unconfined=0
+sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0
+
Docker also provides a tool to verify the kernel configuration:
$ dockerd-rootless-setuptool.sh check --force
From b5ef918b2cc4db1ffa15a4068556fd53129d3b5c Mon Sep 17 00:00:00 2001
From: Andy Georges
Date: Thu, 5 Dec 2024 15:15:03 +0100
Subject: [PATCH 70/90] fix: add munge development package to gh workflow build
---
.github/workflows/build.yml | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3a69be7cd3c..1bcd969c943 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,22 +2,23 @@ name: slurm C/C++ build
on:
push:
- branches: [ 20.11.ug, 22.05.ug, 24.05.ug ]
+ branches: [20.11.ug, 22.05.ug, 24.05.ug]
pull_request:
- branches: [ 20.11.ug, 22.05.ug, 24.05.ug ]
+ branches: [20.11.ug, 22.05.ug, 24.05.ug]
jobs:
build:
-
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
- - name: configure
- run: ./configure --enable-multiple-slurmd --prefix=/tmp/slurm/
- - name: make
- run: make -j
- - name: make check
- run: make -j check
- - name: make install
- run: make -j install
+ - uses: actions/checkout@v2
+ - name: Install deps
+ run: sudo apt-get install -y libmunge-dev
+ - name: configure
+ run: ./configure --enable-multiple-slurmd --prefix=/tmp/slurm/
+ - name: make
+ run: make -j
+ - name: make check
+ run: make -j check
+ - name: make install
+ run: make -j install
From 17ec21fb8944d404d7582d7ab0b6c8cfeb689771 Mon Sep 17 00:00:00 2001
From: Nathan Prisbrey
Date: Sat, 30 Nov 2024 23:34:21 +0000
Subject: [PATCH 71/90] Testsuite - Improve logging of slurmdbd start/stop
Ticket 21393
Cherry-picked: eca4a64f53
---
testsuite/python/lib/atf.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/testsuite/python/lib/atf.py b/testsuite/python/lib/atf.py
index 647db89c9c4..304716256ec 100644
--- a/testsuite/python/lib/atf.py
+++ b/testsuite/python/lib/atf.py
@@ -561,6 +561,8 @@ def start_slurmdbd(clean=False, quiet=False):
if not properties["auto-config"]:
require_auto_config("wants to start slurmdbd")
+ logging.debug("Starting slurmdbd...")
+
if (
run_command_exit(
"sacctmgr show cluster", user=properties["slurm-user"], quiet=quiet
@@ -583,6 +585,8 @@ def start_slurmdbd(clean=False, quiet=False):
"sacctmgr show cluster", lambda results: results["exit_code"] == 0
):
pytest.fail(f"Slurmdbd is not running")
+ else:
+ logging.debug("Slurmdbd started successfully")
def start_slurm(clean=False, quiet=False):
@@ -727,6 +731,8 @@ def stop_slurmdbd(quiet=False):
if not properties["auto-config"]:
require_auto_config("wants to stop slurmdbd")
+ logging.debug("Stopping slurmdbd...")
+
# Stop slurmdbd
results = run_command(
"sacctmgr shutdown", user=properties["slurm-user"], quiet=quiet
@@ -743,6 +749,8 @@ def stop_slurmdbd(quiet=False):
timeout=60,
):
failures.append("Slurmdbd is still running")
+ else:
+ logging.debug("Slurmdbd stopped successfully")
def stop_slurm(fatal=True, quiet=False):
From abcba2b6120ebaeb1fcd96f11c28e05ed756a688 Mon Sep 17 00:00:00 2001
From: Nathan Prisbrey
Date: Sat, 30 Nov 2024 23:30:45 +0000
Subject: [PATCH 72/90] Testsuite - Fix test_102_5 job race condition
Address a race condition where submitted jobs would occasionally start
RUNNING when the test expected them to be PENDING.
The running_vs_pending tests are no longer necessary.
Ticket 21393
Cherry-picked: d4ebd3f697
---
testsuite/python/tests/test_102_5.py | 300 ++++++++++++++-------------
1 file changed, 152 insertions(+), 148 deletions(-)
diff --git a/testsuite/python/tests/test_102_5.py b/testsuite/python/tests/test_102_5.py
index f6576fe752f..2490fa66cfb 100644
--- a/testsuite/python/tests/test_102_5.py
+++ b/testsuite/python/tests/test_102_5.py
@@ -69,79 +69,127 @@ def submit_job_with(extra_params, xfail=False, fatal=False):
def test_qos_removal_single():
- """Test that removal of a single QOS:
- 1. Marks pending jobs with InvalidQOS
- 2. Rejects new job submissions using removed QOS
+ """Test removing a QOS in use:
+ - Verify that running jobs with that QOS keep running.
+ - Verify that pending jobs are updated to InvalidQOS
+ - Verify that new jobs cannot use the removed QOS.
"""
-
- # Stop slurmdbd and submit job
+ # Stop slurmdbd to avoid the job info being saved in the DB
atf.stop_slurmdbd(quiet=True)
- job_id = submit_job_with(f"--qos={qos1}", fatal=True)
- # Stop slurmctld, start slurmdbd, remove QOS
+ # Submit a blocking job
+ job_id1 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Job {job_id1} never started running"
+
+ # Submit another job on the same node to be blocked (due to exclusive)
+ node = atf.get_job_parameter(job_id1, "NodeList")
+ job_id2 = submit_job_with(f"--qos={qos1} -w {node}", fatal=True)
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING"
+ ), f"Job {job_id2} should be pending"
+
+ # Stop slurmctld before starting slurmdbd to keep the jobs' info out of
+ # the DB, only in slurmctld for the moment.
atf.stop_slurmctld(quiet=True)
atf.start_slurmdbd(quiet=True)
+
+ # Remove the QOS from the DB.
+ # Note that slurmdbd won't have the QOS or the jobs using it, while
+ # slurmctld knows the jobs and still thinks that the QOS exists.
atf.run_command(
f"sacctmgr -i remove qos {qos1}",
user=atf.properties["slurm-user"],
fatal=True,
)
- # Start slurmctld and verify job state/reason
+ # Start slurmctld and verify job states/reasons are as expected now that
+ # the QOS doesn't exist anymore.
atf.start_slurmctld(quiet=True)
+ # Running job should continue running
assert atf.wait_for_job_state(
- job_id, "PENDING", desired_reason="InvalidQOS"
- ), f"Job {job_id} not in PENDING state with InvalidQOS reason"
+ job_id1, "RUNNING", desired_reason="None"
+ ), f"Previously running job {job_id1} should stay RUNNING with 'None' reason"
+
+ # Pending job should be marked with InvalidQOS
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING", desired_reason="InvalidQOS"
+ ), f"Pending job {job_id2} should be PENDING with InvalidQOS reason"
# Try to submit a new job with removed QOS - should be rejected
assert (
submit_job_with(f"--qos={qos1}", xfail=True) == 0
- ), f"Job submission with removed QOS {qos1} should have failed but got job id {job_id2}"
-
- # Submit a job with remaining QOS - should succeed
- assert (
- submit_job_with(f"--qos={qos2}") != 0
- ), f"Job submission with valid QOS {qos2} should have succeeded"
+ ), f"Job submission with removed QOS {qos1} should have failed"
def test_qos_removal_multiple():
"""Test QOS removal when user has multiple QOS access:
- 1. Verifies jobs with removed QOS get marked InvalidQOS
- 2. Verifies jobs with remaining QOS stay valid
- 3. Verifies new job submissions with removed QOS are rejected
- 4. Verifies new job submissions with remaining QOS succeed
+ - Verify that running jobs with removed QOS keep running.
+ - Verify that pending jobs with removed QOS are updated to InvalidQOS.
+ - Verify that jobs with remaining QOS stay valid.
+ - Verify that new jobs cannot use the removed QOS.
+ - Verify that new job can use the remaining QOS.
"""
- # Stop slurmdbd and submit jobs with different QOSs
+ # Stop slurmdbd to avoid the job info being saved in the DB
atf.stop_slurmdbd(quiet=True)
- job_id1 = submit_job_with(f"--qos={qos1}", fatal=True)
- job_id2 = submit_job_with(f"--qos={qos2}", fatal=True)
- # Stop slurmctld, start slurmdbd, remove first QOS
+ # Submit a blocking job
+ job_id1 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
+ assert atf.wait_for_job_state(
+ job_id1, "RUNNING"
+ ), f"Job {job_id1} never started running"
+
+ # Submit two more jobs on the same node to be blocked (due to exclusive)
+ node = atf.get_job_parameter(job_id1, "NodeList")
+ job_id2 = submit_job_with(f"--qos={qos1} -w {node}", fatal=True)
+ job_id3 = submit_job_with(f"--qos={qos2} -w {node}", fatal=True)
+
+ # Verify both jobs are pending
+ assert atf.wait_for_job_state(
+ job_id2, "PENDING"
+ ), f"Job {job_id2} should be pending"
+ assert atf.wait_for_job_state(
+ job_id3, "PENDING"
+ ), f"Job {job_id3} should be pending"
+
+ # Stop slurmctld before starting slurmdbd to keep the jobs' info out of
+ # the DB, only in slurmctld for the moment.
atf.stop_slurmctld(quiet=True)
atf.start_slurmdbd(quiet=True)
+
+ # Remove the QOS from the DB.
+ # Note that slurmdbd won't have the QOS or the jobs using it, while
+ # slurmctld knows the jobs and still thinks that the QOS exists.
atf.run_command(
f"sacctmgr -i remove qos {qos1}",
user=atf.properties["slurm-user"],
fatal=True,
)
- # Start slurmctld and verify jobs state/reason
+ # Start slurmctld and verify job states/reasons are as expected now that
+ # the QOS doesn't exist anymore.
atf.start_slurmctld(quiet=True)
- # First job should be PENDING with InvalidQOS
+ # Running job should continue running
assert atf.wait_for_job_state(
- job_id1, "PENDING", desired_reason="InvalidQOS"
- ), f"Job {job_id1} not in PENDING state and InvalidQOS reason"
+ job_id1, "RUNNING", desired_reason="None"
+ ), f"Previously running job {job_id1} should stay RUNNING with 'None' reason"
- # Second job should stay PENDING with different reason
+ # Pending job with removed QOS should be marked with InvalidQOS
assert atf.wait_for_job_state(
- job_id2, "PENDING", timeout=10
- ), f"Job {job_id2} should be in PENDING state"
+ job_id2, "PENDING", desired_reason="InvalidQOS"
+ ), f"Pending job {job_id2} should be PENDING with InvalidQOS reason"
+
+ # Pending job with remaining QOS should stay valid
+ assert atf.wait_for_job_state(
+ job_id3, "PENDING"
+ ), f"Job {job_id3} should be PENDING"
assert (
- atf.get_job_parameter(job_id2, "Reason") != "InvalidQOS"
- ), "The second job whose QOS was not deleted should not be pending due to 'InvalidQOS'"
+ atf.get_job_parameter(job_id3, "Reason") != "InvalidQOS"
+ ), f"Job {job_id3} using qos2 should not have InvalidQOS reason"
# Try to submit a new job with removed QOS - should be rejected
assert (
@@ -154,80 +202,56 @@ def test_qos_removal_multiple():
), f"Job submission with valid QOS {qos2} should have succeeded"
-def test_qos_removal_running_vs_pending():
- """Test QOS removal impact on running vs pending jobs:
- 1. Submit two jobs with same QOS - one running, one pending
- 2. Remove the QOS
- 3. Verify running job continues running
- 4. Verify pending job gets marked with InvalidQOS
+def test_account_removal_single():
+ """Test removing an account in use:
+ - Verify that running jobs with that account keep running.
+ - Verify that pending jobs are updated to InvalidAccount.
+ - Verify that new jobs cannot use the removed account.
"""
- # Stop slurmdbd and submit jobs - use exclusive to ensure only one can run
+ # Stop slurmdbd to avoid the job info being saved in the DB
atf.stop_slurmdbd(quiet=True)
- job_id1 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
- job_id2 = submit_job_with(f"--qos={qos1} --exclusive", fatal=True)
- # Wait for first job to start running
+ # Submit a blocking job
+ job_id1 = submit_job_with(f"--account={acct1} --exclusive", fatal=True)
assert atf.wait_for_job_state(
job_id1, "RUNNING"
), f"Job {job_id1} never started running"
- # Verify second job is pending (due to exclusive)
+ # Submit another job on the same node to be blocked (due to exclusive)
+ node = atf.get_job_parameter(job_id1, "NodeList")
+ job_id2 = submit_job_with(f"--account={acct1} -w {node}", fatal=True)
assert atf.wait_for_job_state(
job_id2, "PENDING"
), f"Job {job_id2} should be pending"
- # Stop slurmctld, start slurmdbd, remove QOS
+ # Stop slurmctld before starting slurmdbd to keep the jobs' info out of
+ # the DB, only in slurmctld for the moment.
atf.stop_slurmctld(quiet=True)
atf.start_slurmdbd(quiet=True)
+
+ # Remove the account from the DB.
+ # Note that slurmdbd won't have the account or the jobs using it, while
+ # slurmctld knows the jobs and still thinks that the account exists.
atf.run_command(
- f"sacctmgr -i remove qos {qos1}",
+ f"sacctmgr -i remove account {acct1}",
user=atf.properties["slurm-user"],
fatal=True,
)
- # Start slurmctld and verify jobs state/reason
+ # Start slurmctld and verify job states/reasons are as expected now that
+ # the account doesn't exist anymore.
atf.start_slurmctld(quiet=True)
# Running job should continue running
assert atf.wait_for_job_state(
- job_id1, "RUNNING"
- ), f"Previously running job {job_id1} should stay RUNNING"
- assert (
- atf.get_job_parameter(job_id1, "Reason") == "None"
- ), f"Running job {job_id1} should have 'None' as reason"
-
- # Pending job should be marked with InvalidQOS
- assert atf.wait_for_job_state(
- job_id2, "PENDING", desired_reason="InvalidQOS"
- ), f"Pending job {job_id2} should be PENDING with InvalidQOS reason"
-
-
-def test_account_removal_single():
- """Test that removal of a single account:
- 1. Marks pending jobs with InvalidAccount
- 2. Rejects new job submissions using removed account
- """
-
- # Stop slurmdbd and submit job
- atf.stop_slurmdbd(quiet=True)
- job_id = submit_job_with(f"--account={acct1}", fatal=True)
-
- # Stop slurmctld, start slurmdbd, remove account
- atf.stop_slurmctld(quiet=True)
- atf.start_slurmdbd(quiet=True)
- atf.run_command(
- f"sacctmgr -i remove account {acct1}",
- user=atf.properties["slurm-user"],
- fatal=True,
- )
-
- # Start slurmctld and verify job state/reason
- atf.start_slurmctld(quiet=True)
+ job_id1, "RUNNING", desired_reason="None"
+ ), f"Previously running job {job_id1} should stay RUNNING with 'None' reason"
+ # Pending job should be marked with InvalidAccount
assert atf.wait_for_job_state(
- job_id, "PENDING", desired_reason="InvalidAccount"
- ), f"Job {job_id} not in PENDING state and InvalidAccount reason"
+ job_id2, "PENDING", desired_reason="InvalidAccount"
+ ), f"Pending job {job_id2} should be PENDING with InvalidAccount reason"
# Try to submit a new job with removed account - should be rejected
assert (
@@ -236,98 +260,78 @@ def test_account_removal_single():
def test_account_removal_multiple():
- """Test account removal when user has multiple account access:
- 1. Verifies jobs with removed account get marked InvalidAccount
- 2. Verifies jobs with remaining account stay valid
- 3. Verifies new job submissions with removed account are rejected
- 4. Verifies new job submissions with remaining account succeed
+ """Test removing an account when user has multiple account access:
+ - Verify that running jobs with removed account keep running.
+ - Verify that pending jobs with removed account are updated to InvalidAccount.
+ - Verify that jobs with remaining account stay valid.
+ - Verify that new jobs cannot use the removed account.
+ - Verify that new jobs can use the remaining account.
"""
- # Stop slurmdbd and submit jobs with different accounts
+ # Stop slurmdbd to avoid the job info being saved in the DB
atf.stop_slurmdbd(quiet=True)
- job_id1 = submit_job_with(f"--account={acct1}", fatal=True)
- job_id2 = submit_job_with(f"--account={acct2}", fatal=True)
-
- # Stop slurmctld, start slurmdbd, remove first account
- atf.stop_slurmctld(quiet=True)
- atf.start_slurmdbd(quiet=True)
- atf.run_command(
- f"sacctmgr -i remove account {acct1}",
- user=atf.properties["slurm-user"],
- fatal=True,
- )
-
- # Start slurmctld and verify jobs state/reason
- atf.start_slurmctld(quiet=True)
-
- # First job should be PENDING with InvalidAccount
- assert atf.wait_for_job_state(
- job_id1, "PENDING", desired_reason="InvalidAccount"
- ), f"Job {job_id1} not in PENDING state and InvalidAccount reason"
-
- # Second job should stay PENDING with different reason
- assert atf.wait_for_job_state(
- job_id2, "PENDING", timeout=10
- ), f"Job {job_id2} should be in PENDING state"
- assert (
- atf.get_job_parameter(job_id2, "Reason") != "InvalidAccount"
- ), "The second job whose account was not deleted should not be pending due to 'InvalidAccount'"
-
- # Try to submit a new job with removed account - should be rejected
- assert (
- submit_job_with(f"--account={acct1}", xfail=True) == 0
- ), f"Job submission with removed account {acct1} should have failed"
-
- # Submit a job with remaining account - should succeed
- assert (
- submit_job_with(f"--account={acct2}") != 0
- ), f"Job submission with valid account {acct2} should have succeeded"
-
-def test_account_removal_running_vs_pending():
- """Test account removal impact on running vs pending jobs:
- 1. Submit two jobs with same account - one running, one pending
- 2. Remove the account
- 3. Verify running job continues running
- 4. Verify pending job gets marked with InvalidAccount
- """
-
- # Stop slurmdbd and submit jobs - use exclusive to ensure only one can run
- atf.stop_slurmdbd(quiet=True)
+ # Submit a blocking job
job_id1 = submit_job_with(f"--account={acct1} --exclusive", fatal=True)
- job_id2 = submit_job_with(f"--account={acct1} --exclusive", fatal=True)
-
- # Wait for first job to start running
assert atf.wait_for_job_state(
job_id1, "RUNNING"
), f"Job {job_id1} never started running"
- # Verify second job is pending (due to exclusive)
+ # Submit two more jobs on the same node to be blocked (due to exclusive)
+ node = atf.get_job_parameter(job_id1, "NodeList")
+ job_id2 = submit_job_with(f"--account={acct1} -w {node}", fatal=True)
+ job_id3 = submit_job_with(f"--account={acct2} -w {node}", fatal=True)
+
+ # Verify both jobs are pending
assert atf.wait_for_job_state(
job_id2, "PENDING"
), f"Job {job_id2} should be pending"
+ assert atf.wait_for_job_state(
+ job_id3, "PENDING"
+ ), f"Job {job_id3} should be pending"
- # Stop slurmctld, start slurmdbd, remove account
+ # Stop slurmctld before starting slurmdbd to keep the jobs' info out of
+ # the DB, only in slurmctld for the moment.
atf.stop_slurmctld(quiet=True)
atf.start_slurmdbd(quiet=True)
+
+ # Remove the account from the DB.
+ # Note that slurmdbd won't have the account or the jobs using it, while
+ # slurmctld knows the jobs and still thinks that the account exists.
atf.run_command(
f"sacctmgr -i remove account {acct1}",
user=atf.properties["slurm-user"],
fatal=True,
)
- # Start slurmctld and verify jobs state/reason
+ # Start slurmctld and verify job states/reasons are as expected now that
+ # the account doesn't exist anymore.
atf.start_slurmctld(quiet=True)
# Running job should continue running
assert atf.wait_for_job_state(
- job_id1, "RUNNING"
- ), f"Previously running job {job_id1} should stay RUNNING"
- assert (
- atf.get_job_parameter(job_id1, "Reason") == "None"
- ), f"Running job {job_id1} should have 'None' as reason"
+ job_id1, "RUNNING", desired_reason="None"
+ ), f"Previously running job {job_id1} should stay RUNNING with 'None' reason"
- # Pending job should be marked with InvalidAccount
+ # Pending job with removed account should be marked with InvalidAccount
assert atf.wait_for_job_state(
job_id2, "PENDING", desired_reason="InvalidAccount"
), f"Pending job {job_id2} should be PENDING with InvalidAccount reason"
+
+ # Pending job with remaining account should stay valid
+ assert atf.wait_for_job_state(
+ job_id3, "PENDING"
+ ), f"Job {job_id3} should be PENDING"
+ assert (
+ atf.get_job_parameter(job_id3, "Reason") != "InvalidAccount"
+ ), f"Job {job_id3} using acct2 should not have InvalidAccount reason"
+
+ # Try to submit a new job with removed account - should be rejected
+ assert (
+ submit_job_with(f"--account={acct1}", xfail=True) == 0
+ ), f"Job submission with removed account {acct1} should have failed"
+
+ # Submit a job with remaining account - should succeed
+ assert (
+ submit_job_with(f"--account={acct2}") != 0
+ ), f"Job submission with valid account {acct2} should have succeeded"
From aaf75401e188ce611c3654f619e47806abd1e997 Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Thu, 28 Nov 2024 11:10:10 -0500
Subject: [PATCH 73/90] Fix backup controller crash after reconfiguring.
If the backup controller is in control when scontrol reconfigure is run,
it was possible that the backup would not honor the request to relinquish
control once the primary controller resumed.
In most cases this would result in the backup controller crashing, but
in a quiet enough system both controllers could continue to operate as
the primary.
Changelog: slurmctld - Fix crash and possible split brain issue if the
backup controller handles an scontrol reconfigure while in control
before the primary resumes operation.
Ticket: 21532
Cherry-picked: f37371bac8
---
src/slurmctld/controller.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 42185e4d99a..ec4f4e8272c 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -817,6 +817,10 @@ int main(int argc, char **argv)
if (slurmctld_config.resume_backup && slurmctld_primary)
break;
+ /* The backup is now meant to relinquish control */
+ if (slurmctld_config.resume_backup && !slurmctld_primary)
+ backup_has_control = false;
+
recover = 2;
}
From ad37db167796cba755a6a20d2213c7cc964456e4 Mon Sep 17 00:00:00 2001
From: Brian Christiansen
Date: Wed, 4 Dec 2024 00:24:38 -0700
Subject: [PATCH 74/90] Fix stepmgr not getting dynamic node addrs from the
controller
The controller doesn't pack job_record_t's node_addrs, so the stepmgr
wasn't receiving them and passing them on to the steps. When the steps
completed and tried communicating back to the dynamic stepmgr, they failed
to find its node addr. The job's node_addrs are passed in the cred, so we
can just get them from there.
Changelog: Fix stepmgr not getting dynamic node addrs from the controller
Ticket: 21535
Cherry-picked: a4af0fbc1d
---
src/slurmd/slurmstepd/slurmstepd.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c
index 494fb35dc09..09f20b2eb99 100644
--- a/src/slurmd/slurmstepd/slurmstepd.c
+++ b/src/slurmd/slurmstepd/slurmstepd.c
@@ -881,10 +881,31 @@ _init_from_slurmd(int sock, char **argv, slurm_addr_t **_cli,
if (task_msg->job_ptr &&
!xstrcmp(conf->node_name, task_msg->job_ptr->batch_host)) {
+ slurm_addr_t *node_addrs;
+
/* only allow one stepd to be stepmgr. */
job_step_ptr = task_msg->job_ptr;
job_step_ptr->part_ptr = task_msg->part_ptr;
job_node_array = task_msg->job_node_array;
+
+ /*
+ * job_record doesn't pack its node_addrs array, so get
+ * it from the cred.
+ */
+ if (task_msg->cred &&
+ (node_addrs = slurm_cred_get(
+ task_msg->cred,
+ CRED_DATA_JOB_NODE_ADDRS))) {
+ add_remote_nodes_to_conf_tbls(
+ job_step_ptr->nodes, node_addrs);
+
+ job_step_ptr->node_addrs =
+ xcalloc(job_step_ptr->node_cnt,
+ sizeof(slurm_addr_t));
+ memcpy(job_step_ptr->node_addrs, node_addrs,
+ job_step_ptr->node_cnt *
+ sizeof(slurm_addr_t));
+ }
}
break;
From 56a3317b9992d7251edc0f766e1dd659a3a84364 Mon Sep 17 00:00:00 2001
From: Marcin Stolarek
Date: Wed, 27 Nov 2024 17:12:06 +0100
Subject: [PATCH 75/90] stepmgr - avoid sending rc message on success after
already responding
Changelog: stepmgr - avoid "Unexpected missing socket" errors.
Ticket: 21422
Cherry-picked: 8469d653d1
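The rule the patch enforces, as a sketch (function names from the diff
below): each request gets exactly one reply on the connection - the payload
message on success, or an rc message on failure - never both.

	if (rc == SLURM_SUCCESS)
		slurm_send_node_msg(msg.conn_fd, &response_msg);
	else
		slurm_send_rc_msg(&msg, rc);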
---
src/slurmd/slurmstepd/req.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index 5c0e21cbfa1..d558c6aa4fb 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -607,8 +607,6 @@ static int _handle_job_step_get_info(int fd, stepd_step_rec_t *step, uid_t uid)
buffer);
slurm_send_node_msg(msg.conn_fd, &response_msg);
FREE_NULL_BUFFER(buffer);
-
- slurm_send_rc_msg(&msg, SLURM_SUCCESS);
slurm_free_msg_members(&msg);
done:
@@ -752,9 +750,10 @@ static int _handle_step_layout(int fd, stepd_step_rec_t *step, uid_t uid)
step_layout);
slurm_send_node_msg(msg.conn_fd, &response_msg);
slurm_step_layout_destroy(step_layout);
+ } else {
+ slurm_send_rc_msg(&msg, rc);
}
- slurm_send_rc_msg(&msg, rc);
slurm_free_msg_members(&msg);
done:
@@ -789,6 +788,8 @@ static int _handle_job_sbcast_cred(int fd, stepd_step_rec_t *step, uid_t uid)
slurm_send_node_msg(msg.conn_fd, &response_msg);
slurm_free_sbcast_cred_msg(job_info_resp_msg);
+ slurm_free_msg_members(&msg);
+ return rc;
resp:
slurm_send_rc_msg(&msg, rc);
@@ -839,6 +840,8 @@ static int _handle_het_job_alloc_info(int fd, stepd_step_rec_t *step, uid_t uid)
resp_list);
slurm_send_node_msg(msg.conn_fd, &response_msg);
FREE_NULL_LIST(resp_list);
+ slurm_free_msg_members(&msg);
+ return rc;
resp:
slurm_send_rc_msg(&msg, rc);
From 990073e59f0f8afe5a10e260c0329772317cac48 Mon Sep 17 00:00:00 2001
From: Brian Christiansen
Date: Thu, 5 Dec 2024 23:33:36 -0700
Subject: [PATCH 76/90] Fix scontrol show steps with dynamic stepmgr
Changelog: Fix `scontrol show steps` with dynamic stepmgr
Ticket: 21422
Cherry-picked: 59e69f37ca
---
src/api/job_step_info.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c
index 00458feb5b9..d1dd90a501d 100644
--- a/src/api/job_step_info.c
+++ b/src/api/job_step_info.c
@@ -315,8 +315,22 @@ static int _get_stepmgr_steps(void *x, void *arg)
slurm_msg_t req_msg;
slurm_msg_t_init(&req_msg);
slurm_msg_set_r_uid(&req_msg, slurm_conf.slurmd_user_id);
- slurm_conf_get_addr(sji->stepmgr, &req_msg.address,
- req_msg.flags);
+
+ if (slurm_conf_get_addr(sji->stepmgr, &req_msg.address, req_msg.flags))
+ {
+ /*
+ * The node isn't in the conf, see if the
+ * controller has an address for it.
+ */
+ slurm_node_alias_addrs_t *alias_addrs = NULL;
+ if (!slurm_get_node_alias_addrs(sji->stepmgr, &alias_addrs)) {
+ add_remote_nodes_to_conf_tbls(alias_addrs->node_list,
+ alias_addrs->node_addrs);
+ slurm_free_node_alias_addrs(alias_addrs);
+ slurm_conf_get_addr(sji->stepmgr, &req_msg.address,
+ req_msg.flags);
+ }
+ }
job_step_info_request_msg_t req_data = {0};
req_data.step_id.job_id = sji->job_id;
From 6a27a08c7919634b75866977be5afafc79d0c0ac Mon Sep 17 00:00:00 2001
From: Albert Gil
Date: Wed, 11 Dec 2024 12:19:29 +0100
Subject: [PATCH 77/90] Testsuite - Improve test15.4 adding diagnostic info
Cherry-picked: 5b80951ce7
---
testsuite/expect/test15.4 | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/testsuite/expect/test15.4 b/testsuite/expect/test15.4
index 400e8af1bb9..f0862350f5b 100755
--- a/testsuite/expect/test15.4
+++ b/testsuite/expect/test15.4
@@ -53,7 +53,6 @@ set login_grp_info [get_my_id]
#
# Submit a slurm job that will execute 'id'
#
-set timeout $max_job_delay
spawn $salloc -N1 -t1 $srun $bin_id
expect {
-re "Granted job allocation ($number)" {
@@ -73,15 +72,12 @@ expect {
}
}
-if {$got_job_grps == 0} {
- fail "Did not get user info from slurm job"
-}
+subtest {$got_job_grps != 0} "Verify we were able to get user info from slurm job"
+
#
# Confirm the user id and group id in the slurm job matches that
# of the local 'id' execution.
#
-if {$login_grp_info ne $job_grp_info} {
- fail "Login and slurm user info mismatch"
-}
+subtest {$login_grp_info eq $job_grp_info} "Verify user info from login and job match" "$login_grp_info != $job_grp_info"
From 123076b5208ebc8e4ac02c452f22abae1ce57cd6 Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:09:08 -0400
Subject: [PATCH 78/90] Make helper function for xgetaddrinfo() that accepts
custom hints.
Cherry-picked: e47410a62c
Ticket: 20997
---
src/common/util-net.c | 35 +++++++++++++++++++++--------------
1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/src/common/util-net.c b/src/common/util-net.c
index a0fb281e1cb..2bcabe7f1d0 100644
--- a/src/common/util-net.c
+++ b/src/common/util-net.c
@@ -253,6 +253,26 @@ extern char *make_full_path(const char *rpath)
return cwd2;
}
+static struct addrinfo *_xgetaddrinfo(const char *hostname, const char *serv,
+ const struct addrinfo *hints)
+{
+ struct addrinfo *result = NULL;
+ int err;
+
+ err = getaddrinfo(hostname, serv, hints, &result);
+ if (err == EAI_SYSTEM) {
+ error_in_daemon("%s: getaddrinfo(%s:%s) failed: %s: %m",
+ __func__, hostname, serv, gai_strerror(err));
+ return NULL;
+ } else if (err != 0) {
+ error_in_daemon("%s: getaddrinfo(%s:%s) failed: %s",
+ __func__, hostname, serv, gai_strerror(err));
+ return NULL;
+ }
+
+ return result;
+}
+
extern struct addrinfo *xgetaddrinfo_port(const char *hostname, uint16_t port)
{
char serv[6];
@@ -262,9 +282,7 @@ extern struct addrinfo *xgetaddrinfo_port(const char *hostname, uint16_t port)
extern struct addrinfo *xgetaddrinfo(const char *hostname, const char *serv)
{
- struct addrinfo *result = NULL;
struct addrinfo hints;
- int err;
bool v4_enabled = slurm_conf.conf_flags & CONF_FLAG_IPV4_ENABLED;
bool v6_enabled = slurm_conf.conf_flags & CONF_FLAG_IPV6_ENABLED;
@@ -300,18 +318,7 @@ extern struct addrinfo *xgetaddrinfo(const char *hostname, const char *serv)
hints.ai_flags |= AI_CANONNAME;
hints.ai_socktype = SOCK_STREAM;
- err = getaddrinfo(hostname, serv, &hints, &result);
- if (err == EAI_SYSTEM) {
- error_in_daemon("%s: getaddrinfo(%s:%s) failed: %s: %m",
- __func__, hostname, serv, gai_strerror(err));
- return NULL;
- } else if (err != 0) {
- error_in_daemon("%s: getaddrinfo(%s:%s) failed: %s",
- __func__, hostname, serv, gai_strerror(err));
- return NULL;
- }
-
- return result;
+ return _xgetaddrinfo(hostname, serv, &hints);
}
static int _name_cache_find(void *x, void *y)
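A hypothetical caller inside util-net.c, sketching how the new static helper
accepts caller-supplied hints; the hostname is illustrative only (6817 is
slurmctld's default port):

	struct addrinfo hints = { 0 };
	struct addrinfo *ai;

	hints.ai_family = AF_INET6;	/* e.g. resolve IPv6 results only */
	hints.ai_socktype = SOCK_STREAM;

	if ((ai = _xgetaddrinfo("ctld.example.org", "6817", &hints))) {
		/* ... use the resolved addresses ... */
		freeaddrinfo(ai);
	}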
From 27ceee139f05c484f1ab42719d06f4a688708845 Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:09:08 -0400
Subject: [PATCH 79/90] Add function to determine what types of addresses a
host has.
Cherry-picked: 54e3c2ba62
Ticket: 20997
---
src/common/util-net.c | 32 ++++++++++++++++++++++++++++++++
src/common/util-net.h | 2 ++
2 files changed, 34 insertions(+)
diff --git a/src/common/util-net.c b/src/common/util-net.c
index 2bcabe7f1d0..5720e4b0d1f 100644
--- a/src/common/util-net.c
+++ b/src/common/util-net.c
@@ -321,6 +321,38 @@ extern struct addrinfo *xgetaddrinfo(const char *hostname, const char *serv)
return _xgetaddrinfo(hostname, serv, &hints);
}
+extern int host_has_addr_family(const char *hostname, const char *srv,
+ bool *ipv4, bool *ipv6)
+{
+ struct addrinfo hints;
+ struct addrinfo *ai_ptr, *ai_start;
+
+ memset(&hints, 0, sizeof(hints));
+
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV | AI_PASSIVE;
+ if (hostname)
+ hints.ai_flags |= AI_CANONNAME;
+ hints.ai_socktype = SOCK_STREAM;
+
+ ai_start = _xgetaddrinfo(hostname, srv, &hints);
+
+ if (!ai_start)
+ return SLURM_ERROR;
+
+ *ipv4 = *ipv6 = false;
+ for (ai_ptr = ai_start; ai_ptr; ai_ptr = ai_ptr->ai_next) {
+ if (ai_ptr->ai_family == AF_INET6)
+ *ipv6 = true;
+ else if (ai_ptr->ai_family == AF_INET)
+ *ipv4 = true;
+ }
+
+ freeaddrinfo(ai_start);
+
+ return SLURM_SUCCESS;
+}
+
static int _name_cache_find(void *x, void *y)
{
getnameinfo_cache_t *cache_ent = x;
diff --git a/src/common/util-net.h b/src/common/util-net.h
index 7d8c7f9ccfb..c85b9b562d1 100644
--- a/src/common/util-net.h
+++ b/src/common/util-net.h
@@ -87,6 +87,8 @@ extern struct addrinfo *xgetaddrinfo_port(const char *hostname,
uint16_t port);
extern char *xgetnameinfo(struct sockaddr *addr, socklen_t addrlen);
+extern int host_has_addr_family(const char *hostname, const char *srv,
+ bool *ipv4, bool *ipv6);
/* Functions responsible for cleanup of getnameinfo cache */
extern void getnameinfo_cache_destroy(void *obj);
extern void getnameinfo_cache_purge(void);
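Hypothetical usage of the new helper (the controller name is illustrative):
discover which address families a host resolves to before choosing a
transport. The following patch adds a real caller in fetch_config.c.

	bool ipv4 = false, ipv6 = false;

	if (host_has_addr_family("ctld.example.org", NULL, &ipv4, &ipv6) !=
	    SLURM_SUCCESS)
		error("cannot resolve controller address");
	else if (ipv6 && !ipv4)
		debug("controller is IPv6-only");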
From c509494958065a6d09fe6be78c4068227f86a27d Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:09:09 -0400
Subject: [PATCH 80/90] Add has_ipv4 and has_ipv6 address information when
fetching configs
Cherry-picked: 43a57d9d61
Ticket: 20997
---
src/common/fetch_config.c | 13 +++++++++++++
src/common/slurm_resolv.h | 2 ++
2 files changed, 15 insertions(+)
diff --git a/src/common/fetch_config.c b/src/common/fetch_config.c
index fceccde29a8..24b8f12298e 100644
--- a/src/common/fetch_config.c
+++ b/src/common/fetch_config.c
@@ -47,6 +47,7 @@
#include "src/common/slurm_protocol_pack.h"
#include "src/common/slurm_resolv.h"
#include "src/common/strlcpy.h"
+#include "src/common/util-net.h"
#include "src/common/xstring.h"
#include "src/common/xmalloc.h"
@@ -150,6 +151,16 @@ static void _fetch_child(List controllers, uint32_t flags)
_exit(1);
}
+static int _get_controller_addr_type(void *x, void *arg)
+{
+ ctl_entry_t *ctl = (ctl_entry_t *) x;
+
+ host_has_addr_family(ctl->hostname, NULL, &ctl->has_ipv4,
+ &ctl->has_ipv6);
+
+ return SLURM_SUCCESS;
+}
+
extern config_response_msg_t *fetch_config(char *conf_server, uint32_t flags)
{
char *env_conf_server = getenv("SLURM_CONF_SERVER");
@@ -198,6 +209,8 @@ extern config_response_msg_t *fetch_config(char *conf_server, uint32_t flags)
}
}
+ list_for_each(controllers, _get_controller_addr_type, NULL);
+
/* If the slurm.key file exists, assume we're using auth/slurm */
sack_jwks = get_extra_conf_path("slurm.jwks");
sack_key = get_extra_conf_path("slurm.key");
diff --git a/src/common/slurm_resolv.h b/src/common/slurm_resolv.h
index e0fe650daf6..63de8c74ba7 100644
--- a/src/common/slurm_resolv.h
+++ b/src/common/slurm_resolv.h
@@ -40,6 +40,8 @@ typedef struct {
uint16_t priority;
uint16_t port;
char hostname[1024];
+ bool has_ipv4;
+ bool has_ipv6;
} ctl_entry_t;
/*
From e7b6def6a7b936afca2cc69f78d96a084c1a1fa2 Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:09:09 -0400
Subject: [PATCH 81/90] Selectively try to fetch configs from controllers over
IPv6.
The previous default behavior of trying IPv4 first remains; however, if
the controller appears to be IPv6-only, the IPv4 attempt is skipped.
It is expected that all slurm controllers have the same IP address
families available.
Cherry-picked: bf4a853677
Ticket: 20997
---
src/common/fetch_config.c | 30 ++++++++++++++++++++++++++----
1 file changed, 26 insertions(+), 4 deletions(-)
diff --git a/src/common/fetch_config.c b/src/common/fetch_config.c
index 24b8f12298e..dd20d1fd3d6 100644
--- a/src/common/fetch_config.c
+++ b/src/common/fetch_config.c
@@ -64,7 +64,8 @@ static char *client_config_files[] = {
};
-static void _init_minimal_conf_server_config(List controllers);
+static void _init_minimal_conf_server_config(List controllers, bool use_v6,
+ bool reinit);
static int to_parent[2] = {-1, -1};
@@ -115,6 +116,7 @@ static config_response_msg_t *_fetch_parent(pid_t pid)
static void _fetch_child(List controllers, uint32_t flags)
{
config_response_msg_t *config;
+ ctl_entry_t *ctl = NULL;
buf_t *buffer = init_buf(1024 * 1024);
int len = 0;
@@ -129,9 +131,22 @@ static void _fetch_child(List controllers, uint32_t flags)
*/
slurm_conf_unlock();
- _init_minimal_conf_server_config(controllers);
+ ctl = list_peek(controllers);
+
+ if (ctl->has_ipv6 && !ctl->has_ipv4)
+ _init_minimal_conf_server_config(controllers, true, false);
+ else
+ _init_minimal_conf_server_config(controllers, false, false);
+
config = fetch_config_from_controller(flags);
+ if (!config && ctl->has_ipv6 && ctl->has_ipv4) {
+ warning("%s: failed to fetch remote configs via IPv4, retrying with IPv6: %m",
+ __func__);
+ _init_minimal_conf_server_config(controllers, true, true);
+ config = fetch_config_from_controller(flags);
+ }
+
if (!config) {
error("%s: failed to fetch remote configs: %m", __func__);
safe_write(to_parent[1], &len, sizeof(int));
@@ -344,7 +359,8 @@ static int _print_controllers(void *x, void *arg)
return SLURM_SUCCESS;
}
-static void _init_minimal_conf_server_config(List controllers)
+static void _init_minimal_conf_server_config(List controllers, bool use_v6,
+ bool reinit)
{
char *conf = NULL, *filename = NULL;
int fd;
@@ -356,11 +372,17 @@ static void _init_minimal_conf_server_config(List controllers)
if (slurm_conf.authinfo)
xstrfmtcat(conf, "AuthInfo=%s\n", slurm_conf.authinfo);
+ if (use_v6)
+ xstrcat(conf, "CommunicationParameters=EnableIPv6");
+
if ((fd = dump_to_memfd("slurm.conf", conf, &filename)) < 0)
fatal("%s: could not write temporary config", __func__);
xfree(conf);
- slurm_init(filename);
+ if (reinit)
+ slurm_conf_reinit(filename);
+ else
+ slurm_init(filename);
close(fd);
xfree(filename);
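Condensed, the retry policy this patch implements looks like the sketch below. The names ctl_addr_info_t, fetch_with_fallback(), and fetch_via() are hypothetical stand-ins (fetch_via() represents the "write minimal config, then fetch from controller" sequence); only the decision order is taken from the patch: an IPv6-only controller skips IPv4 entirely, while a dual-stack controller tries IPv4 first and reinitializes with EnableIPv6 for one retry.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: ctl_addr_info_t mirrors the two new
 * ctl_entry_t flags; fetch_fn_t stands in for the "init minimal
 * config, then fetch from controller" sequence in the patch. */
typedef struct {
	bool has_ipv4;
	bool has_ipv6;
} ctl_addr_info_t;

typedef void *(*fetch_fn_t)(bool use_v6, bool reinit);

static void *fetch_with_fallback(const ctl_addr_info_t *ctl,
				 fetch_fn_t fetch_via)
{
	void *config;

	/* IPv6-only controller: skip the IPv4 attempt entirely. */
	if (ctl->has_ipv6 && !ctl->has_ipv4)
		return fetch_via(true, false);

	/* Default path, unchanged from earlier releases: IPv4 first. */
	config = fetch_via(false, false);

	/* Dual-stack controller and IPv4 failed: reinit with
	 * EnableIPv6 and retry once over IPv6. */
	if (!config && ctl->has_ipv4 && ctl->has_ipv6)
		config = fetch_via(true, true);

	return config;
}

/* Demo stub: pretend IPv4 fetches fail so the fallback fires. */
static void *demo_fetch(bool use_v6, bool reinit)
{
	printf("fetch attempt: use_v6=%d reinit=%d\n", use_v6, reinit);
	return use_v6 ? (void *) 1 : NULL;
}

int main(void)
{
	ctl_addr_info_t dual = { .has_ipv4 = true, .has_ipv6 = true };

	if (fetch_with_fallback(&dual, demo_fetch))
		puts("config fetched");
	return 0;
}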
From dd6127502be3c6b9e75988607e9a586e0d845266 Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:09:09 -0400
Subject: [PATCH 82/90] slurmd - Parse IPv6 addresses provided to --conf-server
correctly.
When using an IPv6 address directly, wrap the address in [] to
distinguish the address from the port.
Cherry-picked: 04babaab63
Ticket: 20997
---
doc/man/man8/slurmd.8 | 5 ++++-
src/common/fetch_config.c | 14 +++++++++++++-
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/doc/man/man8/slurmd.8 b/doc/man/man8/slurmd.8
index e35ff45dad3..7f4a4a3f605 100644
--- a/doc/man/man8/slurmd.8
+++ b/doc/man/man8/slurmd.8
@@ -77,10 +77,13 @@ NodeName=node1 CPUs=16 RealMemory=30000 Gres=gpu:2"
.IP
.TP
-\fB\-\-conf\-server <server>[:<port>]\fR
+\fB\-\-conf\-server <host|ip>[:<port>]\fR
Comma\-separated list of controllers, the first being the primary slurmctld. A
port can (optionally) be specified for each controller. These hosts are where
the slurmd will fetch the configuration from when running in "configless" mode.
+\fBNOTE\fR: If specifying an IPv6 address, wrap the <ip> in [] to
+distinguish the address from the port. This is required even if no port is
+specified.
.IP
.TP
diff --git a/src/common/fetch_config.c b/src/common/fetch_config.c
index dd20d1fd3d6..0428f3e51bb 100644
--- a/src/common/fetch_config.c
+++ b/src/common/fetch_config.c
@@ -204,9 +204,21 @@ extern config_response_msg_t *fetch_config(char *conf_server, uint32_t flags)
server = strtok_r(tmp, ",", &save_ptr);
while (server) {
ctl_entry_t *ctl = xmalloc(sizeof(*ctl));
+ char *tmp_ptr = NULL;
+
+ if (server[0] == '[')
+ server++;
+
strlcpy(ctl->hostname, server, sizeof(ctl->hostname));
- if ((port = xstrchr(ctl->hostname, ':'))) {
+ if ((tmp_ptr = strchr(ctl->hostname, ']'))) {
+ *tmp_ptr = '\0';
+ tmp_ptr++;
+ } else {
+ tmp_ptr = ctl->hostname;
+ }
+
+ if ((port = xstrchr(tmp_ptr, ':'))) {
*port = '\0';
port++;
ctl->port = atoi(port);
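Because a bare IPv6 literal itself contains colons, splitting on the first ':' would truncate the address, which is why the brackets are mandatory. The following is a simplified standalone sketch of the parsing above; parse_server() is a hypothetical name, and the real code additionally copies into ctl->hostname with strlcpy() and builds a controller list.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Split one --conf-server entry into host and optional port,
 * honoring [] around IPv6 literals. Modifies the input in place. */
static void parse_server(char *server, char **host, int *port)
{
	char *p;

	*port = 0;

	if (*server == '[') {		/* bracketed IPv6 literal */
		*host = server + 1;
		if ((p = strchr(*host, ']'))) {
			*p++ = '\0';	/* terminate the address */
			if (*p == ':')
				*port = atoi(p + 1);
		}
		return;
	}

	*host = server;			/* hostname or IPv4 address */
	if ((p = strchr(server, ':'))) {
		*p = '\0';
		*port = atoi(p + 1);
	}
}

int main(void)
{
	char a[] = "[2001:db8::1]:6817", b[] = "ctld1:6817", c[] = "ctld2";
	char *host;
	int port;

	parse_server(a, &host, &port);
	printf("%s %d\n", host, port);	/* 2001:db8::1 6817 */
	parse_server(b, &host, &port);
	printf("%s %d\n", host, port);	/* ctld1 6817 */
	parse_server(c, &host, &port);
	printf("%s %d\n", host, port);	/* ctld2 0 */
	return 0;
}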
From 388e8c3953bfe959cdd92cd586a621e2aa211beb Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Mon, 28 Oct 2024 13:24:24 -0400
Subject: [PATCH 83/90] NEWS for previous five commits
Changelog: Support IPv6 in configless mode.
Cherry-picked: 050517c526
Ticket: 20997
From 8ab29ce372732a4f446fd36560dd62112841d1bc Mon Sep 17 00:00:00 2001
From: Marshall Garey
Date: Thu, 12 Dec 2024 11:05:59 -0700
Subject: [PATCH 84/90] Docs - Update REST API reference
---
doc/html/rest_api.shtml | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/doc/html/rest_api.shtml b/doc/html/rest_api.shtml
index 3e29da52719..f540d0c64dd 100644
--- a/doc/html/rest_api.shtml
+++ b/doc/html/rest_api.shtml
@@ -3,7 +3,7 @@
API to access and control Slurm
More information: https://www.schedmd.com/
Contact Info: sales@schedmd.com
- Version: Slurm-24.05.4&openapi/slurmdbd&openapi/slurmctld
+ Version: Slurm-24.05.5&openapi/slurmdbd&openapi/slurmctld
BasePath:
Apache 2.0
https://www.apache.org/licenses/LICENSE-2.0.html
@@ -6169,7 +6169,7 @@
Query Parameter — CSV QOS list default: null format (optional)
- Query Parameter — CSV format list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null only_defaults (optional)
@@ -6297,7 +6297,7 @@
Query Parameter — CSV QOS list default: null format (optional)
- Query Parameter — CSV format list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null only_defaults (optional)
@@ -6433,7 +6433,7 @@
Query Parameter — Query flags default: null format (optional)
- Query Parameter — CSV format list default: null rpc_version (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null rpc_version (optional)
Query Parameter — CSV RPC version list default: null usage_end (optional)
@@ -7132,7 +7132,7 @@
Query Parameter — CSV QOS list default: null format (optional)
- Query Parameter — CSV format list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null only_defaults (optional)
@@ -7640,7 +7640,7 @@
Query Parameter — CSV QOS list default: null format (optional)
- Query Parameter — CSV format list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null only_defaults (optional)
@@ -8156,7 +8156,7 @@
Query Parameter — Query flags default: null format (optional)
- Query Parameter — CSV format list default: null rpc_version (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null rpc_version (optional)
Query Parameter — CSV RPC version list default: null usage_end (optional)
@@ -9960,7 +9960,7 @@
Query Parameter — CSV extra list default: null format (optional)
- Query Parameter — CSV format list default: null instance_id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null instance_id (optional)
Query Parameter — CSV instance_id list default: null instance_type (optional)
@@ -10088,7 +10088,7 @@
Query Parameter — CSV extra list default: null format (optional)
- Query Parameter — CSV format list default: null instance_id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null instance_id (optional)
Query Parameter — CSV instance_id list default: null instance_type (optional)
@@ -11488,7 +11488,7 @@
Query Parameter — Include job environment default: null format (optional)
- Query Parameter — CSV format list default: null groups (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null groups (optional)
Query Parameter — CSV group list default: null job_name (optional)
@@ -12758,7 +12758,7 @@
Query Parameter — CSV QOS id list default: null format (optional)
- Query Parameter — CSV format list default: null name (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null name (optional)
Query Parameter — CSV QOS name list default: null preempt_mode (optional)
@@ -14740,7 +14740,7 @@
Query Parameter — CSV cluster name list default: null format (optional)
- Query Parameter — CSV format name list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null name (optional)
@@ -15436,7 +15436,7 @@
Query Parameter — CSV QOS id list default: null format (optional)
- Query Parameter — CSV format list default: null name (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null name (optional)
Query Parameter — CSV QOS name list default: null preempt_mode (optional)
@@ -15859,7 +15859,7 @@
Query Parameter — CSV cluster name list default: null format (optional)
- Query Parameter — CSV format name list default: null id (optional)
+ Query Parameter — Ignored; process JSON manually to control output format default: null id (optional)
Query Parameter — CSV id list default: null name (optional)
From fae1f13b83cd28c5f66e3f35c7182004298c0fda Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Thu, 12 Dec 2024 13:10:06 -0700
Subject: [PATCH 85/90] Update NEWS for 24.05.5
---
NEWS | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/NEWS b/NEWS
index c248b2ca6a0..e84c992e163 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,13 @@ documents those changes that are of interest to users and administrators.
-- Fix race condition in stepmgr step completion handling.
-- Fix slurmctld segfault with stepmgr and MpiParams when running a job array.
-- Fix requeued jobs keeping their priority until the decay thread happens.
+ -- slurmctld - Fix crash and possible split brain issue if the
+ backup controller handles an scontrol reconfigure while in control
+ before the primary resumes operation.
+ -- Fix stepmgr not getting dynamic node addrs from the controller.
+ -- stepmgr - avoid "Unexpected missing socket" errors.
+ -- Fix `scontrol show steps` with dynamic stepmgr.
+ -- Support IPv6 in configless mode.
* Changes in Slurm 24.05.4
==========================
From 8e8a552a468b7e6c0175dd1e562e8495aa47b4fe Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Thu, 12 Dec 2024 13:17:17 -0700
Subject: [PATCH 86/90] Update META for 24.05.5.
Update slurm.spec and debian/changelog as well.
---
META | 4 ++--
debian/changelog | 2 +-
slurm.spec | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/META b/META
index d83af4e1674..c3441b69275 100644
--- a/META
+++ b/META
@@ -7,8 +7,8 @@
Name: slurm
Major: 24
Minor: 05
- Micro: 4
- Version: 24.05.4
+ Micro: 5
+ Version: 24.05.5
Release: 1
##
diff --git a/debian/changelog b/debian/changelog
index 01eb1252b30..8bb6b7e6c24 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-slurm-smd (24.05.4-1) UNRELEASED; urgency=medium
+slurm-smd (24.05.5-1) UNRELEASED; urgency=medium
* Initial release.
diff --git a/slurm.spec b/slurm.spec
index aab4cdd9c1e..59a13298dbe 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -1,5 +1,5 @@
Name: slurm
-Version: 24.05.4
+Version: 24.05.5
%define rel 1
Release: %{rel}%{?dist}
Summary: Slurm Workload Manager
From bd17c8d112330da152cc6d9879801f109d5979dc Mon Sep 17 00:00:00 2001
From: Tim McMullan
Date: Thu, 12 Dec 2024 13:21:50 -0700
Subject: [PATCH 87/90] Start NEWS for v24.05.6.
---
NEWS | 3 +++
1 file changed, 3 insertions(+)
diff --git a/NEWS b/NEWS
index e84c992e163..4ce1c8bf55e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,9 @@
This file describes changes in recent versions of Slurm. It primarily
documents those changes that are of interest to users and administrators.
+* Changes in Slurm 24.05.6
+==========================
+
* Changes in Slurm 24.05.5
==========================
-- Fix issue signaling cron jobs resulting in unintended requeues.
From 6443b033d9cb60a5a6cdb226ba115f08fa16bcb9 Mon Sep 17 00:00:00 2001
From: Andy Georges
Date: Fri, 13 Dec 2024 21:35:14 +0100
Subject: [PATCH 88/90] fix: install nvidia-driver-devel for nvml.h
---
build.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/build.sh b/build.sh
index bc2a9453852..7b0c622085d 100755
--- a/build.sh
+++ b/build.sh
@@ -96,7 +96,7 @@ sudo dnf -y install munge-devel libjwt-devel pam-devel
sudo dnf -y install http-parser-devel json-c-devel libyaml-devel
# - features: Nvidia NVML
sudo dnf -y autoremove cuda-nvml-* nvidia-driver-NVML-* nvidia-driver* libnvidia-ml*
-sudo dnf -y install "$CUDA_NVML_PKG" "$NVDRV_NVML_PKG"
+sudo dnf -y install "$CUDA_NVML_PKG" "$NVDRV_NVML_PKG" "nvidia-driver-devel"
# - plugins: MPI
sudo dnf -y install pmix "pmix-devel ${PMIX_VERSION}" "ucx-devel-${UCX_VERSION}"
# - plugins: cgroup/v2
From 9984765abb559648d6212996a3904db2b6bb4f77 Mon Sep 17 00:00:00 2001
From: Andy Georges
Date: Fri, 13 Dec 2024 21:39:30 +0100
Subject: [PATCH 89/90] fix: nvidia driver version on el9
---
build.sh | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/build.sh b/build.sh
index 7b0c622085d..c5b9ea569f0 100755
--- a/build.sh
+++ b/build.sh
@@ -33,9 +33,10 @@ if grep "release 8.8" /etc/redhat-release; then
CUDA_VERSION=${CUDA_VERSION:-12.6}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
elif grep "release 9.4" /etc/redhat-release; then
- NVDRV_NVML_PKG="libnvidia-ml"
- CUDA_VERSION=${CUDA_VERSION:-12.6}
- CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
+ NVIDIA_DRIVER=${NVIDIA_DRIVER-555.42.06}
+ NVDRV_NVML_PKG="nvidia-driver-NVML${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
+ CUDA_VERSION=${CUDA_VERSION:-12.6}
+ CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
fi
# Prepare directory structure
From 30a0b7d626c6ee4fa28b489620bc8cb8ae45081d Mon Sep 17 00:00:00 2001
From: Andy Georges
Date: Mon, 16 Dec 2024 10:27:24 +0100
Subject: [PATCH 90/90] fix: nvidia version on el8
---
build.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/build.sh b/build.sh
index c5b9ea569f0..9e4b715b3c0 100755
--- a/build.sh
+++ b/build.sh
@@ -28,7 +28,7 @@ OUR_RELEASE=${RELEASE:-1}
# allow _empty_ version, which is used in pipeline
if grep "release 8.8" /etc/redhat-release; then
- NVIDIA_DRIVER=${NVIDIA_DRIVER-550.90.07}
+ NVIDIA_DRIVER=${NVIDIA_DRIVER-555.42.06}
NVDRV_NVML_PKG="nvidia-driver-NVML${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
CUDA_VERSION=${CUDA_VERSION:-12.6}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"