Skip to content

Commit 5b1e2d1

Browse files
committed
Merge branch 'slurm-24.05' into 24.05.ug-reduce-patch
2 parents c42de33 + 5df9c81 commit 5b1e2d1

File tree

18 files changed

+166
-210
lines changed

18 files changed

+166
-210
lines changed

src/api/step_io.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,25 +39,27 @@
3939
#include <string.h>
4040
#include <time.h>
4141

42+
#include "src/api/step_io.h"
43+
#include "src/api/step_launch.h"
44+
45+
#include "src/common/eio.h"
4246
#include "src/common/fd.h"
4347
#include "src/common/hostlist.h"
48+
#include "src/common/io_hdr.h"
4449
#include "src/common/log.h"
4550
#include "src/common/macros.h"
51+
#include "src/common/net.h"
4652
#include "src/common/pack.h"
4753
#include "src/common/read_config.h"
4854
#include "src/common/slurm_protocol_defs.h"
4955
#include "src/common/slurm_protocol_pack.h"
50-
#include "src/interfaces/cred.h"
56+
#include "src/common/write_labelled_message.h"
5157
#include "src/common/xassert.h"
5258
#include "src/common/xmalloc.h"
5359
#include "src/common/xsignal.h"
54-
#include "src/common/eio.h"
55-
#include "src/common/io_hdr.h"
56-
#include "src/common/net.h"
57-
#include "src/common/write_labelled_message.h"
60+
#include "src/common/xstring.h"
5861

59-
#include "src/api/step_io.h"
60-
#include "src/api/step_launch.h"
62+
#include "src/interfaces/cred.h"
6163

6264
#define STDIO_MAX_FREE_BUF 1024
6365

@@ -1071,7 +1073,7 @@ _estimate_nports(int nclients, int cli_per_port)
10711073
}
10721074

10731075
client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
1074-
int num_nodes, slurm_cred_t *cred,
1076+
int num_nodes, char *io_key,
10751077
bool label, uint32_t het_job_offset,
10761078
uint32_t het_job_task_offset)
10771079
{
@@ -1090,7 +1092,7 @@ client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
10901092
else
10911093
cio->taskid_width = 0;
10921094

1093-
cio->io_key = slurm_cred_get_signature(cred);
1095+
cio->io_key = xstrdup(io_key);
10941096

10951097
cio->eio = eio_handle_create(slurm_conf.eio_timeout);
10961098

src/api/step_io.h

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@
2727
#ifndef _HAVE_STEP_IO_H
2828
#define _HAVE_STEP_IO_H
2929

30-
#include <stdint.h>
3130
#include <pthread.h>
31+
#include <stdint.h>
3232

3333
#include "slurm/slurm.h"
3434

35+
#include "src/common/bitstring.h"
3536
#include "src/common/eio.h"
3637
#include "src/common/list.h"
37-
#include "src/common/bitstring.h"
3838
#include "src/common/slurm_step_layout.h"
39-
struct step_launch_state;
4039

40+
struct step_launch_state;
4141

4242
typedef struct {
4343
/* input parameters - set (indirectly) by user */
@@ -96,16 +96,8 @@ typedef struct {
9696
I/O problem. */
9797
} client_io_t;
9898

99-
/*
100-
* IN cred - cred need not be a real job credential, it may be a "fake"
101-
* credential generated with slurm_cred_faker(). The credential is
102-
* sent to the slurmstepd (via the slurmd) which generates a signature
103-
* string from the credential. The slurmstepd sends the signature back
104-
* to the client when it establishes the IO connection as a sort
105-
* of validity check.
106-
*/
10799
client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
108-
int num_nodes, slurm_cred_t *cred,
100+
int num_nodes, char *io_key,
109101
bool label, uint32_t het_job_offset,
110102
uint32_t het_job_task_offset);
111103

src/api/step_launch.c

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@
4848
#include <pthread.h>
4949
#include <signal.h>
5050
#include <stdarg.h>
51-
#include <stdlib.h>
5251
#include <stdio.h>
52+
#include <stdlib.h>
5353
#include <string.h>
5454
#include <sys/socket.h>
5555
#include <sys/stat.h>
@@ -59,6 +59,9 @@
5959

6060
#include "slurm/slurm.h"
6161

62+
#include "src/api/pmi_server.h"
63+
#include "src/api/step_launch.h"
64+
6265
#include "src/common/cpu_frequency.h"
6366
#include "src/common/eio.h"
6467
#include "src/common/fd.h"
@@ -68,9 +71,6 @@
6871
#include "src/common/macros.h"
6972
#include "src/common/net.h"
7073
#include "src/common/read_config.h"
71-
#include "src/interfaces/auth.h"
72-
#include "src/interfaces/cred.h"
73-
#include "src/interfaces/mpi.h"
7474
#include "src/common/slurm_protocol_api.h"
7575
#include "src/common/slurm_protocol_defs.h"
7676
#include "src/common/slurm_time.h"
@@ -80,8 +80,9 @@
8080
#include "src/common/xmalloc.h"
8181
#include "src/common/xstring.h"
8282

83-
#include "src/api/step_launch.h"
84-
#include "src/api/pmi_server.h"
83+
#include "src/interfaces/auth.h"
84+
#include "src/interfaces/cred.h"
85+
#include "src/interfaces/mpi.h"
8586

8687
#include "src/srun/step_ctx.h"
8788

@@ -201,6 +202,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
201202
int rc = SLURM_SUCCESS;
202203
bool preserve_env = params->preserve_env;
203204
uint32_t mpi_plugin_id;
205+
char *io_key = NULL;
204206

205207
debug("Entering %s", __func__);
206208
memset(&launch, 0, sizeof(launch));
@@ -211,6 +213,8 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
211213
return SLURM_ERROR;
212214
}
213215

216+
io_key = slurm_cred_get_signature(ctx->step_resp->cred);
217+
214218
/* Initialize the callback pointers */
215219
if (callbacks != NULL) {
216220
/* copy the user specified callback pointers */
@@ -360,8 +364,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
360364
ctx->launch_state->io =
361365
client_io_handler_create(params->local_fds,
362366
ctx->step_req->num_tasks,
363-
launch.nnodes,
364-
ctx->step_resp->cred,
367+
launch.nnodes, io_key,
365368
params->labelio,
366369
params->het_job_offset,
367370
params->het_job_task_offset);
@@ -401,6 +404,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
401404
xfree(launch.io_port);
402405

403406
fail1:
407+
xfree(io_key);
404408
xfree(launch.complete_nodelist);
405409
xfree(launch.cwd);
406410
xfree(launch.stepmgr);
@@ -430,6 +434,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
430434
uint16_t resp_port = 0;
431435
bool preserve_env = params->preserve_env;
432436
uint32_t mpi_plugin_id;
437+
char *io_key = NULL;
433438

434439
debug("Entering %s", __func__);
435440

@@ -439,6 +444,8 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
439444
return SLURM_ERROR;
440445
}
441446

447+
io_key = slurm_cred_get_signature(ctx->step_resp->cred);
448+
442449
mpi_plugin_id = mpi_g_client_init((char **)&params->mpi_plugin_name);
443450
if (!mpi_plugin_id) {
444451
slurm_seterrno(SLURM_MPI_PLUGIN_NAME_INVALID);
@@ -544,8 +551,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
544551
ctx->launch_state->io =
545552
client_io_handler_create(params->local_fds,
546553
ctx->step_req->num_tasks,
547-
launch.nnodes,
548-
ctx->step_resp->cred,
554+
launch.nnodes, io_key,
549555
params->labelio,
550556
params->het_job_offset,
551557
params->het_job_task_offset);
@@ -586,6 +592,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
586592

587593
fail1:
588594
/* clean up */
595+
xfree(io_key);
589596
xfree(launch.resp_port);
590597
xfree(launch.io_port);
591598

src/common/pack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,17 @@ void grow_buf(buf_t *buffer, uint32_t size)
214214

215215
extern int try_grow_buf(buf_t *buffer, uint32_t size)
216216
{
217-
uint64_t new_size = ((uint64_t) size) + buffer->size;
217+
uint64_t new_size = buffer->size + BUF_SIZE;
218218

219219
xassert(buffer->magic == BUF_MAGIC);
220220

221+
/*
222+
* Force increase to always be at least BUF_SIZE to reduce number of
223+
* successive xrealloc()s that get called while packing larger RPCs
224+
*/
225+
if (size >= BUF_SIZE)
226+
new_size += size;
227+
221228
if (buffer->mmaped || buffer->shadow)
222229
return EINVAL;
223230
if (new_size > MAX_BUF_SIZE) {

src/plugins/data_parser/v0.0.40/parsing.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1268,7 +1268,12 @@ static int _dump_pointer(const parser_t *const parser, void *src, data_t *dst,
12681268
const parser_t *pt = find_parser_by_type(parser->pointer_type);
12691269
void **ptr = src;
12701270

1271-
if (!*ptr && !is_complex_mode(args)) {
1271+
if (!*ptr) {
1272+
if (is_complex_mode(args)) {
1273+
xassert(data_get_type(dst) == DATA_TYPE_NULL);
1274+
return SLURM_SUCCESS;
1275+
}
1276+
12721277
/* Fully resolve pointer on NULL to use correct model */
12731278
while (pt->pointer_type)
12741279
pt = find_parser_by_type(pt->pointer_type);

src/sattach/sattach.c

Lines changed: 24 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -68,15 +68,13 @@ static void _mpir_cleanup(void);
6868
static void _mpir_dump_proctable(void);
6969
static void _pty_restore(void);
7070
static void print_layout_info(slurm_step_layout_t *layout);
71-
static slurm_cred_t *_generate_fake_cred(slurm_step_id_t stepid,
72-
uid_t uid, char *nodelist,
73-
uint32_t node_cnt);
71+
static char *_generate_io_key(void);
7472
static uint32_t _nodeid_from_layout(slurm_step_layout_t *layout,
7573
uint32_t taskid);
7674
static void _set_exit_code(void);
7775
static int _attach_to_tasks(slurm_step_id_t stepid,
7876
slurm_step_layout_t *layout,
79-
slurm_cred_t *fake_cred,
77+
char *io_key,
8078
uint16_t num_resp_ports,
8179
uint16_t *resp_ports,
8280
int num_io_ports,
@@ -118,11 +116,10 @@ int sattach(int argc, char **argv)
118116
{
119117
log_options_t logopt = LOG_OPTS_STDERR_ONLY;
120118
slurm_step_layout_t *layout;
121-
slurm_cred_t *fake_cred;
122119
message_thread_state_t *mts;
123120
uint32_t jobid, stepid;
124121
client_io_t *io;
125-
char *hosts;
122+
char *io_key = NULL;
126123

127124
slurm_init(NULL);
128125
log_init(xbasename(argv[0]), logopt, 0, NULL);
@@ -169,16 +166,11 @@ int sattach(int argc, char **argv)
169166
_nodeid_from_layout(layout, opt.fds.input.taskid);
170167
}
171168

172-
if (layout->front_end)
173-
hosts = layout->front_end;
174-
else
175-
hosts = layout->node_list;
176-
fake_cred = _generate_fake_cred(opt.selected_step->step_id,
177-
opt.uid, hosts, layout->node_cnt);
169+
io_key = _generate_io_key();
178170
mts = _msg_thr_create(layout->node_cnt, layout->task_cnt);
179171

180172
io = client_io_handler_create(opt.fds, layout->task_cnt,
181-
layout->node_cnt, fake_cred,
173+
layout->node_cnt, io_key,
182174
opt.labelio, NO_VAL, NO_VAL);
183175
client_io_handler_start(io);
184176

@@ -197,7 +189,7 @@ int sattach(int argc, char **argv)
197189
xsignal_block(pty_sigarray);
198190
}
199191

200-
_attach_to_tasks(opt.selected_step->step_id, layout, fake_cred,
192+
_attach_to_tasks(opt.selected_step->step_id, layout, io_key,
201193
mts->num_resp_port, mts->resp_port,
202194
io->num_listen, io->listenport,
203195
mts->tasks_started);
@@ -213,6 +205,7 @@ int sattach(int argc, char **argv)
213205
client_io_handler_finish(io);
214206
client_io_handler_destroy(io);
215207
_mpir_cleanup();
208+
xfree(io_key);
216209

217210
return global_rc;
218211
}
@@ -278,44 +271,22 @@ static void print_layout_info(slurm_step_layout_t *layout)
278271
hostlist_destroy(nl);
279272
}
280273

281-
282-
/* return a faked job credential */
283-
static slurm_cred_t *_generate_fake_cred(slurm_step_id_t stepid,
284-
uid_t uid, char *nodelist,
285-
uint32_t node_cnt)
274+
/*
275+
* The io_key requires a modest amount of entropy to prevent someone guessing
276+
* it, then racing to initiate a connection to the sattach command.
277+
* By (ab)using the auth token generation mechanisms, the key should be
278+
* sufficiently random for our purposes. (An attacker would need to request
279+
* an auth key be generated at the same time by the same uid on the same host.)
280+
*/
281+
static char *_generate_io_key(void)
286282
{
287-
slurm_cred_t *cred;
288-
slurm_cred_arg_t *arg = xmalloc(sizeof(*arg));
289-
290-
arg->step_id.job_id = stepid.job_id;
291-
arg->step_id.step_id = stepid.step_id;
292-
arg->step_id.step_het_comp = stepid.step_het_comp;
293-
arg->uid = uid;
294-
295-
arg->job_hostlist = nodelist;
296-
arg->job_nhosts = node_cnt;
297-
298-
arg->step_hostlist = nodelist;
299-
300-
arg->job_core_bitmap = bit_alloc(node_cnt);
301-
bit_set_all(arg->job_core_bitmap);
302-
arg->step_core_bitmap = bit_alloc(node_cnt);
303-
bit_set_all(arg->step_core_bitmap);
304-
305-
arg->cores_per_socket = xmalloc(sizeof(uint16_t));
306-
arg->cores_per_socket[0] = 1;
307-
arg->sockets_per_node = xmalloc(sizeof(uint16_t));
308-
arg->sockets_per_node[0] = 1;
309-
arg->sock_core_rep_count = xmalloc(sizeof(uint32_t));
310-
arg->sock_core_rep_count[0] = node_cnt;
311-
312-
cred = slurm_cred_faker(arg);
313-
314-
/* Don't free, this memory will be free'd later */
315-
arg->job_hostlist = NULL;
316-
arg->step_hostlist = NULL;
317-
slurm_cred_free_args(arg);
318-
return cred;
283+
char *key = auth_g_create(AUTH_DEFAULT_INDEX, slurm_conf.authinfo,
284+
0, NULL, 0);
285+
286+
if (!key)
287+
fatal("failed to generate a suitable io_key");
288+
289+
return key;
319290
}
320291

321292
void _handle_response_msg(slurm_msg_type_t msg_type, void *msg,
@@ -383,7 +354,7 @@ void _handle_response_msg_list(List other_nodes_resp, bitstr_t *tasks_started)
383354
*/
384355
static int _attach_to_tasks(slurm_step_id_t stepid,
385356
slurm_step_layout_t *layout,
386-
slurm_cred_t *fake_cred,
357+
char *io_key,
387358
uint16_t num_resp_ports,
388359
uint16_t *resp_ports,
389360
int num_io_ports,
@@ -402,7 +373,7 @@ static int _attach_to_tasks(slurm_step_id_t stepid,
402373
reattach_msg.num_resp_port = num_resp_ports;
403374
reattach_msg.resp_port = resp_ports; /* array of response ports */
404375
reattach_msg.num_io_port = num_io_ports;
405-
reattach_msg.io_key = slurm_cred_get_signature(fake_cred);
376+
reattach_msg.io_key = xstrdup(io_key);
406377
reattach_msg.io_port = io_ports;
407378

408379
slurm_msg_set_r_uid(&msg, SLURM_AUTH_UID_ANY);

0 commit comments

Comments
 (0)