Skip to content

Commit

Permalink
Merge branch 'slurm-24.05' into 24.05.ug-reduce-patch
Browse files Browse the repository at this point in the history
  • Loading branch information
itkovian committed Jan 29, 2025
2 parents c42de33 + 5df9c81 commit 5b1e2d1
Show file tree
Hide file tree
Showing 18 changed files with 166 additions and 210 deletions.
20 changes: 11 additions & 9 deletions src/api/step_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,27 @@
#include <string.h>
#include <time.h>

#include "src/api/step_io.h"
#include "src/api/step_launch.h"

#include "src/common/eio.h"
#include "src/common/fd.h"
#include "src/common/hostlist.h"
#include "src/common/io_hdr.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/net.h"
#include "src/common/pack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/interfaces/cred.h"
#include "src/common/write_labelled_message.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xsignal.h"
#include "src/common/eio.h"
#include "src/common/io_hdr.h"
#include "src/common/net.h"
#include "src/common/write_labelled_message.h"
#include "src/common/xstring.h"

#include "src/api/step_io.h"
#include "src/api/step_launch.h"
#include "src/interfaces/cred.h"

#define STDIO_MAX_FREE_BUF 1024

Expand Down Expand Up @@ -1071,7 +1073,7 @@ _estimate_nports(int nclients, int cli_per_port)
}

client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
int num_nodes, slurm_cred_t *cred,
int num_nodes, char *io_key,
bool label, uint32_t het_job_offset,
uint32_t het_job_task_offset)
{
Expand All @@ -1090,7 +1092,7 @@ client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
else
cio->taskid_width = 0;

cio->io_key = slurm_cred_get_signature(cred);
cio->io_key = xstrdup(io_key);

cio->eio = eio_handle_create(slurm_conf.eio_timeout);

Expand Down
16 changes: 4 additions & 12 deletions src/api/step_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@
#ifndef _HAVE_STEP_IO_H
#define _HAVE_STEP_IO_H

#include <stdint.h>
#include <pthread.h>
#include <stdint.h>

#include "slurm/slurm.h"

#include "src/common/bitstring.h"
#include "src/common/eio.h"
#include "src/common/list.h"
#include "src/common/bitstring.h"
#include "src/common/slurm_step_layout.h"
struct step_launch_state;

struct step_launch_state;

typedef struct {
/* input parameters - set (indirectly) by user */
Expand Down Expand Up @@ -96,16 +96,8 @@ typedef struct {
I/O problem. */
} client_io_t;

/*
* IN cred - cred need not be a real job credential, it may be a "fake"
* credential generated with slurm_cred_faker(). The credential is
* sent to the slurmstepd (via the slurmd) which generates a signature
* string from the credential. The slurmstepd sends the signature back
* to the client when it establishes the IO connection as a sort
* of validity check.
*/
client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
int num_nodes, slurm_cred_t *cred,
int num_nodes, char *io_key,
bool label, uint32_t het_job_offset,
uint32_t het_job_task_offset);

Expand Down
27 changes: 17 additions & 10 deletions src/api/step_launch.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@
#include <pthread.h>
#include <signal.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
Expand All @@ -59,6 +59,9 @@

#include "slurm/slurm.h"

#include "src/api/pmi_server.h"
#include "src/api/step_launch.h"

#include "src/common/cpu_frequency.h"
#include "src/common/eio.h"
#include "src/common/fd.h"
Expand All @@ -68,9 +71,6 @@
#include "src/common/macros.h"
#include "src/common/net.h"
#include "src/common/read_config.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/cred.h"
#include "src/interfaces/mpi.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/slurm_time.h"
Expand All @@ -80,8 +80,9 @@
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

#include "src/api/step_launch.h"
#include "src/api/pmi_server.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/cred.h"
#include "src/interfaces/mpi.h"

#include "src/srun/step_ctx.h"

Expand Down Expand Up @@ -201,6 +202,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
int rc = SLURM_SUCCESS;
bool preserve_env = params->preserve_env;
uint32_t mpi_plugin_id;
char *io_key = NULL;

debug("Entering %s", __func__);
memset(&launch, 0, sizeof(launch));
Expand All @@ -211,6 +213,8 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
return SLURM_ERROR;
}

io_key = slurm_cred_get_signature(ctx->step_resp->cred);

/* Initialize the callback pointers */
if (callbacks != NULL) {
/* copy the user specified callback pointers */
Expand Down Expand Up @@ -360,8 +364,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
ctx->launch_state->io =
client_io_handler_create(params->local_fds,
ctx->step_req->num_tasks,
launch.nnodes,
ctx->step_resp->cred,
launch.nnodes, io_key,
params->labelio,
params->het_job_offset,
params->het_job_task_offset);
Expand Down Expand Up @@ -401,6 +404,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
xfree(launch.io_port);

fail1:
xfree(io_key);
xfree(launch.complete_nodelist);
xfree(launch.cwd);
xfree(launch.stepmgr);
Expand Down Expand Up @@ -430,6 +434,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
uint16_t resp_port = 0;
bool preserve_env = params->preserve_env;
uint32_t mpi_plugin_id;
char *io_key = NULL;

debug("Entering %s", __func__);

Expand All @@ -439,6 +444,8 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
return SLURM_ERROR;
}

io_key = slurm_cred_get_signature(ctx->step_resp->cred);

mpi_plugin_id = mpi_g_client_init((char **)&params->mpi_plugin_name);
if (!mpi_plugin_id) {
slurm_seterrno(SLURM_MPI_PLUGIN_NAME_INVALID);
Expand Down Expand Up @@ -544,8 +551,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
ctx->launch_state->io =
client_io_handler_create(params->local_fds,
ctx->step_req->num_tasks,
launch.nnodes,
ctx->step_resp->cred,
launch.nnodes, io_key,
params->labelio,
params->het_job_offset,
params->het_job_task_offset);
Expand Down Expand Up @@ -586,6 +592,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,

fail1:
/* clean up */
xfree(io_key);
xfree(launch.resp_port);
xfree(launch.io_port);

Expand Down
9 changes: 8 additions & 1 deletion src/common/pack.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,17 @@ void grow_buf(buf_t *buffer, uint32_t size)

extern int try_grow_buf(buf_t *buffer, uint32_t size)
{
uint64_t new_size = ((uint64_t) size) + buffer->size;
uint64_t new_size = buffer->size + BUF_SIZE;

xassert(buffer->magic == BUF_MAGIC);

/*
* Force increase to always be at least BUF_SIZE to reduce number of
* successive xrealloc()s that get called while packing larger RPCs
*/
if (size >= BUF_SIZE)
new_size += size;

if (buffer->mmaped || buffer->shadow)
return EINVAL;
if (new_size > MAX_BUF_SIZE) {
Expand Down
7 changes: 6 additions & 1 deletion src/plugins/data_parser/v0.0.40/parsing.c
Original file line number Diff line number Diff line change
Expand Up @@ -1268,7 +1268,12 @@ static int _dump_pointer(const parser_t *const parser, void *src, data_t *dst,
const parser_t *pt = find_parser_by_type(parser->pointer_type);
void **ptr = src;

if (!*ptr && !is_complex_mode(args)) {
if (!*ptr) {
if (is_complex_mode(args)) {
xassert(data_get_type(dst) == DATA_TYPE_NULL);
return SLURM_SUCCESS;
}

/* Fully resolve pointer on NULL to use correct model */
while (pt->pointer_type)
pt = find_parser_by_type(pt->pointer_type);
Expand Down
77 changes: 24 additions & 53 deletions src/sattach/sattach.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,13 @@ static void _mpir_cleanup(void);
static void _mpir_dump_proctable(void);
static void _pty_restore(void);
static void print_layout_info(slurm_step_layout_t *layout);
static slurm_cred_t *_generate_fake_cred(slurm_step_id_t stepid,
uid_t uid, char *nodelist,
uint32_t node_cnt);
static char *_generate_io_key(void);
static uint32_t _nodeid_from_layout(slurm_step_layout_t *layout,
uint32_t taskid);
static void _set_exit_code(void);
static int _attach_to_tasks(slurm_step_id_t stepid,
slurm_step_layout_t *layout,
slurm_cred_t *fake_cred,
char *io_key,
uint16_t num_resp_ports,
uint16_t *resp_ports,
int num_io_ports,
Expand Down Expand Up @@ -118,11 +116,10 @@ int sattach(int argc, char **argv)
{
log_options_t logopt = LOG_OPTS_STDERR_ONLY;
slurm_step_layout_t *layout;
slurm_cred_t *fake_cred;
message_thread_state_t *mts;
uint32_t jobid, stepid;
client_io_t *io;
char *hosts;
char *io_key = NULL;

slurm_init(NULL);
log_init(xbasename(argv[0]), logopt, 0, NULL);
Expand Down Expand Up @@ -169,16 +166,11 @@ int sattach(int argc, char **argv)
_nodeid_from_layout(layout, opt.fds.input.taskid);
}

if (layout->front_end)
hosts = layout->front_end;
else
hosts = layout->node_list;
fake_cred = _generate_fake_cred(opt.selected_step->step_id,
opt.uid, hosts, layout->node_cnt);
io_key = _generate_io_key();
mts = _msg_thr_create(layout->node_cnt, layout->task_cnt);

io = client_io_handler_create(opt.fds, layout->task_cnt,
layout->node_cnt, fake_cred,
layout->node_cnt, io_key,
opt.labelio, NO_VAL, NO_VAL);
client_io_handler_start(io);

Expand All @@ -197,7 +189,7 @@ int sattach(int argc, char **argv)
xsignal_block(pty_sigarray);
}

_attach_to_tasks(opt.selected_step->step_id, layout, fake_cred,
_attach_to_tasks(opt.selected_step->step_id, layout, io_key,
mts->num_resp_port, mts->resp_port,
io->num_listen, io->listenport,
mts->tasks_started);
Expand All @@ -213,6 +205,7 @@ int sattach(int argc, char **argv)
client_io_handler_finish(io);
client_io_handler_destroy(io);
_mpir_cleanup();
xfree(io_key);

return global_rc;
}
Expand Down Expand Up @@ -278,44 +271,22 @@ static void print_layout_info(slurm_step_layout_t *layout)
hostlist_destroy(nl);
}


/* return a faked job credential */
static slurm_cred_t *_generate_fake_cred(slurm_step_id_t stepid,
uid_t uid, char *nodelist,
uint32_t node_cnt)
/*
* The io_key requires a modest amount of entropy to prevent someone guessing
* it, then racing to initiate a connection to the sattach command.
* By (ab)using the auth token generation mechanisms, the key should be
* sufficiently random for our purposes. (An attacker would need to request
* an auth key be generated at the same time by the same uid on the same host.)
*/
static char *_generate_io_key(void)
{
slurm_cred_t *cred;
slurm_cred_arg_t *arg = xmalloc(sizeof(*arg));

arg->step_id.job_id = stepid.job_id;
arg->step_id.step_id = stepid.step_id;
arg->step_id.step_het_comp = stepid.step_het_comp;
arg->uid = uid;

arg->job_hostlist = nodelist;
arg->job_nhosts = node_cnt;

arg->step_hostlist = nodelist;

arg->job_core_bitmap = bit_alloc(node_cnt);
bit_set_all(arg->job_core_bitmap);
arg->step_core_bitmap = bit_alloc(node_cnt);
bit_set_all(arg->step_core_bitmap);

arg->cores_per_socket = xmalloc(sizeof(uint16_t));
arg->cores_per_socket[0] = 1;
arg->sockets_per_node = xmalloc(sizeof(uint16_t));
arg->sockets_per_node[0] = 1;
arg->sock_core_rep_count = xmalloc(sizeof(uint32_t));
arg->sock_core_rep_count[0] = node_cnt;

cred = slurm_cred_faker(arg);

/* Don't free, this memory will be free'd later */
arg->job_hostlist = NULL;
arg->step_hostlist = NULL;
slurm_cred_free_args(arg);
return cred;
char *key = auth_g_create(AUTH_DEFAULT_INDEX, slurm_conf.authinfo,
0, NULL, 0);

if (!key)
fatal("failed to generate a suitable io_key");

return key;
}

void _handle_response_msg(slurm_msg_type_t msg_type, void *msg,
Expand Down Expand Up @@ -383,7 +354,7 @@ void _handle_response_msg_list(List other_nodes_resp, bitstr_t *tasks_started)
*/
static int _attach_to_tasks(slurm_step_id_t stepid,
slurm_step_layout_t *layout,
slurm_cred_t *fake_cred,
char *io_key,
uint16_t num_resp_ports,
uint16_t *resp_ports,
int num_io_ports,
Expand All @@ -402,7 +373,7 @@ static int _attach_to_tasks(slurm_step_id_t stepid,
reattach_msg.num_resp_port = num_resp_ports;
reattach_msg.resp_port = resp_ports; /* array of response ports */
reattach_msg.num_io_port = num_io_ports;
reattach_msg.io_key = slurm_cred_get_signature(fake_cred);
reattach_msg.io_key = xstrdup(io_key);
reattach_msg.io_port = io_ports;

slurm_msg_set_r_uid(&msg, SLURM_AUTH_UID_ANY);
Expand Down
Loading

0 comments on commit 5b1e2d1

Please sign in to comment.