diff --git a/docs/Makefile.am b/docs/Makefile.am index f19e7f43d2..6ef9e92e61 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2022-2023 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # -# Copyright (c) 2023 Nanook Consulting. All rights reserved. +# Copyright (c) 2023-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -39,6 +39,8 @@ RST_SOURCE_FILES = \ $(srcdir)/prrte-rst-content/*.rst \ $(srcdir)/placement/*.rst \ $(srcdir)/hosts/*.rst \ + $(srcdir)/how-things-work/*.rst \ + $(srcdir)/how-things-work/schedulers/*.rst \ $(srcdir)/developers/*.rst \ $(srcdir)/man/*.rst \ $(srcdir)/man/man1/*.rst \ diff --git a/docs/how-things-work/index.rst b/docs/how-things-work/index.rst new file mode 100644 index 0000000000..16fababe8c --- /dev/null +++ b/docs/how-things-work/index.rst @@ -0,0 +1,13 @@ +How Things Work +=============== + +This section explains how some of the functions inside +PRRTE perform their job. If you have questions about, for +example, how session directory trees work, then you will +find information on that subject here. + +.. toctree:: + :maxdepth: 2 + + session_dirs.rst + schedulers/index.rst diff --git a/docs/how-things-work/schedulers/flow_of_control.rst b/docs/how-things-work/schedulers/flow_of_control.rst new file mode 100644 index 0000000000..0033be602b --- /dev/null +++ b/docs/how-things-work/schedulers/flow_of_control.rst @@ -0,0 +1,28 @@ +Flow-of-Control +=============== + +Describe how allocation requests are flowed to the scheduler, +scheduler attachment upon startup of either side, where response +to allocation requests are returned and how they get back to +the requestor, where session controls are received for allocation +instantiation and how that is done, where session control is +called to indicate all jobs complete in session. 
+ +Probably best to start with an overview of how things flow +and then drill down into the respective steps. + +Allocation requests have to be serviced by the scheduler. Since +a requestor could attach directly to the scheduler (e.g., in the +case of a tool submitting a session request) or to a daemon within +the RM (who must then relay it to its system controller for forwarding +to the scheduler), the scheduler must inform the RM of the allocation +via the ``PMIx_Session_control`` API - i.e., the RM cannot guarantee +its ability to intercept and process an allocation response to learn +of a session that needs to be instantiated. + +PRRTE's role in the ``PMIx_Allocation_request`` flow is simply to +pass the request on to the scheduler, and transport the reply back +to the requestor. + +PRRTE only creates a session object as a result of a call from the +scheduler via ``PMIx_Session_control``. diff --git a/docs/how-things-work/schedulers/index.rst b/docs/how-things-work/schedulers/index.rst new file mode 100644 index 0000000000..564a554ff8 --- /dev/null +++ b/docs/how-things-work/schedulers/index.rst @@ -0,0 +1,14 @@ + +Scheduler Integration +===================== + +This section describes PRRTE support for interactions +with schedulers, including PRRTE's own ``psched`` +pseudo-scheduler. + +.. toctree:: + :maxdepth: 2 + + overview.rst + structures.rst + flow_of_control.rst diff --git a/docs/how-things-work/schedulers/overview.rst b/docs/how-things-work/schedulers/overview.rst new file mode 100644 index 0000000000..bc0bd38f7c --- /dev/null +++ b/docs/how-things-work/schedulers/overview.rst @@ -0,0 +1,23 @@ +Overview +======== + +While PRRTE does not itself contain a scheduler, it does support scheduler-related +operations as a means of providing users and researchers with an environment within +which they can investigate/explore dynamic operations. 
It will, of course, interact +with any PMIx-based scheduler to support allocation requests and directives for +executing jobs within an allocation. + +In addition, however, PRRTE provides a pseudo-scheduler for research purposes. The +``psched`` daemon is not a full scheduler - i.e., it does not provide even remotely +optimal resource assignments, nor is it in any way production qualified. For example, +the ``psched`` algorithms are not particularly fast and are focused more on providing +functionality than supporting large volumes of requests. + +Within that context, ``psched`` will: + +* assemble a resource pool based on the usual PRRTE resource discovery methods. This + includes reading the allocation made by a host environment, or provided by hostfile + and/or dash-host command line options. + +* upon receipt of allocation requests (either directly from tools or relayed by PRRTE), + check to see if the specified resources are available. diff --git a/docs/how-things-work/schedulers/structures.rst b/docs/how-things-work/schedulers/structures.rst new file mode 100644 index 0000000000..1e16df52d9 --- /dev/null +++ b/docs/how-things-work/schedulers/structures.rst @@ -0,0 +1,270 @@ +Objects and Definitions +======================= + +The following objects and definitions are used in PRRTE for support of +scheduler operations. Where applicable, modifications to previously +existing structures are highlighted. + + +prte_session_t +-------------- +PMIx "sessions" equate to a scheduler's allocation - i.e., a session consists +of a collection of assigned resources within which the RTE will execute one +or more jobs. Sessions can be related to one another - i.e., a session can +be defined as a result of a ``PMIx_Allocation_request`` from a process within +an existing session. This relationship must be tracked as termination of the +parent session mandates termination of all child sessions, unless the user +requested that child sessions continue on their own. 
+ +PRRTE defines the following structure for tracking sessions: + +.. code:: c + + typedef struct{ + pmix_object_t super; + uint32_t session_id; + char *user_refid; + char *alloc_refid; + pmix_pointer_array_t *nodes; + pmix_pointer_array_t *jobs; + pmix_pointer_array_t *children; + } prte_session_t; + +with the following fields: + +* super: the usual PRRTE object header + +* session_id: the numerical ID assigned by the scheduler to + the session. Schedulers can assign both a numerical ID and + a string ID for a session - the two do not necessarily relate + to one another. Both IDs are provided as the numerical ID + can provide a faster lookup of the session, while the string ID + is considered more user-friendly. Note that the scheduler is + required to provide the numerical ID, but the string ID is + optional. The session ID is included in both the response to + the allocation request and in the ``PMIx_Session_control`` + directive given to the RTE via the ``PMIX_SESSION_ID`` attribute. + +* user_refid: the user's string ID provided to the allocation request + that generated this session via the ``PMIX_ALLOC_REQ_ID``. This will + be ``NULL`` if the user chose not to provide the ID. + +* alloc_refid: the string ID assigned by the scheduler to the session. + Note that a scheduler is not required to assign such an ID. It is + communicated in the ``PMIx_Session_control`` directive to the RTE + via the ``PMIX_ALLOC_ID`` attribute. + +* nodes: a pointer array containing pointers to `prte_node_t` objects + describing the individual nodes included in this session. Nodes can + belong to multiple sessions, although this isn't common in HPC (most + facilities configure their schedulers for sole-occupancy of resources). + +* jobs: a pointer array containing pointers to `prte_job_t` objects + describing the individual jobs executing within the session. 
Note that + jobs are not allowed to operate across sessions, though they may have + child jobs (i.e., jobs spawned by a process from within the parent job) + executing in another session. + +* children: a pointer array containing pointers to `prte_session_t` objects + that describe sessions resulting from a call to ``PMIx_Allocation_request`` + by a process executing within this session. Child sessions are terminated + by default upon completion of their parent session - this behavior can be + overridden by including the ``PMIX_ALLOC_CHILD_SEP`` attribute when calling + ``PMIx_Allocation_request``, or by issuing a ``PMIx_Session_control`` request + with the ``PMIX_SESSION_SEP`` attribute. + +The session object is created upon receiving a ``PMIx_Session_control`` +directive from the scheduler - this occurs in ``src/prted/pmix/pmix_server_session.c`` +as an upcall from the PMIx server library. Session objects are stored in the +``prte_sessions`` global pointer array. + + + +prte_job_t +---------- +Extended to include pointer to the session within which this job is executing. +Note addition of ``PMIX_SPAWN_CHILD_SEP`` and ``PMIX_JOB_CTRL_SEP`` attributes. + +.. 
code:: c + + typedef struct{ + pmix_list_item_t super; + int exit_code; + char **personality; + struct prte_schizo_base_module_t *schizo; + pmix_nspace_t nspace; + char *session_dir; + int index; + pmix_rank_t offset; + prte_session_t *session; <======== ADDED + pmix_pointer_array_t *apps; + prte_app_idx_t num_apps; + pmix_rank_t stdin_target; + int32_t total_slots_alloc; + pmix_rank_t num_procs; + pmix_pointer_array_t *procs; + struct prte_job_map_t *map; + prte_node_t *bookmark; + prte_job_state_t state; + pmix_rank_t num_mapped; + pmix_rank_t num_launched; + pmix_rank_t num_reported; + pmix_rank_t num_terminated; + pmix_rank_t num_daemons_reported; + pmix_rank_t num_ready_for_debug; + pmix_proc_t originator; + pmix_rank_t num_local_procs; + prte_job_flags_t flags; + pmix_list_t attributes; + pmix_data_buffer_t launch_msg; + pmix_list_t children; + pmix_nspace_t launcher; + uint32_t ntraces; + char **traces; + pmix_cli_result_t cli; + } prte_job_t; + +with the following fields: + +* super: the usual PRRTE list item header so the object can be + included on a PRRTE list + +* exit_code: the exit code for the job. This is usually taken as + the exit code from the first process to exit with a non-zero + status + +* personality: a string indicating the schizo component to be + used for parsing this job's command line (if applicable), + harvesting envars, and generally setting up the job + +* schizo: a pointer to the schizo module itself + +* nspace: the namespace of the job + +* session_dir: the job-level session directory assigned to the job + +* index: the position of this job object in the global ``prte_job_data`` + pointer array + +* offset: offset to the total number of procs so shared memory + components can potentially connect to any spawned jobs + +* session: (**ADDED**) pointer to the session within which this job + is executing. This is provided to accelerate lookup operations when + referencing the session behind a given job. + + .. 
warning:: + + One must `not` + ``PMIX_RETAIN`` the ``prte_session_t`` object before assigning it to + this field. Session objects clean up `after` all of their included + jobs terminate and clean up - a circular dependency can be created + that prevents job and session objects from executing their destructors. + The ``prte_job_t`` destructor will `not` release the ``session`` field. + +* apps: a pointer array containing pointers to the ``prte_app_context_t`` + objects describing the applications executing within this job. + +* num_apps: the number of applications executing within this job + +* stdin_target: the rank of the process that is to receive forwarded stdin + data. A rank of ``PMIX_RANK_WILDCARD`` indicates that all processes in + the job are to receive a copy of the data. + +* total_slots_alloc: the sum total of all available slots on the nodes + assigned to this job. Note that a job does not necessarily have access + to all resources assigned to the session within which the job is executing. + The job's resources can be modified by hostfile, add-hostfile, dash-host, + and add-host directives. + +* procs: a pointer array containing pointers to the ``prte_proc_t`` objects + describing the individual processes executing as part of the job + +* num_procs: the number of processes executing within the job + +* map: a pointer to the job map detailing the location and binding of + each process within the job + +* bookmark: bookmark for where we are in mapping - this indicates the last + node used to map the job. Should a process within the job initiate a + "spawn" request, mapping of the spawned job will commence from this + point, assuming that the resource list for the new job includes the + bookmark location. + +* state: the PRRTE state of the overall job. This is the state within the + PRRTE state machine within which the job is currently executing. 
+ +* num_mapped: bookkeeping counter used in the mapper subsystem + +* num_launched: bookkeeping counter used during job launch + +* num_reported: number of processes that have called ``PMIx_Init`` + +* num_terminated: bookkeeping counter of process termination + +* num_daemons_reported: bookkeeping counter of number of daemons + spawned in support of the job that have reported "ready" + +* num_ready_for_debug: bookkeeping counter of number of processed + that have registered as ready for debug + +* originator: ID of process that requested spawn of this job + +* num_local_procs: bookkeeping counter of the number of processes + from this job on the local node + +* flags: set of bit-mapped flags used internally by PRRTE + +* attributes: list of job attributes controlling the job behavior + +* launch_msg: copy of the message sent to all daemons to launch + the job's processes + +* children: list of `prte_job_t` describing the jobs that have been + started by processes executing within this parent job. + +* launcher: the namespace of the tool that requested this job be + started + +* ntraces: number of stacktraces collected when PRRTE is asked to + collect stacktraces from failed processes + +* traces: the actual collected stacktraces from failed processes + +* cli: the results of parsing the command line used to generate + this job - only valid when the job is started from the ``prterun`` + command line + + +The job object is created in two places in the DVM system controller +(a.k.a, "master" daemon): + +* upon directly receiving a ``PMIx_Spawn`` request in the PMIx server + library, which is then upcalled in ``src/prted/pmix/pmix_server_dyn.c``. + In this case, the job object is used to assemble the job description + based on the PMIx attributes passed up to the PRRTE function. The + object is subsequently packed and sent to the DVM master for processing - if + the server itself is the DVM master, then it will just be sent to itself. 
+ This object is a temporary holding place for the job description and + will be released upon completion of the spawn. + +* while unpacking a relayed spawn request from another daemon in the + DVM (who received the request from a local client or tool) or a + "send-to-self" from the above function, in + ``src/mca/plm/base/plm_base_receive.c``. The job object is assigned + the relevant ``prte_session_t`` object based on the following (in + order of priority): + + * the session ID, if specified + * the allocation ID, if given + * the user's allocation reference ID, if given + * the session of the parent job, if the spawn requestor is an + application process (and therefore has a parent job) + * the default session, which is composed of the global node pool, if + the spawn requestor is a tool + +A job is required to be assigned to a session - if no session is found, +or the specified session is unknown to PRRTE, then the spawn request +will be denied with an appropriate error code. + + diff --git a/docs/how-things-work/session_dirs.rst b/docs/how-things-work/session_dirs.rst new file mode 100644 index 0000000000..f5d5ed6637 --- /dev/null +++ b/docs/how-things-work/session_dirs.rst @@ -0,0 +1,115 @@ +.. _session-dir-detail-label: + +Session Directories +=================== + +In general, servers, tools, and application processes all have access to their own ``session directory`` - a location where scratch files can be safely placed with a reasonable guarantee of automatic cleanup upon termination. Session directories provide a safe location (i.e., in a temporary file system and guaranteed not to conflict with other sessions/jobs/applications) for executables to use when creating scratch files such as shared memory backing files and rendezvous files. PMIx and PRRTE also provide a reasonable guarantee that any files and/or subdirectories created under the specified location will be automatically cleaned up at finalize and/or termination. 
In this case, ``reasonable`` means that we will do our best to remove all files and subdirectories, but cannot fully guarantee removal in situations outside of our control (e.g., being forcibly terminated via `SIGKILL`). + + .. note:: In general, the host (e.g., PRRTE) is responsible for creating session + directories. In some cases, PMIx creates a limited set of session + directories if the host does not provide them - e.g., in the case of a + self-launched tool - for storing contact information. This is outlined + below. + +The following attributes can be used to pass session directory information to the PMIx library: + +* ``PMIX_SYSTEM_TMPDIR``: temporary directory for this system (typically ``/tmp`` for Linux systems). + The PMIx server will place tool rendezvous points and contact info in this location. In the + absence of this attribute during ``PMIx_Init`` (or the server/tool version), the PMIx library + will first look for the ``TMPDIR`` envar, then ``TEMP``, and finally ``TMP`` - if none of + those are found, the library will default to the ``/tmp`` location. + +* ``PMIX_SERVER_TMPDIR``: session directory where the PMIx server will place client rendezvous + points and contact info. If not provided, the library will first look for the ``PMIX_SERVER_TMPDIR`` + envar, then ``TMPDIR``, ``TEMP``, and finally ``TMP`` - if none of those are found, the + library will default to the ``/tmp`` location. + +* ``PMIX_TMPDIR``: top-level temporary directory assigned to a session. Often equated to + ``PMIX_SERVER_TMPDIR`` by host environments. + +* ``PMIX_NSDIR``: session directory assigned to a namespace. Usually placed underneath the + ``PMIX_SERVER_TMPDIR`` and given a name based on the namespace itself. + +* ``PMIX_PROCDIR``: session directory assigned to an individual process. Usually placed underneath + the ``PMIX_NSDIR`` assigned to the namespace of the process, and given a string name equivalent + to the rank of the process. 
+ +* ``PMIX_LAUNCHER_RENDEZVOUS_FILE``: the full path name of a file wherein a tool shall output its + connection information - e.g., a launcher to store contact information so that a debugger + can attach to it. + +* ``PMIX_TDIR_RMCLEAN``: the host environment (often known as the "resource manager" or "RM") will + cleanup the session directories. In the absence of this attribute, the PMIx library will + remove all files in any session directory it created and then remove the directory itself. + +These same attributes can be used in calls to ``PMIx_Get`` by application processes to retrieve +the session directory locations for their own use. + + +Client Session Directories +-------------------------- + +The PMIx client library does not create its own session directories as it does not publish +contact information. Host daemons often create one or more session directory levels for use +by client application processes (e.g., for storing shared memory backing files) - the location +of those directories is passed to clients using the above attributes. + +As the client library never creates session directories, it does not perform any cleanup of +the session directory tree. + + +Tool and Server Session Directories +----------------------------------- + +The PMIx library utilizes appropriate session directory locations to store one or more +"rendezvous files" - i.e., files containing connection information. Note that the library does not +always create all levels of the session directory tree, although the process +itself can of course create directories as it sees fit. Only the directories that (a) are required +for generating the particular rendezvous file, and (b) do not already exist are created. Only +directories actually created by the library are cleaned up and removed upon finalization. 
+ +The following rendezvous files are provided: + +* if ``PMIX_LAUNCHER_RENDEZVOUS_FILE`` is given (either via an attribute to an "init" function + or as an envar), then the specified file (including any required path elements) will be created. + +* if the server is designated as a "system" server (i.e., the ``PMIX_SERVER_SYSTEM_SUPPORT`` attribute + was provided to ``PMIx_server_init``), then a rendezvous file named "pmix.sys." will be + created in the ``PMIX_SYSTEM_TMPDIR`` location. + +* if the server is designated as a "session" server (i.e., the ``PMIX_SERVER_SESSION_SUPPORT`` attribute + was provided to ``PMIx_server_init``), then a rendezvous file named "pmix.sys." will be + created in the ``PMIX_SERVER_TMPDIR`` location. + +* if the server declares that it will support tool connections (i.e., the ``PMIX_SERVER_TOOL_SUPPORT`` + attribute was provided to ``PMIx_server_init``), then the following rendezvous files will be created + under the ``PMIX_SERVER_TMPDIR`` location: + + * a PID file: "pmix.." + + * a namespace file using the nspace of the server: "pmix.." + +* if the server is designated as a "scheduler" (i.e., the ``PMIX_SERVER_SCHEDULER`` attribute + was provided to ``PMIx_server_init``), then a rendezvous file named "pmix.sched." will be + created in the ``PMIX_SYSTEM_TMPDIR`` location. + +* if the server is designated as a "system controller" (i.e., the ``PMIX_SERVER_SYS_CONTROLLER`` attribute + was provided to ``PMIx_server_init``), then a rendezvous file named "pmix.sysctrlr." will be + created in the ``PMIX_SYSTEM_TMPDIR`` location. + + .. note:: The above rendezvous files are additive - i.e., generating any one of the files has + no bearing on whether another file will be output. Thus, a single server could + generate anywhere from one to five (or more) rendezvous files spanning several + directory levels. + + .. 
warning:: There is a potential conflict in rendezvous file names - e.g., if multiple processes + declare themselves to be a "session" server on the same node. The format of the + names used by PMIx are intended to support tool connection in the absence of specific + connection directives - i.e., they provide a means by which PMIx can search for and + find the rendezvous file for a particular type of process without requiring the user + to manually identify it. Thus, a tool can request connection to the "system controller" + without necessarily knowing the PID of that process and PMIx can facilitate the + connection. As a result, only one system server can be operating on a node at a time. + This is also (independently) true for schedulers and system controllers. + + diff --git a/docs/index.rst b/docs/index.rst index 316cfd1e86..dbd58d58dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,6 +31,8 @@ Table of contents getting-help install configuration + terminology + how-things-work/index hosts/index placement/index notifications diff --git a/docs/session-directory.rst b/docs/session-directory.rst index 34dbbb5c7d..8f47a635fd 100644 --- a/docs/session-directory.rst +++ b/docs/session-directory.rst @@ -8,6 +8,9 @@ daemon and its child processes. This is done to enable quick and easy cleanup in the event that PRRTE is unable to fully cleanup after itself. +More detail on session directories is provided in the How Things Work +:ref:`session directory ` section. + Directory location ------------------ @@ -20,6 +23,13 @@ examining the following (in precedence order): .. note:: MCA parameters can be set via environment variables, on the command line, or in a parameter file. + .. 
note:: If necessary, the value of the top session directory on + the local node where the launcher (e.g., ``prun``, ``prterun``, + or ``mpirun``) is executing can be set separately from + the value to be used on compute nodes via the + ``prte_local_tmpdir_base`` and ``prte_remote_tmpdir_base`` + parameters. + #. If the environment variable ``TMPDIR`` is not empty, use that. #. If the environment variable ``TEMP`` is not empty, use that. #. If the environment variable ``TMP`` is not empty, use that. @@ -32,17 +42,24 @@ By default, the session directory name is set to .. code:: - prte.. + ... -The session directory name can further be altered to include the PID -of the daemon process, if desired: +where `tool` is the argv[0] of the process setting up the +session directory. In most cases, this will be either `prte`, +`prterun`, or `prted` - though special tools such as `psched` +may also create a session directory tree. -.. code:: +The session directory name includes the PID +of the daemon process to allow a user to have multiple +instances of a tool concurrently executing on a node. + +.. note:: - prte... + Each tool will generate its own session directory tree. This + is done to avoid cleanup race conditions where one tool might + cleanup the session directory, and thereby remove the contact + information for a tool that is continuing to execute. -by setting the ``prte_add_pid_to_session_dirname`` MCA parameter to a -"true" value (e.g., 1). Tools ----- diff --git a/docs/terminology.rst b/docs/terminology.rst new file mode 100644 index 0000000000..8ca77d6e3f --- /dev/null +++ b/docs/terminology.rst @@ -0,0 +1,29 @@ +Terminology +================= + +PRRTE uses terms adopted from PMIx to describe its operations. Terms include: + +``Workload Manager (WLM)`` is often called the ``scheduler`` and is responsible for +scheduling and assigning resources. 
+ +``Resource Manager (RM)`` is the runtime environment (RTE) + +``Session`` refers to a set of resources assigned by the WLM that has been +reserved for one or more users. +A session is identified by a `session ID` that is +unique within the scope of the governing WLM. +Historically, HPC sessions have consisted of a static allocation of resources - i.e., a block of resources assigned to a user in response to a specific request and managed as a unified collection. However, this is changing in response to the growing use of dynamic programming models that require on-the-fly allocation and release of system resources. Accordingly, the term ``session`` in this project refers to a potentially dynamic entity, perhaps comprised of resources accumulated as a result of multiple allocation requests that are managed as a single unit by the WLM. + +``Job`` refers to a set of one or more ``applications`` executed as a single invocation by the user within a session with a unique identifier, the ``job ID``, assigned by the RM or launcher. For example, the command line `mpiexec -n 1 app1 : -n 2 app2` generates a single MPMD job containing two applications. A user may execute multiple jobs within a given session, either sequentially or concurrently. + +``Namespace`` refers to a character string value assigned by the RM to a job. All applications executed as part of that job share the same namespace. The namespace assigned to each job must be unique within the scope of the governing RM and often is implemented as a string representation of the numerical ``Job ID``. The namespace and job terms will be used interchangeably throughout the project. + +``Application`` represents a set of identical, but not necessarily unique, +execution contexts within a job. + +``Process`` is assumed for ease of presentation to be an operating system process, also commonly referred to as a heavyweight process. 
A process is often comprised of multiple lightweight threads, commonly known as simply `threads`. + +``Client`` refers to a process that was registered with the PMIx server prior to being started, and connects to that PMIx server via ``PMIx_Init`` using its assigned namespace and rank with the information required to connect to that server being provided to the process at time of start of execution. + +``Tool`` refers to a process that may or may not have been registered with the PMIx server prior to being started and initializes using ``PMIx_tool_init``. + diff --git a/src/mca/ess/hnp/ess_hnp_module.c b/src/mca/ess/hnp/ess_hnp_module.c index 36fef91498..0adbb711c6 100644 --- a/src/mca/ess/hnp/ess_hnp_module.c +++ b/src/mca/ess/hnp/ess_hnp_module.c @@ -184,6 +184,9 @@ static int rte_init(int argc, char **argv) goto error; } + /* Set the session of the daemon job to the default session */ + jdata->session = prte_default_session; + /* mark that the daemons have reported as we are the * only ones in the system right now, and we definitely * are running! diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index 4dade0fac7..d8f31848e7 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -17,7 +17,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * @@ -149,8 +149,8 @@ void prte_plm_base_daemons_reported(int fd, short args, void *cbdata) */ if (!prte_managed_allocation || prte_set_slots_override) { caddy->jdata->total_slots_alloc = 0; - for (i = 0; i < prte_node_pool->size; i++) { - node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, i); + for (i = 0; i < caddy->jdata->session->nodes->size; i++) { + node = (prte_node_t *) pmix_pointer_array_get_item(caddy->jdata->session->nodes, i); if (NULL == node) { continue; } diff --git a/src/mca/plm/base/plm_base_receive.c b/src/mca/plm/base/plm_base_receive.c index 740fc02764..05af9099a3 100644 --- a/src/mca/plm/base/plm_base_receive.c +++ b/src/mca/plm/base/plm_base_receive.c @@ -64,6 +64,8 @@ #include "src/mca/plm/plm.h" #include "src/mca/plm/plm_types.h" +#include "src/prted/pmix/pmix_server_internal.h" + static bool recv_issued = false; int prte_plm_base_comm_start(void) @@ -118,6 +120,7 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, prte_plm_cmd_flag_t command; int32_t count; pmix_nspace_t job; + prte_session_t *session; prte_job_t *jdata, *parent, jb; pmix_data_buffer_t *answer; pmix_rank_t vpid; @@ -125,13 +128,14 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, prte_proc_state_t state; prte_exit_code_t exit_code; int32_t rc = PRTE_SUCCESS, ret; + uint32_t ui32, *ui32_ptr; prte_app_context_t *app, *child_app; pmix_proc_t name, *nptr; pid_t pid; bool debugging, found; int i, room, *rmptr = &room; char **env; - char *prefix_dir, *tmp; + char *prefix_dir, *tmp, *endptr; pmix_rank_t tgt, *tptr; pmix_value_t pidval = PMIX_VALUE_STATIC_INIT; PRTE_HIDE_UNUSED_PARAMS(status, tag, cbdata); @@ -232,6 +236,74 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, goto ANSWER_LAUNCH; } + /* If an alloc id was given specifying the session within which + * the job is to be spawned, then use it - otherwise default to parent session */ + session = NULL; + ui32_ptr = &ui32; + if (prte_get_attribute(&jdata->attributes, 
PRTE_JOB_SESSION_ID, (void **) &ui32_ptr, PMIX_UINT32)) { + session = prte_get_session_object(ui32); + if (NULL == session) { + /* if the caller specified a session and we don't know about it, then + * that is an unrecoverable error */ + rc = PRTE_ERR_NOT_FOUND; + goto ANSWER_LAUNCH; + } + + } else if (prte_get_attribute(&jdata->attributes, PRTE_JOB_ALLOC_ID, (void **) &tmp, PMIX_STRING)) { + session = prte_get_session_object_from_id(tmp); + if (NULL == session) { + /* if the caller specified a session and we don't know about it, then + * that is an unrecoverable error */ + rc = PRTE_ERR_NOT_FOUND; + goto ANSWER_LAUNCH; + } + + } else if (prte_get_attribute(&jdata->attributes, PRTE_JOB_REF_ID, (void **) &tmp, PMIX_STRING)) { + session = prte_get_session_object_from_refid(tmp); + if (NULL == session) { + /* if the caller specified a session and we don't know about it, then + * that is an unrecoverable error */ + rc = PRTE_ERR_NOT_FOUND; + goto ANSWER_LAUNCH; + } + + } else { + /* try defaulting to parent session */ + if (NULL != (parent = prte_get_job_data_object(nptr->nspace))) { + session = parent->session; + + // (RHC) This next clause merits some thought - not sure I fully + // understand the conditionals + } else if (!prte_pmix_server_globals.scheduler_connected || + PMIX_CHECK_PROCID(nptr, &prte_pmix_server_globals.scheduler)) { + /* The proc requesting the spawn is a tool, hence does not have a session. + * If we don't have a scheduler connected (or the tool itself is the scheduler) we allow + * it to spawn into the default session, i.e. the global node pool + */ + session = prte_default_session; + } else { + PRTE_ERROR_LOG(PRTE_ERR_PERM); + rc = PRTE_ERR_PERM; + goto ANSWER_LAUNCH; + } + } + +#if 0 + // (RHC) I'm not sure the following is true - merits some thought + + /* Jobs are only allowed to be spawned in the the session of the requestor + * or one of its child sessions. 
*/ + if (NULL == session || + !prte_sessions_related(prte_get_job_data_object(nptr->nspace)->session, session)) { + PRTE_ERROR_LOG(PRTE_ERR_PERM); + rc = PRTE_ERR_PERM; + goto ANSWER_LAUNCH; + } +#endif + + jdata->session = session; + pmix_pointer_array_add(jdata->session->jobs, jdata); + /* get the parent's job object */ if (NULL != (parent = prte_get_job_data_object(nptr->nspace)) && !PMIX_CHECK_NSPACE(parent->nspace, PRTE_PROC_MY_NAME->nspace)) { diff --git a/src/mca/ras/base/ras_base_allocate.c b/src/mca/ras/base/ras_base_allocate.c index bc9db628f5..7b759a7cfa 100644 --- a/src/mca/ras/base/ras_base_allocate.c +++ b/src/mca/ras/base/ras_base_allocate.c @@ -15,7 +15,7 @@ * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * @@ -975,6 +975,7 @@ int prte_ras_base_add_hosts(prte_job_t *jdata) node = PMIX_NEW(prte_node_t); node->name = strdup(cptr); node->slots = slots; + node->state = PRTE_NODE_STATE_ADDED; pmix_list_append(&nodes, &node->super); } free(line); @@ -998,7 +999,7 @@ int prte_ras_base_add_hosts(prte_job_t *jdata) /* We next check for and add any add-host options. Note this is * a -little- different than dash-host in that (a) we add these - * nodes to the global pool regardless of what may already be there, + * nodes to the global pool (avoiding duplication), * and (b) as a result, any job and/or app_context can access them. 
* * Note that any relative node syntax found in the add-host lists will diff --git a/src/mca/rmaps/base/rmaps_base_support_fns.c b/src/mca/rmaps/base/rmaps_base_support_fns.c index 833aea2262..0f13070e2c 100644 --- a/src/mca/rmaps/base/rmaps_base_support_fns.c +++ b/src/mca/rmaps/base/rmaps_base_support_fns.c @@ -54,7 +54,7 @@ int prte_rmaps_base_filter_nodes(prte_app_context_t *app, pmix_list_t *nodes, bool remove) { int rc = PRTE_ERR_TAKE_NEXT_OPTION; - char *hosts; + char *hosts, *alloc_id; /* did the app_context contain a hostfile? */ hosts = NULL; @@ -180,18 +180,18 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, PMIX_DESTRUCT(&nodes); return PRTE_ERR_SILENT; } - /* find the nodes in our node array and assemble them + /* find the nodes in the session and assemble them * in list order as that is what the user specified. Note * that the prte_node_t objects on the nodes list are not * fully filled in - they only contain the user-provided * name of the node as a temp object. Thus, we cannot just * check to see if the node pointer matches that of a node - * in the node_pool. + * in the session. */ PMIX_LIST_FOREACH_SAFE(nptr, next, &nodes, prte_node_t) { - for (i = 0; i < prte_node_pool->size; i++) { - node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, i); + for (i = 0; i < jdata->session->nodes->size; i++) { + node = (prte_node_t *) pmix_pointer_array_get_item(jdata->session->nodes, i); if (NULL == node) { continue; } @@ -263,24 +263,11 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, * this point either has the HNP node or nothing, and the HNP * node obviously has a daemon on it (us!) 
*/ - if (0 == pmix_list_get_size(allocated_nodes)) { - /* the list is empty - if the HNP is allocated, then add it */ - if (prte_hnp_is_allocated) { - nd = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, 0); - if (!PRTE_FLAG_TEST(nd, PRTE_NODE_NON_USABLE)) { - PMIX_RETAIN(nd); - pmix_list_append(allocated_nodes, &nd->super); - } else { - nd = NULL; - } - } else { - nd = NULL; - } - } else { - nd = (prte_node_t *) pmix_list_get_last(allocated_nodes); - } - for (i = 1; i < prte_node_pool->size; i++) { - if (NULL != (node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, i))) { + nd = (prte_node_t *) pmix_list_get_last(allocated_nodes); + for (i = 0; i < jdata->session->nodes->size; i++) { + if (NULL != (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->session->nodes, i))) { + + /* ignore nodes that are non-usable */ if (PRTE_FLAG_TEST(node, PRTE_NODE_NON_USABLE)) { continue; @@ -312,6 +299,11 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, "NODE %s HAS NO DAEMON", node->name)); continue; } + if(0 == node->index && !prte_hnp_is_allocated){ + PMIX_OUTPUT_VERBOSE((10, prte_rmaps_base_framework.framework_output, + "HNP NODE %s IS NOT ALLOCATED", node->name)); + continue; + } /* retain a copy for our use in case the item gets * destructed along the way */ diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 1551f8221b..72fea602f2 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -503,6 +503,7 @@ static void lkcbfunc(pmix_status_t status, void *cbdata) static void check_complete(int fd, short args, void *cbdata) { prte_state_caddy_t *caddy = (prte_state_caddy_t *) cbdata; + prte_session_t *session; prte_job_t *jdata, *jptr; prte_proc_t *proc; int i, rc; @@ -720,6 +721,17 @@ static void check_complete(int fd, short args, void *cbdata) * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ 
+ session = jdata->session; + if(NULL != session){ + for(i = 0; i < session->jobs->size; i++){ + if(NULL != (jptr = pmix_pointer_array_get_item(session->jobs, i))){ + if(PMIX_CHECK_NSPACE(jdata->nspace, jptr->nspace)){ + pmix_pointer_array_set_item(session->jobs, i, NULL); + break; + } + } + } + } if (NULL != jdata->map) { map = jdata->map; takeall = false; diff --git a/src/prted/pmix/Makefile.am b/src/prted/pmix/Makefile.am index f8f7cb85f5..b3edd789f1 100644 --- a/src/prted/pmix/Makefile.am +++ b/src/prted/pmix/Makefile.am @@ -1,7 +1,7 @@ # # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. # Copyright (c) 2014-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,4 +21,5 @@ libprrte_la_SOURCES += \ prted/pmix/pmix_server_pub.c \ prted/pmix/pmix_server_gen.c \ prted/pmix/pmix_server_queries.c \ + prted/pmix/pmix_server_allocate.c \ prted/pmix/pmix_server_session.c diff --git a/src/prted/pmix/pmix_server.c b/src/prted/pmix/pmix_server.c index f5450a30d9..23513a81e3 100644 --- a/src/prted/pmix/pmix_server.c +++ b/src/prted/pmix/pmix_server.c @@ -65,6 +65,7 @@ #include "src/mca/errmgr/errmgr.h" #include "src/mca/grpcomm/grpcomm.h" +#include "src/mca/ras/base/ras_private.h" #include "src/rml/rml_contact.h" #include "src/rml/rml.h" #include "src/runtime/prte_data_server.h" @@ -74,6 +75,7 @@ #include "src/util/proc_info.h" #include "src/util/session_dir.h" #include "src/util/pmix_show_help.h" +#include "src/util/dash_host/dash_host.h" #include "src/prted/pmix/pmix_server.h" #include "src/prted/pmix/pmix_server_internal.h" @@ -118,7 +120,7 @@ static pmix_server_module_t pmix_server = { .push_stdin = pmix_server_stdin_fn, .group = pmix_server_group_fn, .allocate = pmix_server_alloc_fn, -#if PMIX_NUMERIC_VERSION >= 0x00050000 +#ifdef PMIX_SESSION_INSTANTIATE .session_control = 
pmix_server_session_ctrl_fn #endif }; @@ -927,6 +929,10 @@ void pmix_server_start(void) PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_JOBID_RESP, PRTE_RML_PERSISTENT, pmix_server_jobid_return, NULL); + /* setup recv for alloc request response */ + PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_SCHED_RESP, + PRTE_RML_PERSISTENT, pmix_server_alloc_request_resp, NULL); + if (PRTE_PROC_IS_MASTER) { /* setup recv for logging requests */ PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_LOGGING, @@ -953,6 +959,7 @@ void pmix_server_finalize(void) PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_LAUNCH_RESP); PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_DATA_CLIENT); PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_NOTIFICATION); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_SCHED_RESP); if (PRTE_PROC_IS_MASTER) { PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_LOGGING); PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_SCHED); @@ -1738,6 +1745,64 @@ static void pmix_server_log(int status, pmix_proc_t *sender, } } +/* alloc callback to send the results to the requesting daemon */ +void send_alloc_resp(pmix_status_t status, + pmix_info_t info[], size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + pmix_data_buffer_t *buf; + pmix_status_t rc; + + /* pack the status */ + PMIX_DATA_BUFFER_CREATE(buf); + if (PMIX_SUCCESS != (rc = PMIx_Data_pack(NULL, buf, &status, 1, PMIX_STATUS))) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return; + } + + /* pack the remote daemon's request index */ + if (PMIX_SUCCESS != (rc = PMIx_Data_pack(NULL, buf, &req->remote_index, 1, PMIX_INT))) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return; + } + + /* pack any provided info */ + if (PMIX_SUCCESS != (rc = PMIx_Data_pack(NULL, buf, &ninfo, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return; + } + + if (0 < ninfo) { + if (PMIX_SUCCESS 
!= (rc = PMIx_Data_pack(NULL, buf, info, ninfo, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return; + } + } + + /* send the response */ + PRTE_RML_SEND(rc, req->proxy.rank, buf, PRTE_RML_TAG_SCHED_RESP); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + } + + /* Call the provided callback function */ + if (NULL != release_fn) { + release_fn(release_cbdata); + } + + /* Release the server op request data */ + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + PMIX_RELEASE(req); +} + static void pmix_server_sched(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer, prte_rml_tag_t tg, void *cbdata) @@ -1745,14 +1810,17 @@ static void pmix_server_sched(int status, pmix_proc_t *sender, pmix_status_t rc; uint8_t cmd; int32_t cnt; - size_t ninfo; + size_t ninfo, need_id, n; pmix_alloc_directive_t allocdir; uint32_t sessionID; pmix_info_t *info = NULL; pmix_proc_t source; pmix_server_req_t *req; + prte_session_t *session; + prte_job_t *client_job; + char alloc_id[64]; int refid; - PRTE_HIDE_UNUSED_PARAMS(status, sender, tg, cbdata); + PRTE_HIDE_UNUSED_PARAMS(status, tg, cbdata); /* unpack the command */ cnt = 1; @@ -1780,7 +1848,7 @@ static void pmix_server_sched(int status, pmix_proc_t *sender, goto reply; } - if (0 == cmd) { + if (PRTE_PMIX_ALLOC_REQ == cmd) { /* allocation request - unpack the directive */ cnt = 1; rc = PMIx_Data_unpack(NULL, buffer, &allocdir, &cnt, PMIX_ALLOC_DIRECTIVE); @@ -1805,8 +1873,10 @@ static void pmix_server_sched(int status, pmix_proc_t *sender, PMIX_ERROR_LOG(rc); goto reply; } +#ifdef PMIX_REQUESTOR + // need to add the requestor's ID to the info array, so expand it if (0 < ninfo) { - PMIX_INFO_CREATE(info, ninfo); + PMIX_INFO_CREATE(info, ninfo+1); cnt = ninfo; rc = PMIx_Data_unpack(NULL, buffer, info, &cnt, PMIX_INFO); if (PMIX_SUCCESS != rc) { @@ -1814,55 +1884,64 @@ static void pmix_server_sched(int status, pmix_proc_t 
*sender, PMIX_INFO_FREE(info, ninfo); goto reply; } + } else { + ninfo = 1; + PMIX_INFO_CREATE(info, 1); } + PMIX_INFO_LOAD(&info[ninfo], PMIX_REQUESTOR, &source, PMIX_PROC); +#else + if (0 < ninfo) { + PMIX_INFO_CREATE(info, ninfo); + cnt = ninfo; + rc = PMIx_Data_unpack(NULL, buffer, info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(info, ninfo); + goto reply; + } + } +#endif - if (!prte_pmix_server_globals.scheduler_connected) { + /* we are the DVM master, so handle this ourselves - start + * by ensuring the scheduler is connected to us */ + rc = prte_pmix_set_scheduler(); + if (PMIX_SUCCESS != rc) { /* the scheduler has not attached to us - there is * nothing we can do */ rc = PMIX_ERR_NOT_SUPPORTED; - if (NULL != info) { - PMIX_INFO_FREE(info, ninfo); - } + PMIX_INFO_FREE(info, ninfo); goto reply; } - /* if we have not yet set the scheduler as our server, do so */ - if (!prte_pmix_server_globals.scheduler_set_as_server) { - rc = PMIx_tool_set_server(&prte_pmix_server_globals.scheduler, NULL, 0); - if (PMIX_SUCCESS != rc) { - if (NULL != info) { - PMIX_INFO_FREE(info, ninfo); - } - goto reply; - } - prte_pmix_server_globals.scheduler_set_as_server = true; - } - /* track the request */ req = PMIX_NEW(pmix_server_req_t); - - if (0 == cmd) { - rc = PMIx_Allocation_request_nb(allocdir, info, ninfo, - req->infocbfunc, req); + req->remote_index = refid; + req->copy = true; + req->info = info; + req->ninfo = ninfo; + PMIX_PROC_LOAD(&req->proxy, sender->nspace, sender->rank); + PMIX_PROC_LOAD(&req->tproc, source.nspace, source.rank); + if (PRTE_PMIX_ALLOC_REQ == cmd) { + pmix_asprintf(&req->operation, "ALLOCATE: %u", allocdir); + rc = PMIx_Allocation_request_nb(allocdir, req->info, req->ninfo, + send_alloc_resp, req); } else { + pmix_asprintf(&req->operation, "SESSIONCTRL: %u", sessionID); #if PMIX_NUMERIC_VERSION < 0x00050000 rc = PMIX_ERR_NOT_SUPPORTED; #else - rc = PMIx_Session_control(sessionID, info, ninfo, - 
req->infocbfunc, req); + rc = PMIx_Session_control(sessionID, req->info, req->ninfo, + send_alloc_resp, req); #endif } if (PMIX_SUCCESS != rc) { - if (NULL != info) { - PMIX_INFO_FREE(info, ninfo); - } goto reply; } return; reply: /* send an error response */ - + send_alloc_resp(rc, NULL, 0, req, NULL, NULL); return; } @@ -1903,8 +1982,6 @@ static void opcon(prte_pmix_server_op_caddy_t *p) p->apps = NULL; p->napps = 0; p->cbfunc = NULL; - p->allocdir = 0; - p->sessionID = UINT32_MAX; p->infocbfunc = NULL; p->toolcbfunc = NULL; p->spcbfunc = NULL; @@ -1927,11 +2004,14 @@ static void rqcon(pmix_server_req_t *p) p->flag = true; p->launcher = false; p->scheduler = false; + p->copy = false; p->local_index = -1; p->remote_index = -1; p->uid = 0; p->gid = 0; p->pid = 0; + p->allocdir = 0; + p->sessionID = UINT32_MAX; p->info = NULL; p->ninfo = 0; p->data = NULL; @@ -1950,12 +2030,16 @@ static void rqcon(pmix_server_req_t *p) p->toolcbfunc = NULL; p->infocbfunc = NULL; p->cbdata = NULL; + p->rlcbdata = NULL; } static void rqdes(pmix_server_req_t *p) { if (NULL != p->operation) { free(p->operation); } + if (NULL != p->info && p->copy) { + PMIX_INFO_FREE(p->info, p->ninfo); + } if (NULL != p->cmdline) { free(p->cmdline); } diff --git a/src/prted/pmix/pmix_server_allocate.c b/src/prted/pmix/pmix_server_allocate.c new file mode 100644 index 0000000000..9447eaad8d --- /dev/null +++ b/src/prted/pmix/pmix_server_allocate.c @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2022-2024 Nanook Consulting All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" + +#include "src/pmix/pmix-internal.h" +#include "src/prted/pmix/pmix_server_internal.h" +#include "src/rml/rml.h" +#include "src/util/dash_host/dash_host.h" +#include "src/mca/ras/base/ras_private.h" + +static void localrelease(void *cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + PMIX_RELEASE(req); +} + +void pmix_server_alloc_request_resp(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tg, + void *cbdata) +{ + + int req_index, cnt; + pmix_status_t ret, rc; + pmix_server_req_t *req; + + PRTE_HIDE_UNUSED_PARAMS(status, sender, tg, cbdata); + + /* unpack the status - this is already a PMIx value */ + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &ret, &cnt, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + ret = prte_pmix_convert_rc(rc); + } + + /* we let the above errors fall thru in the vain hope that the req number can + * be successfully unpacked, thus allowing us to respond to the requestor */ + + /* unpack our tracking room number */ + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &req_index, &cnt, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + /* we are hosed */ + return; + } + + req = pmix_pointer_array_get_item(&prte_pmix_server_globals.local_reqs, req_index); + + /* Report the error */ + if (ret != PMIX_SUCCESS) { + goto ANSWER; + } + + rc = PMIx_Data_unpack(NULL, buffer, &req->ninfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + ret = prte_pmix_convert_rc(rc); + goto ANSWER; + } + + if (0 < req->ninfo) { + PMIX_INFO_CREATE(req->info, req->ninfo); + + cnt = req->ninfo; + rc = PMIx_Data_unpack(NULL, buffer, req->info, &cnt, PMIX_INFO); + + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + ret = prte_pmix_convert_rc(rc); + req->ninfo = 0; + goto 
ANSWER; + } + } + +ANSWER: + if (NULL != req->infocbfunc) { + // pass the response back to the requestor + req->infocbfunc(ret, req->info, req->ninfo, req, localrelease, req); + } else { + PMIX_RELEASE(req); + } +} + +pmix_status_t prte_pmix_set_scheduler(void) +{ + pmix_status_t rc; + pmix_info_t info[2]; + + if (!prte_pmix_server_globals.scheduler_connected) { + /* the scheduler has not attached to us - see if we + * can attach to it, make it optional so we don't + * hang if there is no scheduler available */ + PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_TO_SCHEDULER, NULL, PMIX_BOOL); + PMIX_INFO_LOAD(&info[1], PMIX_TOOL_CONNECT_OPTIONAL, NULL, PMIX_BOOL); + rc = PMIx_tool_attach_to_server(NULL, &prte_pmix_server_globals.scheduler, + info, 2); + PMIX_INFO_DESTRUCT(&info[0]); + PMIX_INFO_DESTRUCT(&info[1]); + if (PMIX_SUCCESS != rc) { + return rc; + } + prte_pmix_server_globals.scheduler_set_as_server = true; + } + + /* if we have not yet set the scheduler as our server, do so */ + if (!prte_pmix_server_globals.scheduler_set_as_server) { + rc = PMIx_tool_set_server(&prte_pmix_server_globals.scheduler, NULL, 0); + if (PMIX_SUCCESS != rc) { + return rc; + } + prte_pmix_server_globals.scheduler_set_as_server = true; + } + + return PMIX_SUCCESS; +} + +pmix_status_t prte_server_send_request(uint8_t cmd, pmix_server_req_t *req) +{ + pmix_data_buffer_t *buf; + pmix_status_t rc; + + PMIX_DATA_BUFFER_CREATE(buf); + + /* construct a request message for the command */ + rc = PMIx_Data_pack(NULL, buf, &cmd, 1, PMIX_UINT8); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + + /* pack the local reference ID */ + rc = PMIx_Data_pack(NULL, buf, &req->local_index, 1, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + + /* pack the requestor */ + rc = PMIx_Data_pack(NULL, buf, &req->tproc, 1, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + 
PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + + if (PRTE_PMIX_ALLOC_REQ == cmd) { + /* pack the allocation directive */ + rc = PMIx_Data_pack(NULL, buf, &req->allocdir, 1, PMIX_ALLOC_DIRECTIVE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + } else { + /* pack the sessionID */ + rc = PMIx_Data_pack(NULL, buf, &req->sessionID, 1, PMIX_UINT32); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + } + + /* pack the number of info */ + rc = PMIx_Data_pack(NULL, buf, &req->ninfo, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + if (0 < req->ninfo) { + /* pack the info */ + rc = PMIx_Data_pack(NULL, buf, req->info, req->ninfo, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + } + + /* send this request to the DVM controller */ + PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, buf, PRTE_RML_TAG_SCHED); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + return PMIX_SUCCESS; +} + +/* Callbacks to process an allocate request answer from the scheduler + * and pass on any results to the requesting client + */ +static void passthru(int sd, short args, void *cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + + if (NULL != req->infocbfunc) { + // call the requestor's callback with the returned info + req->infocbfunc(req->status, req->info, req->ninfo, req->cbdata, req->rlcbfunc, req->rlcbdata); + } else { + // let them cleanup + req->rlcbfunc(req->rlcbdata); + } + // cleanup our request + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + PMIX_RELEASE(req); +} + +static void infocbfunc(pmix_status_t status, + pmix_info_t *info, size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t rel, void *relcbdata) +{ + pmix_server_req_t *req = 
(pmix_server_req_t*)cbdata; + // need to pass this into our progress thread for processing + // since we touch the global request array + req->status = status; + if (req->copy && NULL != req->info) { + PMIX_INFO_FREE(req->info, req->ninfo); + req->copy = false; + } + req->info = info; + req->ninfo = ninfo; + req->rlcbfunc = rel; + req->rlcbdata = relcbdata; + + prte_event_set(prte_event_base, &req->ev, -1, PRTE_EV_WRITE, passthru, req); + PMIX_POST_OBJECT(req); + prte_event_active(&req->ev, PRTE_EV_WRITE, 1); +} + +static void pass_request(int sd, short args, void *cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + pmix_status_t rc; + size_t n; + pmix_info_t *xfer; + + /* add this request to our local request tracker array */ + req->local_index = pmix_pointer_array_add(&prte_pmix_server_globals.local_reqs, req); + + if (!PRTE_PROC_IS_MASTER) { + /* if we are not the DVM master, then we have to send + * this request to the master for processing */ + rc = prte_server_send_request(PRTE_PMIX_ALLOC_REQ, req); + if (PRTE_SUCCESS != rc) { + goto callback; + } + return; + } + + /* if we are the DVM master, then handle this ourselves - start + * by ensuring the scheduler is connected to us */ + rc = prte_pmix_set_scheduler(); + if (PMIX_SUCCESS != rc) { + goto callback; + } + +#ifdef PMIX_REQUESTOR + // we need to pass the request on to the scheduler + // need to add the requestor's ID to the info array + PMIX_INFO_CREATE(xfer, req->ninfo + 1); + for (n=0; n < req->ninfo; n++) { + PMIX_INFO_XFER(&xfer[n], &req->info[n]); + } + PMIX_INFO_LOAD(&xfer[req->ninfo], PMIX_REQUESTOR, &req->tproc, PMIX_PROC); + // the current req object points to the caller's info array, so leave it alone + req->copy = true; + req->info = xfer; + req->ninfo++; +#endif + /* pass the request to the scheduler */ + rc = PMIx_Allocation_request_nb(req->allocdir, req->info, req->ninfo, + infocbfunc, req); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto callback; + } + return; + 
+callback: + /* this section gets executed solely upon an error */ + if (NULL != req->infocbfunc) { + req->infocbfunc(rc, req->info, req->ninfo, req->cbdata, localrelease, req); + return; + } + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + PMIX_RELEASE(req); +} + +/* this is the upcall from the PMIx server for the allocation + * request support. Since we are going to touch global structures + * (e.g., the session tracker pointer array), we have to threadshift + * this request into our own internal progress thread. Note that the + * allocation request could have come to this host from the + * scheduler, or a tool, or even an application process. */ +pmix_status_t pmix_server_alloc_fn(const pmix_proc_t *client, + pmix_alloc_directive_t directive, + const pmix_info_t data[], size_t ndata, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + pmix_server_req_t *req; + + + pmix_output_verbose(2, prte_pmix_server_globals.output, + "%s allocate upcalled on behalf of proc %s:%u with %" PRIsize_t " infos", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), client->nspace, client->rank, ndata); + + /* create a request tracker for this operation */ + req = PMIX_NEW(pmix_server_req_t); + pmix_asprintf(&req->operation, "ALLOCATE: %u", directive); + PMIX_PROC_LOAD(&req->tproc, client->nspace, client->rank); + req->allocdir = directive; + req->info = (pmix_info_t *) data; + req->ninfo = ndata; + req->infocbfunc = cbfunc; + req->cbdata = cbdata; + + prte_event_set(prte_event_base, &req->ev, -1, PRTE_EV_WRITE, pass_request, req); + PMIX_POST_OBJECT(req); + prte_event_active(&req->ev, PRTE_EV_WRITE, 1); + return PRTE_SUCCESS; +} diff --git a/src/prted/pmix/pmix_server_dyn.c b/src/prted/pmix/pmix_server_dyn.c index 3720158df6..295ad74cd5 100644 --- a/src/prted/pmix/pmix_server_dyn.c +++ b/src/prted/pmix/pmix_server_dyn.c @@ -194,188 +194,24 @@ static void spawn(int sd, short args, void *cbdata) PMIX_RELEASE(req); } -static void interim(int sd, short args, 
void *cbdata) +int prte_pmix_xfer_job_info(prte_job_t *jdata, pmix_info_t *inarray) { - prte_pmix_server_op_caddy_t *cd = (prte_pmix_server_op_caddy_t *) cbdata; - pmix_proc_t *requestor = &cd->proc; - pmix_envar_t envar; - prte_job_t *jdata, *djob; - prte_app_context_t *app; - pmix_app_t *papp; - pmix_info_t *info; - int rc, i; - char cwd[PRTE_PATH_MAX]; + pmix_info_t *iptr, *info; + size_t ninfo, n; + int i, rc; bool flag; - size_t m, n; + uint32_t u32; uint16_t u16; - pmix_rank_t rank; - prte_rmaps_options_t options; - prte_schizo_base_module_t *schizo; - PRTE_HIDE_UNUSED_PARAMS(sd, args); + prte_job_t *djob; + prte_app_context_t *app; + pmix_envar_t envar; - pmix_output_verbose(2, prte_pmix_server_globals.output, - "%s spawn called from proc %s with %d apps", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(requestor), - (int) cd->napps); + iptr = (pmix_info_t*)inarray->value.data.darray->array; + ninfo = inarray->value.data.darray->size; - /* create the job object */ - jdata = PMIX_NEW(prte_job_t); - jdata->map = PMIX_NEW(prte_job_map_t); - /* default to the requestor as the originator */ - PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank); - /* find the personality being passed - we need this info to direct - * option parsing */ - for (n=0; n < cd->ninfo; n++) { - if (PMIX_CHECK_KEY(&cd->info[n], PMIX_PERSONALITY)) { - jdata->personality = PMIX_ARGV_SPLIT_COMPAT(cd->info[n].value.data.string, ','); - jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(cd->info[n].value.data.string); - pmix_server_cache_job_info(jdata, &cd->info[n]); - break; - } - } - if (NULL == jdata->personality) { - /* use the default */ - jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(NULL); - } + for (n = 0; n < ninfo; n++) { + info = &iptr[n]; - /* transfer the apps across */ - for (n = 0; n < cd->napps; n++) { - papp = &cd->apps[n]; - app = PMIX_NEW(prte_app_context_t); - app->job = (struct 
prte_job_t*)jdata; - app->idx = pmix_pointer_array_add(jdata->apps, app); - jdata->num_apps++; - if (NULL != papp->cmd) { - app->app = strdup(papp->cmd); - } else if (NULL == papp->argv || NULL == papp->argv[0]) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - PMIX_RELEASE(jdata); - rc = PRTE_ERR_BAD_PARAM; - goto complete; - } else { - app->app = strdup(papp->argv[0]); - } - if (NULL != papp->argv) { - app->argv = PMIX_ARGV_COPY_COMPAT(papp->argv); - } - if (NULL != papp->env) { - app->env = PMIX_ARGV_COPY_COMPAT(papp->env); - } - if (NULL != papp->cwd) { - app->cwd = strdup(papp->cwd); - } - app->num_procs = papp->maxprocs; - if (NULL != papp->info) { - for (m = 0; m < papp->ninfo; m++) { - info = &papp->info[m]; - if (PMIX_CHECK_KEY(info, PMIX_HOST)) { - prte_set_attribute(&app->attributes, PRTE_APP_DASH_HOST, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_HOSTFILE)) { - prte_set_attribute(&app->attributes, PRTE_APP_HOSTFILE, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_ADD_HOSTFILE)) { - prte_set_attribute(&app->attributes, PRTE_APP_ADD_HOSTFILE, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_ADD_HOST)) { - prte_set_attribute(&app->attributes, PRTE_APP_ADD_HOST, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_PREFIX)) { - prte_set_attribute(&app->attributes, PRTE_APP_PREFIX_DIR, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_WDIR)) { - /* if this is a relative path, convert it to an absolute path */ - if (pmix_path_is_absolute(info->value.data.string)) { - app->cwd = strdup(info->value.data.string); - } else { - /* get the cwd */ - if (PRTE_SUCCESS != (rc = pmix_getcwd(cwd, sizeof(cwd)))) { - pmix_show_help("help-prted.txt", "cwd", true, "spawn", rc); - PMIX_RELEASE(jdata); - goto complete; - } - /* construct the 
absolute path */ - app->cwd = pmix_os_path(false, cwd, info->value.data.string, NULL); - } - } else if (PMIX_CHECK_KEY(info, PMIX_WDIR_USER_SPECIFIED)) { - flag = PMIX_INFO_TRUE(info); - prte_set_attribute(&app->attributes, PRTE_APP_USER_CWD, PRTE_ATTR_GLOBAL, - &flag, PMIX_BOOL); - } else if (PMIX_CHECK_KEY(info, PMIX_SET_SESSION_CWD)) { - flag = PMIX_INFO_TRUE(info); - prte_set_attribute(&app->attributes, PRTE_APP_SSNDIR_CWD, PRTE_ATTR_GLOBAL, - &flag, PMIX_BOOL); - } else if (PMIX_CHECK_KEY(info, PMIX_PRELOAD_FILES)) { - prte_set_attribute(&app->attributes, PRTE_APP_PRELOAD_FILES, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - - } else if (PMIX_CHECK_KEY(info, PMIX_PRELOAD_BIN)) { - prte_set_attribute(&app->attributes, PRTE_APP_PRELOAD_BIN, PRTE_ATTR_GLOBAL, - NULL, PMIX_BOOL); - /*** ENVIRONMENTAL VARIABLE DIRECTIVES ***/ - /* there can be multiple of these, so we add them to the attribute list */ - } else if (PMIX_CHECK_KEY(info, PMIX_SET_ENVAR)) { - envar.envar = info->value.data.envar.envar; - envar.value = info->value.data.envar.value; - envar.separator = info->value.data.envar.separator; - prte_prepend_attribute(&app->attributes, PRTE_APP_SET_ENVAR, - PRTE_ATTR_GLOBAL, - &envar, PMIX_ENVAR); - } else if (PMIX_CHECK_KEY(info, PMIX_ADD_ENVAR)) { - envar.envar = info->value.data.envar.envar; - envar.value = info->value.data.envar.value; - envar.separator = info->value.data.envar.separator; - prte_prepend_attribute(&app->attributes, PRTE_APP_ADD_ENVAR, - PRTE_ATTR_GLOBAL, - &envar, PMIX_ENVAR); - } else if (PMIX_CHECK_KEY(info, PMIX_UNSET_ENVAR)) { - prte_prepend_attribute(&app->attributes, PRTE_APP_UNSET_ENVAR, - PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else if (PMIX_CHECK_KEY(info, PMIX_PREPEND_ENVAR)) { - envar.envar = info->value.data.envar.envar; - envar.value = info->value.data.envar.value; - envar.separator = info->value.data.envar.separator; - prte_prepend_attribute(&app->attributes, PRTE_APP_PREPEND_ENVAR, - 
PRTE_ATTR_GLOBAL, - &envar, PMIX_ENVAR); - } else if (PMIX_CHECK_KEY(info, PMIX_APPEND_ENVAR)) { - envar.envar = info->value.data.envar.envar; - envar.value = info->value.data.envar.value; - envar.separator = info->value.data.envar.separator; - prte_prepend_attribute(&app->attributes, PRTE_APP_APPEND_ENVAR, - PRTE_ATTR_GLOBAL, - &envar, PMIX_ENVAR); - - } else if (PMIX_CHECK_KEY(info, PMIX_PSET_NAME)) { - prte_set_attribute(&app->attributes, PRTE_APP_PSET_NAME, PRTE_ATTR_GLOBAL, - info->value.data.string, PMIX_STRING); - } else { - /* unrecognized key */ - if (9 < pmix_output_get_verbosity(prte_pmix_server_globals.output)) { - pmix_show_help("help-prted.txt", "bad-key", true, "spawn", "application", - info->key); - } - } - } - } - } - /* initiate the default runtime options - had to delay this until - * after we parsed the apps as some runtime options are for - * the apps themselves */ - memset(&options, 0, sizeof(prte_rmaps_options_t)); - options.stream = prte_rmaps_base_framework.framework_output; - options.verbosity = 5; // usual value for base-level functions - schizo = (prte_schizo_base_module_t*)jdata->schizo; - rc = schizo->set_default_rto(jdata, &options); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - goto complete; - } - - /* transfer the job info across */ - for (m = 0; m < cd->ninfo; m++) { - info = &cd->info[m]; /*** REQUESTED MAPPER ***/ if (PMIX_CHECK_KEY(info, PMIX_MAPPER)) { jdata->map->req_mapper = strdup(info->value.data.string); @@ -386,6 +222,21 @@ static void interim(int sd, short args, void *cbdata) prte_set_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_ALLOC, PRTE_ATTR_GLOBAL, &flag, PMIX_BOOL); + /*** ALLOC/SESSION IDs ***/ + } else if (PMIX_CHECK_KEY(info, PMIX_SESSION_ID)) { + PMIX_VALUE_GET_NUMBER(rc, &info->value, u32, uint32_t); + if (PMIX_SUCCESS != rc) { + return PRTE_ERR_BAD_PARAM; + } + prte_set_attribute(&jdata->attributes, PRTE_JOB_SESSION_ID, + PRTE_ATTR_GLOBAL, &u32, PMIX_UINT32); + } else if (PMIX_CHECK_KEY(info, 
PMIX_ALLOC_ID)) { + prte_set_attribute(&jdata->attributes, PRTE_JOB_ALLOC_ID, + PRTE_ATTR_GLOBAL, info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_ALLOC_REQ_ID)) { + prte_set_attribute(&jdata->attributes, PRTE_JOB_REF_ID, + PRTE_ATTR_GLOBAL, info->value.data.string, PMIX_STRING); + /*** DISPLAY MAP ***/ } else if (PMIX_CHECK_KEY(info, PMIX_DISPLAY_MAP)) { flag = PMIX_INFO_TRUE(info); @@ -431,8 +282,7 @@ static void interim(int sd, short args, void *cbdata) pmix_show_help("help-prte-rmaps-base.txt", "redefining-policy", true, "mapping", info->value.data.string, prte_rmaps_base_print_mapping(prte_rmaps_base.mapping)); - rc = PRTE_ERR_BAD_PARAM; - goto complete; + return PRTE_ERR_BAD_PARAM; } PRTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, PRTE_MAPPING_PPR); prte_set_attribute(&jdata->attributes, PRTE_JOB_PPR, PRTE_ATTR_GLOBAL, @@ -442,7 +292,7 @@ static void interim(int sd, short args, void *cbdata) } else if (PMIX_CHECK_KEY(info, PMIX_MAPBY)) { rc = prte_rmaps_base_set_mapping_policy(jdata, info->value.data.string); if (PRTE_SUCCESS != rc) { - goto complete; + return rc; } /*** colocation directives ***/ @@ -465,21 +315,21 @@ static void interim(int sd, short args, void *cbdata) } else if (PMIX_CHECK_KEY(info, PMIX_RANKBY)) { rc = prte_rmaps_base_set_ranking_policy(jdata, info->value.data.string); if (PRTE_SUCCESS != rc) { - goto complete; + return rc; } /*** BIND-TO ***/ } else if (PMIX_CHECK_KEY(info, PMIX_BINDTO)) { rc = prte_hwloc_base_set_binding_policy(jdata, info->value.data.string); if (PRTE_SUCCESS != rc) { - goto complete; + return rc; } /*** RUNTIME OPTIONS - SHOULD ONLY APPEAR IF NOT PRE-PROCESSED BY SCHIZO ***/ } else if (PMIX_CHECK_KEY(info, PMIX_RUNTIME_OPTIONS)) { rc = prte_state_base_set_runtime_options(jdata, info->value.data.string); if (PRTE_SUCCESS != rc) { - goto complete; + return rc; } /*** ABORT_NON_ZERO ***/ @@ -756,8 +606,7 @@ static void interim(int sd, short args, void *cbdata) } else { 
PMIX_VALUE_GET_NUMBER(i, &info->value, rc, int); if (PMIX_SUCCESS != i) { - rc = i; - goto complete; + return PRTE_ERR_BAD_PARAM; } } prte_set_attribute(&jdata->attributes, PRTE_SPAWN_TIMEOUT, @@ -773,8 +622,7 @@ static void interim(int sd, short args, void *cbdata) } else { PMIX_VALUE_GET_NUMBER(i, &info->value, rc, int); if (PMIX_SUCCESS != i) { - rc = i; - goto complete; + return PRTE_ERR_BAD_PARAM; } } prte_set_attribute(&jdata->attributes, PRTE_JOB_TIMEOUT, @@ -805,6 +653,210 @@ static void interim(int sd, short args, void *cbdata) pmix_server_cache_job_info(jdata, info); } } + return PRTE_SUCCESS; +} + +int prte_pmix_xfer_app(prte_job_t *jdata, pmix_app_t *papp) +{ + prte_app_context_t *app; + pmix_info_t *info; + size_t m; + int rc; + bool flag; + pmix_envar_t envar; + char cwd[PRTE_PATH_MAX]; + + app = PMIX_NEW(prte_app_context_t); + app->job = (struct prte_job_t*)jdata; + app->idx = pmix_pointer_array_add(jdata->apps, app); + jdata->num_apps++; + if (NULL != papp->cmd) { + app->app = strdup(papp->cmd); + } else if (NULL == papp->argv || NULL == papp->argv[0]) { + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + PMIX_RELEASE(jdata); + return PRTE_ERR_BAD_PARAM; + } else { + app->app = strdup(papp->argv[0]); + } + if (NULL != papp->argv) { + app->argv = PMIX_ARGV_COPY_COMPAT(papp->argv); + } + if (NULL != papp->env) { + app->env = PMIX_ARGV_COPY_COMPAT(papp->env); + } + if (NULL != papp->cwd) { + app->cwd = strdup(papp->cwd); + } + app->num_procs = papp->maxprocs; + + if (NULL != papp->info) { + for (m = 0; m < papp->ninfo; m++) { + info = &papp->info[m]; + if (PMIX_CHECK_KEY(info, PMIX_HOST)) { + prte_set_attribute(&app->attributes, PRTE_APP_DASH_HOST, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_HOSTFILE)) { + prte_set_attribute(&app->attributes, PRTE_APP_HOSTFILE, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_ADD_HOSTFILE)) { + 
prte_set_attribute(&app->attributes, PRTE_APP_ADD_HOSTFILE, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_ADD_HOST)) { + prte_set_attribute(&app->attributes, PRTE_APP_ADD_HOST, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_PREFIX)) { + prte_set_attribute(&app->attributes, PRTE_APP_PREFIX_DIR, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_WDIR)) { + /* if this is a relative path, convert it to an absolute path */ + if (pmix_path_is_absolute(info->value.data.string)) { + app->cwd = strdup(info->value.data.string); + } else { + /* get the cwd */ + if (PRTE_SUCCESS != (rc = pmix_getcwd(cwd, sizeof(cwd)))) { + pmix_show_help("help-prted.txt", "cwd", true, "spawn", rc); + PMIX_RELEASE(jdata); + return rc; + } + /* construct the absolute path */ + app->cwd = pmix_os_path(false, cwd, info->value.data.string, NULL); + } + } else if (PMIX_CHECK_KEY(info, PMIX_WDIR_USER_SPECIFIED)) { + flag = PMIX_INFO_TRUE(info); + prte_set_attribute(&app->attributes, PRTE_APP_USER_CWD, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); + } else if (PMIX_CHECK_KEY(info, PMIX_SET_SESSION_CWD)) { + flag = PMIX_INFO_TRUE(info); + prte_set_attribute(&app->attributes, PRTE_APP_SSNDIR_CWD, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); + } else if (PMIX_CHECK_KEY(info, PMIX_PRELOAD_FILES)) { + prte_set_attribute(&app->attributes, PRTE_APP_PRELOAD_FILES, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + + } else if (PMIX_CHECK_KEY(info, PMIX_PRELOAD_BIN)) { + prte_set_attribute(&app->attributes, PRTE_APP_PRELOAD_BIN, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); + /*** ENVIRONMENTAL VARIABLE DIRECTIVES ***/ + /* there can be multiple of these, so we add them to the attribute list */ + } else if (PMIX_CHECK_KEY(info, PMIX_SET_ENVAR)) { + envar.envar = info->value.data.envar.envar; + envar.value = info->value.data.envar.value; + envar.separator = 
info->value.data.envar.separator; + prte_prepend_attribute(&app->attributes, PRTE_APP_SET_ENVAR, + PRTE_ATTR_GLOBAL, + &envar, PMIX_ENVAR); + } else if (PMIX_CHECK_KEY(info, PMIX_ADD_ENVAR)) { + envar.envar = info->value.data.envar.envar; + envar.value = info->value.data.envar.value; + envar.separator = info->value.data.envar.separator; + prte_prepend_attribute(&app->attributes, PRTE_APP_ADD_ENVAR, + PRTE_ATTR_GLOBAL, + &envar, PMIX_ENVAR); + } else if (PMIX_CHECK_KEY(info, PMIX_UNSET_ENVAR)) { + prte_prepend_attribute(&app->attributes, PRTE_APP_UNSET_ENVAR, + PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else if (PMIX_CHECK_KEY(info, PMIX_PREPEND_ENVAR)) { + envar.envar = info->value.data.envar.envar; + envar.value = info->value.data.envar.value; + envar.separator = info->value.data.envar.separator; + prte_prepend_attribute(&app->attributes, PRTE_APP_PREPEND_ENVAR, + PRTE_ATTR_GLOBAL, + &envar, PMIX_ENVAR); + } else if (PMIX_CHECK_KEY(info, PMIX_APPEND_ENVAR)) { + envar.envar = info->value.data.envar.envar; + envar.value = info->value.data.envar.value; + envar.separator = info->value.data.envar.separator; + prte_prepend_attribute(&app->attributes, PRTE_APP_APPEND_ENVAR, + PRTE_ATTR_GLOBAL, + &envar, PMIX_ENVAR); + + } else if (PMIX_CHECK_KEY(info, PMIX_PSET_NAME)) { + prte_set_attribute(&app->attributes, PRTE_APP_PSET_NAME, PRTE_ATTR_GLOBAL, + info->value.data.string, PMIX_STRING); + } else { + /* unrecognized key */ + if (9 < pmix_output_get_verbosity(prte_pmix_server_globals.output)) { + pmix_show_help("help-prted.txt", "bad-key", true, "spawn", "application", + info->key); + } + } + } + } + return PRTE_SUCCESS; +} + +static void interim(int sd, short args, void *cbdata) +{ + prte_pmix_server_op_caddy_t *cd = (prte_pmix_server_op_caddy_t *) cbdata; + pmix_proc_t *requestor = &cd->proc; + prte_job_t *jdata, *djob; + prte_app_context_t *app; + pmix_app_t *papp; + pmix_info_t *info; + int rc, i; + char *endptr; + bool flag; + size_t m, n; + 
uint16_t u16; + uint32_t u32; + pmix_rank_t rank; + prte_rmaps_options_t options; + prte_schizo_base_module_t *schizo; + PRTE_HIDE_UNUSED_PARAMS(sd, args); + + pmix_output_verbose(2, prte_pmix_server_globals.output, + "%s spawn called from proc %s with %d apps", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(requestor), + (int) cd->napps); + + /* create the job object */ + jdata = PMIX_NEW(prte_job_t); + jdata->map = PMIX_NEW(prte_job_map_t); + /* default to the requestor as the originator */ + PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank); + /* find the personality being passed - we need this info to direct + * option parsing */ + for (n=0; n < cd->ninfo; n++) { + if (PMIX_CHECK_KEY(&cd->info[n], PMIX_PERSONALITY)) { + jdata->personality = PMIX_ARGV_SPLIT_COMPAT(cd->info[n].value.data.string, ','); + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(cd->info[n].value.data.string); + pmix_server_cache_job_info(jdata, &cd->info[n]); + break; + } + } + if (NULL == jdata->personality) { + /* use the default */ + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(NULL); + } + + /* transfer the apps across */ + for (n = 0; n < cd->napps; n++) { + rc = prte_pmix_xfer_app(jdata, &cd->apps[n]); + if (PRTE_SUCCESS != rc) { + goto complete; + } + } + + /* initiate the default runtime options - had to delay this until + * after we parsed the apps as some runtime options are for + * the apps themselves */ + memset(&options, 0, sizeof(prte_rmaps_options_t)); + options.stream = prte_rmaps_base_framework.framework_output; + options.verbosity = 5; // usual value for base-level functions + schizo = (prte_schizo_base_module_t*)jdata->schizo; + rc = schizo->set_default_rto(jdata, &options); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + goto complete; + } + + /* transfer the job info across */ + rc = prte_pmix_xfer_job_info(jdata, cd->info); + if (PRTE_SUCCESS != rc) { + goto complete; 
+ } /* set debugger flags on apps if needed */ if (PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_TOOL)) { diff --git a/src/prted/pmix/pmix_server_gen.c b/src/prted/pmix/pmix_server_gen.c index 42037f0163..cdbcee845d 100644 --- a/src/prted/pmix/pmix_server_gen.c +++ b/src/prted/pmix/pmix_server_gen.c @@ -19,7 +19,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -693,11 +693,11 @@ void pmix_tool_connected_fn(pmix_info_t *info, size_t ninfo, /* need to threadshift this request */ cd = PMIX_NEW(pmix_server_req_t); - cd->info = info; - cd->ninfo = ninfo; cd->toolcbfunc = cbfunc; cd->cbdata = cbdata; cd->target.rank = 0; // set default for tool + cd->info = info; + cd->ninfo = ninfo; prte_event_set(prte_event_base, &(cd->ev), -1, PRTE_EV_WRITE, _toolconn, cd); PMIX_POST_OBJECT(cd); diff --git a/src/prted/pmix/pmix_server_internal.h b/src/prted/pmix/pmix_server_internal.h index 4508c34b98..864c469915 100644 --- a/src/prted/pmix/pmix_server_internal.h +++ b/src/prted/pmix/pmix_server_internal.h @@ -75,9 +75,12 @@ typedef struct { bool flag; bool launcher; bool scheduler; + bool copy; // info array has been copied and must be released uid_t uid; gid_t gid; pid_t pid; + pmix_alloc_directive_t allocdir; + uint32_t sessionID; pmix_info_t *info; size_t ninfo; char *data; @@ -96,6 +99,7 @@ typedef struct { pmix_tool_connection_cbfunc_t toolcbfunc; pmix_info_cbfunc_t infocbfunc; void *cbdata; + void *rlcbdata; } pmix_server_req_t; PMIX_CLASS_DECLARATION(pmix_server_req_t); @@ -122,8 +126,6 @@ typedef struct { size_t napps; pmix_query_t *queries; size_t nqueries; - pmix_alloc_directive_t allocdir; - uint32_t sessionID; pmix_op_cbfunc_t cbfunc; pmix_info_cbfunc_t infocbfunc; 
pmix_tool_connection_cbfunc_t toolcbfunc; @@ -319,12 +321,30 @@ PRTE_EXPORT extern int prte_pmix_server_register_tool(pmix_nspace_t nspace); PRTE_EXPORT extern int pmix_server_cache_job_info(prte_job_t *jdata, pmix_info_t *info); -#if PMIX_NUMERIC_VERSION >= 0x00050000 +PRTE_EXPORT extern int prte_pmix_xfer_job_info(prte_job_t *jdata, pmix_info_t *info); + +PRTE_EXPORT extern int prte_pmix_xfer_app(prte_job_t *jdata, pmix_app_t *app); + +PRTE_EXPORT extern void pmix_server_alloc_request_resp(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, prte_rml_tag_t tg, + void *cbdata); + +PRTE_EXPORT extern pmix_status_t prte_pmix_set_scheduler(void); + +PRTE_EXPORT extern pmix_status_t prte_server_send_request(uint8_t cmd, pmix_server_req_t *req); + +#define PRTE_PMIX_ALLOC_REQ 0 +#define PRTE_PMIX_SESSION_CTRL 1 + + +#ifdef PMIX_SESSION_INSTANTIATE + PRTE_EXPORT extern pmix_status_t pmix_server_session_ctrl_fn(const pmix_proc_t *requestor, uint32_t sessionID, const pmix_info_t directives[], size_t ndirs, pmix_info_cbfunc_t cbfunc, void *cbdata); + #endif /* exposed shared variables */ diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index e46fffb60e..1f0b2195aa 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -88,7 +88,7 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) size_t nmsize; pmix_server_pset_t *pset; pmix_cpuset_t cpuset; - uint32_t ui32; + uint32_t ui32, *ui32_ptr; prte_job_t *parent = NULL; pmix_device_distance_t *distances; size_t ndist; @@ -125,14 +125,16 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) return rc; } - /* pass the session ID - just a 32-bit hash of our nspace for now */ - PRTE_HASH_STR(prte_process_info.myproc.nspace, ui32); - PMIX_INFO_LIST_ADD(ret, info, PMIX_SESSION_ID, &ui32, PMIX_UINT32); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PMIX_INFO_LIST_RELEASE(info); - rc = 
prte_pmix_convert_status(ret); - return rc; + /* pass the session ID */ + ui32_ptr = &ui32; + if(prte_get_attribute(&jdata->attributes, PRTE_JOB_SESSION_ID, (void **) &ui32_ptr, PMIX_UINT32)){ + PMIX_INFO_LIST_ADD(ret, info, PMIX_SESSION_ID, &ui32, PMIX_UINT32); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PMIX_INFO_LIST_RELEASE(info); + rc = prte_pmix_convert_status(ret); + return rc; + } } /* jobid */ diff --git a/src/prted/pmix/pmix_server_session.c b/src/prted/pmix/pmix_server_session.c index 67714f61ae..de2e1aa172 100644 --- a/src/prted/pmix/pmix_server_session.c +++ b/src/prted/pmix/pmix_server_session.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2022-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -12,242 +12,398 @@ #include "src/pmix/pmix-internal.h" #include "src/prted/pmix/pmix_server_internal.h" #include "src/rml/rml.h" +#include "src/util/dash_host/dash_host.h" +#include "src/mca/ras/base/ras_private.h" +#include "src/mca/rmaps/rmaps.h" +#include "src/mca/schizo/base/base.h" -static void localrelease(void *cbdata) -{ - pmix_server_req_t *req = (pmix_server_req_t*)cbdata; - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - PMIX_RELEASE(req); -} +#ifdef PMIX_SESSION_INSTANTIATE -static void infocbfunc(pmix_status_t status, - pmix_info_t *info, size_t ninfo, - void *cbdata, - pmix_release_cbfunc_t rel, void *relcbdata) +static void localrelease(void *cbdata) { pmix_server_req_t *req = (pmix_server_req_t*)cbdata; - if (NULL != req->infocbfunc) { - req->infocbfunc(status, info, ninfo, req->cbdata, localrelease, req); - if (NULL != rel) { - rel(relcbdata); - } - return; - } - /* need to cleanup ourselves */ - if (NULL != rel) { - rel(relcbdata); - } pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); PMIX_RELEASE(req); } -static void pass_request(int sd, short 
args, void *cbdata) +/* Process the session control directive from the scheduler */ +static int process_directive(pmix_server_req_t *req) { - prte_pmix_server_op_caddy_t *cd = (prte_pmix_server_op_caddy_t*)cbdata; - pmix_server_req_t *req; - pmix_data_buffer_t *buf; - uint8_t command; + char *user_refid = NULL, *alloc_refid = NULL; + pmix_info_t *personality = NULL; + pmix_proc_t *requestor = NULL; + char *hosts = NULL; + prte_session_t *session = NULL; + prte_job_t *jdata = NULL; + prte_app_context_t *app; + pmix_app_t *apps; + size_t napps; + size_t n, i; pmix_status_t rc; - pmix_info_t info[2]; + bool terminate = false; + bool pause = false; + bool resume = false; + bool preempt = false; + bool restore = false; + bool signal = false; + bool extend = false; + int sigvalue, tval; - /* create a request tracker for this operation */ - req = PMIX_NEW(pmix_server_req_t); - if (0 < cd->allocdir) { - pmix_asprintf(&req->operation, "ALLOCATE: %u", cd->allocdir); - command = 0; - } else { - pmix_asprintf(&req->operation, "SESSIONCTRL: %u", cd->sessionID); - command = 1; - } - req->infocbfunc = cd->infocbfunc; - req->cbdata = cd->cbdata; - /* add this request to our local request tracker array */ - req->local_index = pmix_pointer_array_add(&prte_pmix_server_globals.local_reqs, req); + /* cycle across the directives to see what we are being asked to do */ + for (n = 0; n < req->ninfo; n++) { + + if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_INSTANTIATE)) { + // check to see if we already have this session + session = prte_get_session_object(req->sessionID); + if (NULL != session) { + // this is an error - cannot instantiate a preexisting + // session + rc = PMIX_ERR_BAD_PARAM; + goto ANSWER; + } + /* create a new session object */ + session = PMIX_NEW(prte_session_t); + if (NULL == session) { + rc = PRTE_ERR_OUT_OF_RESOURCE; + PRTE_ERROR_LOG(rc); + goto ANSWER; + } + session->session_id = req->sessionID; + rc = prte_set_session_object(session); + if (PRTE_SUCCESS != rc) { 
+ PMIX_RELEASE(session); + goto ANSWER; + } + // if a job has already been described, add it here + if (NULL != jdata) { + pmix_pointer_array_add(session->jobs, jdata); + } + if (NULL != alloc_refid) { + session->alloc_refid = strdup(alloc_refid); + alloc_refid = NULL; + } + if (NULL != user_refid) { + session->user_refid = strdup(user_refid); + user_refid = NULL; + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_JOB)) { + if (NULL == jdata) { + jdata = PMIX_NEW(prte_job_t); + jdata->map = PMIX_NEW(prte_job_map_t); + /* default to the requestor as the originator */ + if (NULL != requestor) { + PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank); + } + } + // transfer the job description across + rc = prte_pmix_xfer_job_info(jdata, &req->info[n]); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + rc = prte_pmix_convert_rc(rc); + goto ANSWER; + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_APP)) { + // the apps that are to be started in the session + if (NULL == jdata) { + jdata = PMIX_NEW(prte_job_t); + jdata->map = PMIX_NEW(prte_job_map_t); + /* default to the requestor as the originator */ + if (NULL != requestor) { + PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank); + } + } + apps = (pmix_app_t*)req->info[n].value.data.darray->array; + napps = req->info[n].value.data.darray->size; + for (i=0; i < napps; i++) { + rc = prte_pmix_xfer_app(jdata, &apps[i]); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + rc = prte_pmix_convert_rc(rc); + goto ANSWER; + } + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_PERSONALITY)) { + personality = &req->info[n]; + if (NULL != jdata && NULL == jdata->personality) { + jdata->personality = PMIX_ARGV_SPLIT_COMPAT(personality->value.data.string, ','); + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(personality->value.data.string); + pmix_server_cache_job_info(jdata, personality); + } + + } else if 
(PMIX_CHECK_KEY(&req->info[n], PMIX_REQUESTOR)) { + requestor = req->info[n].value.data.proc; + if (NULL != jdata) { + PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank); + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION) || + PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION_NODES) || + PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION_IMAGE)) { + // we don't support these directives + rc = PMIX_ERR_NOT_SUPPORTED; + goto ANSWER; + + } else if(PMIX_CHECK_KEY(&req->info[n], PMIX_ALLOC_ID)) { + if (NULL != session) { + session->alloc_refid = strdup(req->info[n].value.data.string); + } else { + alloc_refid = req->info[n].value.data.string; + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_ALLOC_REQ_ID)) { + if (NULL != session) { + session->user_refid = strdup(req->info[n].value.data.string); + } else { + user_refid = req->info[n].value.data.string; + } + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_ALLOC_NODE_LIST)) { + hosts = req->info[n].value.data.string; + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_ALLOC_NUM_CPU_LIST)) { + // we don't support this directive + rc = PMIX_ERR_NOT_SUPPORTED; + goto ANSWER; + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PAUSE)) { + pause = PMIX_INFO_TRUE(&req->info[n]); + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_RESUME)) { + resume = PMIX_INFO_TRUE(&req->info[n]); + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_TERMINATE)) { + terminate = PMIX_INFO_TRUE(&req->info[n]); - /* if we are the DVM master, then handle this ourselves */ - if (PRTE_PROC_IS_MASTER) { - if (!prte_pmix_server_globals.scheduler_connected) { - /* the scheduler has not attached to us - see if we - * can attach to it, make it optional so we don't - * hang if there is no scheduler available */ - PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_TO_SCHEDULER, NULL, PMIX_BOOL); - PMIX_INFO_LOAD(&info[1], PMIX_TOOL_CONNECT_OPTIONAL, NULL, PMIX_BOOL); - rc = 
PMIx_tool_attach_to_server(NULL, &prte_pmix_server_globals.scheduler, - info, 2); - PMIX_INFO_DESTRUCT(&info[0]); - PMIX_INFO_DESTRUCT(&info[1]); + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PREEMPT)) { + preempt = PMIX_INFO_TRUE(&req->info[n]); + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_RESTORE)) { + restore = PMIX_INFO_TRUE(&req->info[n]); + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_SIGNAL)) { + signal = true; + PMIX_VALUE_GET_NUMBER(rc, &req->info[n].value, sigvalue, int); if (PMIX_SUCCESS != rc) { - goto callback; + return PRTE_ERR_BAD_PARAM; } - prte_pmix_server_globals.scheduler_set_as_server = true; - } - /* if we have not yet set the scheduler as our server, do so */ - if (!prte_pmix_server_globals.scheduler_set_as_server) { - rc = PMIx_tool_set_server(&prte_pmix_server_globals.scheduler, NULL, 0); + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_EXTEND)) { + extend = PMIX_INFO_TRUE(&req->info[n]); + + } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_TIMEOUT)) { + PMIX_VALUE_GET_NUMBER(rc, &req->info[n].value, tval, int); if (PMIX_SUCCESS != rc) { - goto callback; + return PRTE_ERR_BAD_PARAM; } - prte_pmix_server_globals.scheduler_set_as_server = true; } + } // for loop - if (0 == command) { - rc = PMIx_Allocation_request_nb(cd->allocdir, cd->info, cd->ninfo, - infocbfunc, req); - } else { -#if PMIX_NUMERIC_VERSION < 0x00050000 - rc = PMIX_ERR_NOT_SUPPORTED; -#else - rc = PMIx_Session_control(cd->sessionID, cd->info, cd->ninfo, - infocbfunc, req); -#endif + + if (NULL == session) { + // this operation is referring to a previously instantiated session + session = prte_get_session_object(req->sessionID); + if (NULL == session) { + // we don't know about it + return PRTE_ERR_NOT_FOUND; } - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto callback; + // TODO: execute the specified operation on this session + return PRTE_SUCCESS; + } + + // continue working on a new instantiation + if (NULL != jdata) { + /* find 
the personality being passed - we need this info to direct + * option parsing */ + if (NULL == personality) { + // do a quick search for it + for (i=0; i < req->ninfo; i++) { + if (PMIX_CHECK_KEY(&req->info[i], PMIX_PERSONALITY)) { + personality = &req->info[i]; + break; + } + } + } + if (NULL == personality) { + /* use the default */ + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(NULL); + } else { + jdata->personality = PMIX_ARGV_SPLIT_COMPAT(personality->value.data.string, ','); + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy(personality->value.data.string); + pmix_server_cache_job_info(jdata, personality); } - return; } - PMIX_DATA_BUFFER_CREATE(buf); + if (NULL != hosts) { + /* add the designation to the apps in the job, if one was provided. These + * will be added to the global pool when the job is setup for launch */ + if (NULL != jdata) { + for (n=0; n < jdata->num_apps; n++) { + app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, n); + // the add_host attribute will be removed after processing + prte_set_attribute(&app->attributes, PRTE_APP_ADD_HOST, PRTE_ATTR_GLOBAL, + hosts, PMIX_STRING); + /* also provide it as dash-host (if not already specified) so the + * job will use those nodes */ + if (!prte_get_attribute(&app->attributes, PRTE_APP_DASH_HOST, NULL, PMIX_STRING)) { + prte_set_attribute(&app->attributes, PRTE_APP_DASH_HOST, PRTE_ATTR_GLOBAL, + hosts, PMIX_STRING); + } + } + } + } + return PRTE_SUCCESS; - /* construct a request message for the command */ - rc = PMIx_Data_pack(NULL, buf, &command, 1, PMIX_UINT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - goto callback; +// only come here upon error +ANSWER: + if (NULL != req->infocbfunc) { + req->infocbfunc(rc, req->info, req->ninfo, req->cbdata, localrelease, req); + } else { + 
PMIX_RELEASE(req); } + return PRTE_SUCCESS; +} - /* pack the local requestor ID */ - rc = PMIx_Data_pack(NULL, buf, &req->local_index, 1, PMIX_INT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - goto callback; +/* Callbacks to process a session control answer from the scheduler + * and pass on any results to the requesting client + */ +static void passthru(int sd, short args, void *cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + + if (NULL != req->infocbfunc) { + // call the requestor's callback with the returned info + req->infocbfunc(req->status, req->info, req->ninfo, req->cbdata, req->rlcbfunc, req->rlcbdata); + } else { + // let them cleanup + req->rlcbfunc(req->rlcbdata); } + // cleanup our request + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + PMIX_RELEASE(req); +} - /* pack the requestor */ - rc = PMIx_Data_pack(NULL, buf, &cd->proc, 1, PMIX_PROC); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - goto callback; +static void infocbfunc(pmix_status_t status, + pmix_info_t *info, size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t rel, void *relcbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + + // need to pass this into our progress thread for processing + // since we touch the global request array + req->status = status; + if (req->copy && NULL != req->info) { + PMIX_INFO_FREE(req->info, req->ninfo); + req->copy = false; } + req->info = info; + req->ninfo = ninfo; + req->rlcbfunc = rel; + req->rlcbdata = relcbdata; - if (0 == command) { - /* pack the allocation directive */ - rc = PMIx_Data_pack(NULL, buf, &cd->allocdir, 1, PMIX_ALLOC_DIRECTIVE); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - 
PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - goto callback; - } - } else { - /* pack the sessionID */ - rc = PMIx_Data_pack(NULL, buf, &cd->sessionID, 1, PMIX_UINT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); + prte_event_set(prte_event_base, &req->ev, -1, PRTE_EV_WRITE, passthru, req); + PMIX_POST_OBJECT(req); + prte_event_active(&req->ev, PRTE_EV_WRITE, 1); +} + +static void pass_request(int sd, short args, void *cbdata) +{ + pmix_server_req_t *req = (pmix_server_req_t*)cbdata; + pmix_status_t rc; + size_t n; + pmix_info_t *xfer; + + /* add this request to our local request tracker array */ + req->local_index = pmix_pointer_array_add(&prte_pmix_server_globals.local_reqs, req); + + if (!PRTE_PROC_IS_MASTER) { + /* if we are not the DVM master, then we have to send + * this request to the master for processing */ + rc = prte_server_send_request(PRTE_PMIX_SESSION_CTRL, req); + if (PRTE_SUCCESS != rc) { goto callback; } + return; } - /* pack the number of info */ - rc = PMIx_Data_pack(NULL, buf, &cd->ninfo, 1, PMIX_SIZE); + /* if we are the DVM master, then handle this ourselves - start + * by ensuring the scheduler is connected to us */ + rc = prte_pmix_set_scheduler(); if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); goto callback; } - if (0 < cd->ninfo) { - /* pack the info */ - rc = PMIx_Data_pack(NULL, buf, cd->info, cd->ninfo, PMIX_INFO); + + /* if the requestor is the scheduler, then this is a directive + * to the DVM - e.g., to instantiate or terminate a session. 
+ * In that case, we process it here */ + if (PMIX_CHECK_PROCID(&prte_pmix_server_globals.scheduler, &req->tproc)) { + rc = process_directive(req); + } else { + // we need to pass the request on to the scheduler + // need to add the requestor's ID to the info array + PMIX_INFO_CREATE(xfer, req->ninfo + 1); + for (n=0; n < req->ninfo; n++) { + PMIX_INFO_XFER(&xfer[n], &req->info[n]); + } + PMIX_INFO_LOAD(&xfer[req->ninfo], PMIX_REQUESTOR, &req->tproc, PMIX_PROC); + // the current req object points to the caller's info array, so leave it alone + req->copy = true; + req->info = xfer; + req->ninfo++; + rc = PMIx_Session_control(req->sessionID, req->info, req->ninfo, + infocbfunc, req); if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); goto callback; } + return; } - /* send this request to the DVM controller */ - PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, buf, PRTE_RML_TAG_SCHED); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); - PMIX_DATA_BUFFER_RELEASE(buf); - goto callback; - } - PMIX_RELEASE(cd); - return; - callback: - PMIX_RELEASE(cd); - /* this section gets executed solely upon an error */ + /* this section gets executed solely upon an error or if this was a + * directive to us from the scheduler */ if (NULL != req->infocbfunc) { req->infocbfunc(rc, req->info, req->ninfo, req->cbdata, localrelease, req); return; } + pmix_pointer_array_set_item(&prte_pmix_server_globals.local_reqs, req->local_index, NULL); PMIX_RELEASE(req); } -pmix_status_t pmix_server_alloc_fn(const pmix_proc_t *client, - pmix_alloc_directive_t directive, - const pmix_info_t data[], size_t ndata, - pmix_info_cbfunc_t cbfunc, void *cbdata) -{ - prte_pmix_server_op_caddy_t *cd; - - - pmix_output_verbose(2, prte_pmix_server_globals.output, - "%s allocate upcalled on behalf of proc %s:%u 
with %" PRIsize_t " infos", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), client->nspace, client->rank, ndata); - - cd = PMIX_NEW(prte_pmix_server_op_caddy_t); - PMIX_LOAD_PROCID(&cd->proc, client->nspace, client->rank); - cd->allocdir = directive; - cd->info = (pmix_info_t *) data; - cd->ninfo = ndata; - cd->infocbfunc = cbfunc; - cd->cbdata = cbdata; - prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, pass_request, cd); - PMIX_POST_OBJECT(cd); - prte_event_active(&cd->ev, PRTE_EV_WRITE, 1); - return PRTE_SUCCESS; -} - -#if PMIX_NUMERIC_VERSION >= 0x00050000 - +/* this is the upcall from the PMIx server for the session + * control support. Since we are going to touch global structures + * (e.g., the session tracker pointer array), we have to threadshift + * this request into our own internal progress thread. Note that the + * session control directive could have come to this host from the + * scheduler, or a tool, or even an application process. */ pmix_status_t pmix_server_session_ctrl_fn(const pmix_proc_t *requestor, uint32_t sessionID, const pmix_info_t directives[], size_t ndirs, pmix_info_cbfunc_t cbfunc, void *cbdata) { - prte_pmix_server_op_caddy_t *cd; - + pmix_server_req_t *req; pmix_output_verbose(2, prte_pmix_server_globals.output, "%s session ctrl upcalled on behalf of proc %s:%u with %" PRIsize_t " directives", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), requestor->nspace, requestor->rank, ndirs); - cd = PMIX_NEW(prte_pmix_server_op_caddy_t); - PMIX_LOAD_PROCID(&cd->proc, requestor->nspace, requestor->rank); - cd->sessionID = sessionID; - cd->info = (pmix_info_t *) directives; - cd->ninfo = ndirs; - cd->infocbfunc = cbfunc; - cd->cbdata = cbdata; - prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, pass_request, cd); - PMIX_POST_OBJECT(cd); - prte_event_active(&cd->ev, PRTE_EV_WRITE, 1); + /* create a request tracker for this operation */ + req = PMIX_NEW(pmix_server_req_t); + pmix_asprintf(&req->operation, "SESSIONCTRL: %u", sessionID); + 
PMIX_PROC_LOAD(&req->tproc, requestor->nspace, requestor->rank); + req->sessionID = sessionID; + req->info = (pmix_info_t *) directives; + req->ninfo = ndirs; + req->infocbfunc = cbfunc; + req->cbdata = cbdata; + + prte_event_set(prte_event_base, &req->ev, -1, PRTE_EV_WRITE, pass_request, req); + PMIX_POST_OBJECT(req); + prte_event_active(&req->ev, PRTE_EV_WRITE, 1); return PRTE_SUCCESS; } diff --git a/src/rml/rml_types.h b/src/rml/rml_types.h index acee04b136..91dafb4cd2 100644 --- a/src/rml/rml_types.h +++ b/src/rml/rml_types.h @@ -15,7 +15,7 @@ * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -222,6 +222,7 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer, /* scheduler requests */ #define PRTE_RML_TAG_SCHED 72 +#define PRTE_RML_TAG_SCHED_RESP 73 #define PRTE_RML_TAG_MAX 100 diff --git a/src/runtime/prte_globals.c b/src/runtime/prte_globals.c index b277554273..4926d609a9 100644 --- a/src/runtime/prte_globals.c +++ b/src/runtime/prte_globals.c @@ -123,6 +123,7 @@ prte_timer_t *prte_mpiexec_timeout = NULL; int prte_stack_trace_wait_timeout = 30; /* global arrays for data storage */ +pmix_pointer_array_t *prte_sessions = NULL; pmix_pointer_array_t *prte_job_data = NULL; pmix_pointer_array_t *prte_node_pool = NULL; pmix_pointer_array_t *prte_node_topologies = NULL; @@ -143,6 +144,7 @@ char *prte_default_hostfile = NULL; bool prte_default_hostfile_given = false; int prte_num_allocated_nodes = 0; char *prte_default_dash_host = NULL; +prte_session_t *prte_default_session = NULL; /* tool communication controls */ bool prte_report_events = false; @@ -257,6 +259,131 @@ int prte_set_job_data_object(prte_job_t *jdata) return PRTE_SUCCESS; } 
+prte_session_t *prte_get_session_object(const uint32_t session_id)
+{
+    prte_session_t *session;
+    int i;
+
+    /* if the session data wasn't setup, we cannot provide the data */
+    if (NULL == prte_sessions) {
+        return NULL;
+    }
+    for (i = 0; i < prte_sessions->size; i++) {
+        session = (prte_session_t *) pmix_pointer_array_get_item(prte_sessions, i);
+        if (NULL == session) {
+            continue;
+        }
+        if (session->session_id == session_id) {
+            return session;
+        }
+    }
+    return NULL;
+}
+
+prte_session_t *prte_get_session_object_from_id(const char *id)
+{
+    prte_session_t *session;
+    int i;
+
+    /* if the session data wasn't setup, we cannot provide the data */
+    if (NULL == prte_sessions) {
+        return NULL;
+    }
+    /* if the session ID is invalid, then reject it */
+    if (NULL == id) {
+        return NULL;
+    }
+    for (i = 0; i < prte_sessions->size; i++) {
+        session = (prte_session_t *) pmix_pointer_array_get_item(prte_sessions, i);
+        if (NULL == session) {
+            continue;
+        }
+        /* alloc_refid defaults to NULL in the constructor - guard before comparing */
+        if (NULL != session->alloc_refid && 0 == strcasecmp(session->alloc_refid, id)) {
+            return session;
+        }
+    }
+    return NULL;
+}
+
+prte_session_t *prte_get_session_object_from_refid(const char *refid)
+{
+    prte_session_t *session;
+    int i;
+
+    /* if the session data wasn't setup, we cannot provide the data */
+    if (NULL == prte_sessions) {
+        return NULL;
+    }
+    /* if the session ID is invalid, then reject it */
+    if (NULL == refid) {
+        return NULL;
+    }
+    for (i = 0; i < prte_sessions->size; i++) {
+        session = (prte_session_t *) pmix_pointer_array_get_item(prte_sessions, i);
+        if (NULL == session) {
+            continue;
+        }
+        /* user_refid defaults to NULL in the constructor - guard before comparing */
+        if (NULL != session->user_refid && 0 == strcasecmp(session->user_refid, refid)) {
+            return session;
+        }
+    }
+    return NULL;
+}
+
+int prte_set_session_object(prte_session_t *session)
+{
+    prte_session_t *session_ptr;
+    int i, save = -1;
+
+    /* if the session data wasn't setup, we cannot set the data */
+    if (NULL == prte_sessions) {
+        return PRTE_ERROR;
+    }
+    /* verify that we don't already have this object */
+    for (i = 0; i < prte_sessions->size; i++) {
+        session_ptr = 
(prte_session_t *) pmix_pointer_array_get_item(prte_sessions, i); + if (NULL == session_ptr) { + if (0 > save) { + save = i; + } + continue; + } + if (session_ptr->session_id == session->session_id) { + return PRTE_EXISTS; + } + } + + if (-1 == save) { + session->index = pmix_pointer_array_add(prte_sessions, session); + if (0 > session->index) { + return PRTE_ERROR; + } + } else { + session->index = save; + pmix_pointer_array_set_item(prte_sessions, save, session); + } + return PRTE_SUCCESS; +} + +bool prte_sessions_related(prte_session_t *session1, prte_session_t *session2){ + size_t n; + prte_session_t *session_ptr; + + if(session1->session_id == session2->session_id){ + return true; + } + + for (n = 0; n < session1->children->size; n++){ + session_ptr = (prte_session_t *) pmix_pointer_array_get_item(session1->children, n); + if (NULL != session_ptr) { + if (session_ptr->session_id == session2->session_id) { + return true; + } + } + } + return false; +} + prte_proc_t *prte_get_proc_object(const pmix_proc_t *proc) { prte_job_t *jdata; @@ -466,6 +593,7 @@ PMIX_CLASS_INSTANCE(prte_app_context_t, pmix_object_t, prte_app_context_construc static void prte_job_construct(prte_job_t *job) { job->exit_code = 0; + job->session = NULL; job->personality = NULL; job->schizo = NULL; PMIX_LOAD_NSPACE(job->nspace, NULL); @@ -793,6 +921,75 @@ static void tdes(prte_topology_t *t) PMIX_CLASS_INSTANCE(prte_topology_t, pmix_object_t, tcon, tdes); +static void session_con(prte_session_t *s) +{ + s->index = -1; + s->session_id = UINT32_MAX; + s->user_refid = NULL; + s->alloc_refid = NULL; + memset(&s->timeout, 0, sizeof(struct timeval)); + s->nodes = PMIX_NEW(pmix_pointer_array_t); + pmix_pointer_array_init(s->nodes, PRTE_GLOBAL_ARRAY_BLOCK_SIZE, + PRTE_GLOBAL_ARRAY_MAX_SIZE, + PRTE_GLOBAL_ARRAY_BLOCK_SIZE); + s->jobs = PMIX_NEW(pmix_pointer_array_t); + pmix_pointer_array_init(s->jobs, PRTE_GLOBAL_ARRAY_BLOCK_SIZE, + PRTE_GLOBAL_ARRAY_MAX_SIZE, + PRTE_GLOBAL_ARRAY_BLOCK_SIZE); + 
s->children = PMIX_NEW(pmix_pointer_array_t);
+    pmix_pointer_array_init(s->children, PRTE_GLOBAL_ARRAY_BLOCK_SIZE,
+                            PRTE_GLOBAL_ARRAY_MAX_SIZE,
+                            PRTE_GLOBAL_ARRAY_BLOCK_SIZE);
+}
+static void session_des(prte_session_t *s)
+{
+    int n;
+    prte_node_t *nd;
+    prte_job_t *job;
+    prte_session_t *session;
+
+    if (NULL != s->user_refid) {
+        free(s->user_refid);
+    }
+    if (NULL != s->alloc_refid) {
+        free(s->alloc_refid);
+    }
+
+    /* NOTE: loops must advance the index n, not the object pointer s */
+    for (n=0; n < s->nodes->size; n++) {
+        nd = (prte_node_t*)pmix_pointer_array_get_item(s->nodes, n);
+        if (NULL != nd) {
+            PMIX_RELEASE(nd);
+            pmix_pointer_array_set_item(s->nodes, n, NULL);
+        }
+    }
+    PMIX_RELEASE(s->nodes);
+
+    for (n=0; n < s->jobs->size; n++) {
+        job = (prte_job_t*)pmix_pointer_array_get_item(s->jobs, n);
+        if (NULL != job) {
+            PMIX_RELEASE(job);
+            pmix_pointer_array_set_item(s->jobs, n, NULL);
+        }
+    }
+    PMIX_RELEASE(s->jobs);
+
+    for (n=0; n < s->children->size; n++) {
+        session = (prte_session_t*)pmix_pointer_array_get_item(s->children, n);
+        if (NULL != session) {
+            PMIX_RELEASE(session);
+            pmix_pointer_array_set_item(s->children, n, NULL);
+        }
+    }
+    PMIX_RELEASE(s->children);
+    // remove this from the global array
+    if (0 <= s->index) {
+        pmix_pointer_array_set_item(prte_sessions, s->index, NULL);
+    }
+}
+PMIX_CLASS_INSTANCE(prte_session_t,
+                    pmix_list_item_t,
+                    session_con, session_des);
+
 #if PRTE_PICKY_COMPILERS
 void prte_hide_unused_params(int x, ...)
{ diff --git a/src/runtime/prte_globals.h b/src/runtime/prte_globals.h index 7dd57cbe12..d3cd3495e8 100644 --- a/src/runtime/prte_globals.h +++ b/src/runtime/prte_globals.h @@ -206,6 +206,20 @@ typedef struct { } prte_topology_t; PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_topology_t); +/* Object for tracking allocations */ +typedef struct{ + pmix_object_t super; + int index; + uint32_t session_id; + char *user_refid; // PMIX_ALLOC_REQ_ID + char *alloc_refid; // PMIX_ALLOC_ID + struct timeval timeout; // time limit on session + pmix_pointer_array_t *nodes; + pmix_pointer_array_t *jobs; + pmix_pointer_array_t *children; +} prte_session_t; +PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_session_t); + /** * Information about a specific application to be launched in the RTE. */ @@ -321,6 +335,8 @@ typedef struct { /* offset to the total number of procs so shared memory * components can potentially connect to any spawned jobs*/ pmix_rank_t offset; + /* session this job is running in */ + prte_session_t *session; /* app_context array for this job */ pmix_pointer_array_t *apps; /* number of app_contexts in the array */ @@ -434,6 +450,15 @@ struct prte_proc_t { typedef struct prte_proc_t prte_proc_t; PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_proc_t); +/** Get session object */ +PRTE_EXPORT prte_session_t *prte_get_session_object(const uint32_t session_id); +PRTE_EXPORT prte_session_t *prte_get_session_object_from_id(const char *id); +PRTE_EXPORT prte_session_t *prte_get_session_object_from_refid(const char *refid); + +PRTE_EXPORT int prte_set_session_object(prte_session_t *session); + +PRTE_EXPORT bool prte_sessions_related(prte_session_t *session1, prte_session_t *session2); + /** * Get a job data object * We cannot just reference a job data object with its jobid as @@ -557,6 +582,7 @@ PRTE_EXPORT extern float prte_max_timeout; PRTE_EXPORT extern prte_timer_t *prte_mpiexec_timeout; /* global arrays for data storage */ +PRTE_EXPORT extern pmix_pointer_array_t *prte_sessions; 
PRTE_EXPORT extern pmix_pointer_array_t *prte_job_data; PRTE_EXPORT extern pmix_pointer_array_t *prte_node_pool; PRTE_EXPORT extern pmix_pointer_array_t *prte_node_topologies; @@ -577,6 +603,7 @@ PRTE_EXPORT extern char *prte_default_hostfile; PRTE_EXPORT extern bool prte_default_hostfile_given; PRTE_EXPORT extern int prte_num_allocated_nodes; PRTE_EXPORT extern char *prte_default_dash_host; +PRTE_EXPORT extern prte_session_t *prte_default_session; /* tool communication controls */ PRTE_EXPORT extern bool prte_report_events; diff --git a/src/runtime/prte_init.c b/src/runtime/prte_init.c index c19b98a97a..93541a37c6 100644 --- a/src/runtime/prte_init.c +++ b/src/runtime/prte_init.c @@ -18,7 +18,7 @@ * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -308,6 +308,27 @@ int prte_init(int *pargc, char ***pargv, prte_proc_type_t flags) error = "setup node array"; goto error; } + prte_sessions = PMIX_NEW(pmix_pointer_array_t); + ret = pmix_pointer_array_init(prte_sessions, + PRTE_GLOBAL_ARRAY_BLOCK_SIZE, + PRTE_GLOBAL_ARRAY_MAX_SIZE, + PRTE_GLOBAL_ARRAY_BLOCK_SIZE); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + error = "setup session array"; + goto error; + } + /* Create a session object for the DVM and let it point to the node pool */ + prte_default_session = PMIX_NEW(prte_session_t); + prte_default_session->session_id = UINT32_MAX; + PMIX_RELEASE(prte_default_session->nodes); + prte_default_session->nodes = prte_node_pool; + ret = prte_set_session_object(prte_default_session); + if (PRTE_SUCCESS != ret) { + error = "set session object"; + goto error; + } + prte_node_topologies = PMIX_NEW(pmix_pointer_array_t); ret = pmix_pointer_array_init(prte_node_topologies, PRTE_GLOBAL_ARRAY_BLOCK_SIZE, 
PRTE_GLOBAL_ARRAY_MAX_SIZE, diff --git a/src/runtime/prte_mca_params.c b/src/runtime/prte_mca_params.c index 090968a3a6..69477eefaa 100644 --- a/src/runtime/prte_mca_params.c +++ b/src/runtime/prte_mca_params.c @@ -54,7 +54,6 @@ static int prte_progress_thread_debug_level = -1; static char *prte_tmpdir_base = NULL; static char *prte_local_tmpdir_base = NULL; static char *prte_remote_tmpdir_base = NULL; -static char *prte_top_session_dir = NULL; static char *local_setup_slots = NULL; char *prte_signal_string = NULL; diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index 4938269489..2bb9f571d5 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -1185,7 +1185,6 @@ int main(int argc, char *argv[]) if (verbose) { pmix_output(0, "Spawning job"); } - /* let the PMIx server handle it for us so that all the job infos * get properly recorded - e.g., forwarding IOF */ PRTE_PMIX_CONSTRUCT_LOCK(&lock); diff --git a/src/tools/psched/scheduler.c b/src/tools/psched/scheduler.c index c84e2c66ed..24d5617de7 100644 --- a/src/tools/psched/scheduler.c +++ b/src/tools/psched/scheduler.c @@ -4,7 +4,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -70,6 +70,17 @@ void psched_request_init(int fd, short args, void *cbdata) req->user_refid = strdup(req->data[n].value.data.string); } else if (PMIX_CHECK_KEY(&req->data[n], PMIX_ALLOC_ID)) { req->alloc_refid = strdup(req->data[n].value.data.string); + } else if (PMIX_CHECK_KEY(&req->data[n], PMIX_SESSION_ID)) { + PMIX_VALUE_GET_NUMBER(rc, &req->data[n].value, req->sessionID, uint32_t); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + // track the first error + if (PMIX_SUCCESS == rcerr) { + rcerr = rc; + } + } + // continue processing as we may need some of the info + // when reporting back the error } else if (PMIX_CHECK_KEY(&req->data[n], PMIX_ALLOC_NUM_NODES)) { PMIX_VALUE_GET_NUMBER(rc, &req->data[n].value, req->num_nodes, uint64_t); if (PMIX_SUCCESS != rc) { diff --git a/src/util/attr.c b/src/util/attr.c index 98b45ab5c1..2bcca8e33f 100644 --- a/src/util/attr.c +++ b/src/util/attr.c @@ -3,7 +3,7 @@ * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
@@ -499,6 +499,12 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key) return "DISPLAY PARSEABLE OUTPUT"; case PRTE_JOB_EXTEND_DVM: return "EXTEND DVM"; + case PRTE_JOB_SESSION_ID: + return "SESSION ID"; + case PRTE_JOB_ALLOC_ID: + return "ALLOC ID"; + case PRTE_JOB_REF_ID: + return "ALLOC REF ID"; case PRTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; diff --git a/src/util/attr.h b/src/util/attr.h index 13eecb8344..7d8d48086d 100644 --- a/src/util/attr.h +++ b/src/util/attr.h @@ -3,7 +3,7 @@ * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -220,6 +220,12 @@ typedef uint16_t prte_job_flags_t; // are to be displayed #define PRTE_JOB_DISPLAY_PARSEABLE_OUTPUT (PRTE_JOB_START_KEY + 110) // bool - display output in machine parsable format #define PRTE_JOB_EXTEND_DVM (PRTE_JOB_START_KEY + 111) // bool - DVM is being extended +#define PRTE_JOB_SESSION_ID (PRTE_JOB_START_KEY + 112) // uint32_t - session id of this job +#define PRTE_JOB_ALLOC_ID (PRTE_JOB_START_KEY + 113) // char* - string identifier assigned by the host for the session + // within which the job is to execute +#define PRTE_JOB_REF_ID (PRTE_JOB_START_KEY + 114) // char* - string identifier assigned by the user to an allocation + // request - carried along with the session that resulted + // from the request #define PRTE_JOB_MAX_KEY (PRTE_JOB_START_KEY + 200) diff --git a/src/util/dash_host/dash_host.c b/src/util/dash_host/dash_host.c index d20cf8b4c7..cffb1a85ec 100644 --- a/src/util/dash_host/dash_host.c +++ b/src/util/dash_host/dash_host.c @@ -14,7 +14,7 @@ * Copyright (c) 2015 Research Organization for Information 
Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -738,6 +738,7 @@ int prte_util_filter_dash_host_nodes(pmix_list_t *nodes, char *hosts, bool remov return rc; } + int prte_util_get_ordered_dash_host_list(pmix_list_t *nodes, char *hosts) { int rc, i;