From 9408cb6362de0a8cba3bb8e4ce817446c7a50946 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 19 Oct 2024 07:56:41 -0600 Subject: [PATCH] Implement the "limit" modifier for binding Sometimes a user wants to map/bind procs by an object type, but needs to limit the number of procs bound to any particular object to some arbitrary number (i.e., not the number of available CPUs on the object). Example might be to map/bind to a cache level, but limit the number of procs on any given cache to some smaller number before moving to the next cache. Docs were updated to explain this in a prior commit. Signed-off-by: Ralph Castain --- src/hwloc/hwloc-internal.h | 15 +++++++ src/hwloc/hwloc.c | 8 ++++ src/hwloc/hwloc_base_util.c | 60 +++++++++++++++++++++++++ src/mca/rmaps/base/rmaps_base_binding.c | 16 +++++++ src/mca/rmaps/base/rmaps_base_map_job.c | 2 + 5 files changed, 101 insertions(+) diff --git a/src/hwloc/hwloc-internal.h b/src/hwloc/hwloc-internal.h index eb238d6f9c..d89b67a161 100644 --- a/src/hwloc/hwloc-internal.h +++ b/src/hwloc/hwloc-internal.h @@ -111,6 +111,18 @@ typedef struct { } prte_hwloc_topo_data_t; PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_hwloc_topo_data_t); +/** + * Struct used to cache object-level data used + * when computing process placement - the struct + * is attached to the userdata of each object + * in the topology upon first use of that object + * in a placement computation + */ +typedef struct { + pmix_object_t super; + unsigned nprocs; +} prte_hwloc_obj_data_t; +PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_hwloc_obj_data_t); /* define binding policies */ typedef uint16_t prte_binding_policy_t; @@ -263,6 +275,9 @@ PRTE_EXPORT hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, unsigned int instance); +// reset all obj counters +PRTE_EXPORT void prte_hwloc_base_reset_counters(void); + /** * Get the number of pu's under a given hwloc object. */ diff --git a/src/hwloc/hwloc.c b/src/hwloc/hwloc.c index c341a40e28..806a68e798 100644 --- a/src/hwloc/hwloc.c +++ b/src/hwloc/hwloc.c @@ -683,3 +683,11 @@ PMIX_CLASS_INSTANCE(prte_hwloc_topo_data_t, pmix_object_t, topo_data_const, NULL); + +static void obj_data_const(prte_hwloc_obj_data_t *ptr) +{ + ptr->nprocs = 0; +} +PMIX_CLASS_INSTANCE(prte_hwloc_obj_data_t, + pmix_object_t, + obj_data_const, NULL); diff --git a/src/hwloc/hwloc_base_util.c b/src/hwloc/hwloc_base_util.c index 80cd5187f8..92ef5f232a 100644 --- a/src/hwloc/hwloc_base_util.c +++ b/src/hwloc/hwloc_base_util.c @@ -1860,3 +1860,63 @@ int prte_hwloc_print(char **output, char *prefix, hwloc_topology_t src) *output = tmp; return PRTE_SUCCESS; } + +void prte_hwloc_base_reset_counters(void) +{ + prte_topology_t *ptopo; + hwloc_topology_t topo; + hwloc_obj_type_t type; + hwloc_obj_t obj; + prte_hwloc_obj_data_t *objcnt; + unsigned width, w; + unsigned depth, d; + int n; + + /* this can be a fairly expensive operation as we must traverse + * all objects of interest in all topologies since we cannot + * know which ones might have been used. Fortunately, we almost + * always have only one topology, and there aren't that many + * objects in it - so this normally goes fairly quickly + */ + + for (n = 0; n < prte_node_topologies->size; n++) { + ptopo = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, n); + if (NULL == ptopo) { + continue; + } + topo = ptopo->topo; + + /* get the max depth of the topology */ + depth = hwloc_topology_get_depth(topo); + + /* start at the first depth below the top machine level */ + for (d = 1; d < depth; d++) { + /* get the object type at this depth */ + type = hwloc_get_depth_type(topo, d); + /* if it isn't one of interest, then ignore it */ + if (HWLOC_OBJ_NUMANODE != type && HWLOC_OBJ_PACKAGE != type && + HWLOC_OBJ_L1CACHE != type && HWLOC_OBJ_L2CACHE != type && HWLOC_OBJ_L3CACHE != type && + HWLOC_OBJ_CORE != type && HWLOC_OBJ_PU != type) { + continue; + } + + /* get the width of the topology at this depth */ + width = hwloc_get_nbobjs_by_depth(topo, d); + if (0 == width) { + continue; + } + + /* scan all objects at this depth to see if + * the location overlaps with them + */ + for (w = 0; w < width; w++) { + /* get the object at this depth/index */ + obj = hwloc_get_obj_by_depth(topo, d, w); + if (NULL != obj->userdata) { + objcnt = (prte_hwloc_obj_data_t*)obj->userdata; + objcnt->nprocs = 0; + } + } + } + } +} diff --git a/src/mca/rmaps/base/rmaps_base_binding.c b/src/mca/rmaps/base/rmaps_base_binding.c index dbdf945445..5ab45bded5 100644 --- a/src/mca/rmaps/base/rmaps_base_binding.c +++ b/src/mca/rmaps/base/rmaps_base_binding.c @@ -62,6 +62,7 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, hwloc_obj_t target; hwloc_cpuset_t tgtcpus, tmpcpus; int nobjs, n; + prte_hwloc_obj_data_t *objcnt; pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps: bind %s with policy %s", @@ -95,6 +96,18 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, for (n=0; n < nobjs; n++) { tmp_obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, options->hwb, n); + // if a limit on the number of procs/object has been set, + // then check it here + if (NULL == tmp_obj->userdata) { + objcnt = PMIX_NEW(prte_hwloc_obj_data_t); + tmp_obj->userdata = (void*)objcnt; + } else { + objcnt = (prte_hwloc_obj_data_t*)tmp_obj->userdata; + } + if (0 < options->limit && options->limit <= objcnt->nprocs) { + // skip this object + continue; + } tmpcpus = tmp_obj->cpuset; hwloc_bitmap_and(prte_rmaps_base.available, node->available, tmpcpus); hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, prte_rmaps_base.baseset); @@ -115,6 +128,9 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, } if (0 < ncpus) { trg_obj = tmp_obj; + if (0 < options->limit) { + objcnt->nprocs++; + } break; } } diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index 3c7d301b22..f62f1b0725 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -111,6 +111,8 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } if (prte_get_attribute(&jdata->attributes, PRTE_JOB_BINDING_LIMIT, (void**) &u16ptr, PMIX_UINT16)) { options.limit = u16; + // reset any prior counters + prte_hwloc_base_reset_counters(); } pmix_output_verbose(5, prte_rmaps_base_framework.framework_output,