Commit fe51fea (parent: 1c27876)

UCP/PROTO: Add CPU performance type
When there are multiple parallel stages and each of them consumes CPU time, we need to accumulate this CPU time rather than taking just the maximum.
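To make the rationale concrete, a standalone sketch follows (not part of the change; the struct mirrors ucs_linear_func_t and the stage costs are invented). Wall-clock time of parallel stages is governed by the slowest stage, but the CPU is busy in each of them, so its time must be summed rather than maxed:

/* Standalone model of the estimate (hypothetical, for illustration only):
 * linear_func_t mirrors ucs_linear_func_t, time(n) = c + m * n. */
#include <stdio.h>

typedef struct {
    double c; /* constant overhead, seconds */
    double m; /* cost per byte, seconds/byte */
} linear_func_t;

static double eval(linear_func_t f, double size)
{
    return f.c + f.m * size;
}

int main(void)
{
    /* two made-up parallel stages, e.g. memory copy and network send */
    linear_func_t stage_cpu[2] = {{2e-6, 1e-10}, {1e-6, 5e-11}};
    double        msg_size     = 65536.0;
    double        cpu_sum      = 0.0;
    double        cpu_max      = 0.0;
    double        t;
    int           i;

    for (i = 0; i < 2; ++i) {
        t        = eval(stage_cpu[i], msg_size);
        cpu_sum += t;
        if (t > cpu_max) {
            cpu_max = t;
        }
    }

    /* the CPU works in every stage, so the sum is the honest estimate */
    printf("max: %.3f us, sum: %.3f us\n", cpu_max * 1e6, cpu_sum * 1e6);
    return 0;
}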

File tree

10 files changed: +147, -66 lines


.clang-format

+1 line changed

@@ -73,6 +73,7 @@ ForEachMacros: ['_UCS_BITMAP_FOR_EACH_WORD',
                 'kh_foreach',
                 'kh_foreach_key',
                 'kh_foreach_value',
+                'UCP_PROTO_PERF_TYPE_FOREACH',
                 'ucp_unpacked_address_for_each',
                 'ucs_array_for_each',
                 'UCS_BITMAP_FOR_EACH_BIT',

src/ucp/proto/proto.c

+32 -1 lines changed

@@ -78,7 +78,8 @@ const ucp_proto_t *ucp_protocols[] = {

 const char *ucp_proto_perf_type_names[] = {
     [UCP_PROTO_PERF_TYPE_SINGLE] = "single",
-    [UCP_PROTO_PERF_TYPE_MULTI]  = "multi"
+    [UCP_PROTO_PERF_TYPE_MULTI]  = "multi",
+    [UCP_PROTO_PERF_TYPE_CPU]    = "cpu"
 };

 const char *ucp_operation_names[] = {
@@ -131,3 +132,33 @@ void ucp_proto_default_query(const ucp_proto_query_params_t *params,
     ucs_strncpy_safe(attr->desc, params->proto->desc, sizeof(attr->desc));
     ucs_strncpy_safe(attr->config, "", sizeof(attr->config));
 }
+
+void ucp_proto_perf_set(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST],
+                        ucs_linear_func_t func)
+{
+    ucp_proto_perf_type_t perf_type;
+
+    UCP_PROTO_PERF_TYPE_FOREACH(perf_type) {
+        perf[perf_type] = func;
+    }
+}
+
+void ucp_proto_perf_copy(ucs_linear_func_t dest[UCP_PROTO_PERF_TYPE_LAST],
+                         const ucs_linear_func_t src[UCP_PROTO_PERF_TYPE_LAST])
+{
+    ucp_proto_perf_type_t perf_type;
+
+    UCP_PROTO_PERF_TYPE_FOREACH(perf_type) {
+        dest[perf_type] = src[perf_type];
+    }
+}
+
+void ucp_proto_perf_add(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST],
+                        ucs_linear_func_t func)
+{
+    ucp_proto_perf_type_t perf_type;
+
+    UCP_PROTO_PERF_TYPE_FOREACH(perf_type) {
+        ucs_linear_func_add_inplace(&perf[perf_type], func);
+    }
+}

src/ucp/proto/proto.h

+29 -16 lines changed

@@ -113,16 +113,33 @@ typedef struct {
  * +---------+---------------+
  */
 typedef enum {
+    UCP_PROTO_PERF_TYPE_FIRST,
+
     /* Time to complete this operation assuming it's the only one. */
-    UCP_PROTO_PERF_TYPE_SINGLE,
+    UCP_PROTO_PERF_TYPE_SINGLE = UCP_PROTO_PERF_TYPE_FIRST,

     /* Time to complete this operation after all previous ones complete. */
     UCP_PROTO_PERF_TYPE_MULTI,

+    /* CPU time the operation consumes (it would be less than or equal to the
+     * SINGLE and MULTI times).
+     */
+    UCP_PROTO_PERF_TYPE_CPU,
+
     UCP_PROTO_PERF_TYPE_LAST
 } ucp_proto_perf_type_t;


+/*
+ * Iterate over performance types.
+ *
+ * @param _perf_type  Performance type iterator variable.
+ */
+#define UCP_PROTO_PERF_TYPE_FOREACH(_perf_type) \
+    for (_perf_type = UCP_PROTO_PERF_TYPE_FIRST; \
+         _perf_type < UCP_PROTO_PERF_TYPE_LAST; ++(_perf_type))
+
+
 /*
  * Performance estimation for a range of message sizes.
  */
@@ -312,20 +329,16 @@ unsigned ucp_protocols_count(void);
 void ucp_proto_default_query(const ucp_proto_query_params_t *params,
                              ucp_proto_query_attr_t *attr);

-static inline void
-ucp_proto_perf_copy(ucs_linear_func_t dest[UCP_PROTO_PERF_TYPE_LAST],
-                    const ucs_linear_func_t src[UCP_PROTO_PERF_TYPE_LAST])
-{
-    dest[UCP_PROTO_PERF_TYPE_SINGLE] = src[UCP_PROTO_PERF_TYPE_SINGLE];
-    dest[UCP_PROTO_PERF_TYPE_MULTI]  = src[UCP_PROTO_PERF_TYPE_MULTI];
-}
-
-static inline void
-ucp_proto_perf_add(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST],
-                   ucs_linear_func_t func)
-{
-    ucs_linear_func_add_inplace(&perf[UCP_PROTO_PERF_TYPE_SINGLE], func);
-    ucs_linear_func_add_inplace(&perf[UCP_PROTO_PERF_TYPE_MULTI], func);
-}
+
+void ucp_proto_perf_set(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST],
+                        ucs_linear_func_t func);
+
+
+void ucp_proto_perf_copy(ucs_linear_func_t dest[UCP_PROTO_PERF_TYPE_LAST],
+                         const ucs_linear_func_t src[UCP_PROTO_PERF_TYPE_LAST]);
+
+
+void ucp_proto_perf_add(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST],
+                        ucs_linear_func_t func);

 #endif
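For readers unfamiliar with these helpers, here is a brief usage sketch. It is not part of the commit: the function name and the cost numbers are invented, and it assumes it lives in a UCX source file that already includes this header and the UCS logging and linear-function headers.

/* Illustrative only: fills all perf types with the same base estimate and
 * then adds a constant overhead to every type at once. */
static void example_fill_perf(ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST])
{
    ucp_proto_perf_type_t perf_type;

    /* same base estimate for SINGLE, MULTI and CPU: 1us + size / 10GB/s */
    ucp_proto_perf_set(perf, ucs_linear_func_make(1e-6, 1.0 / 10e9));

    /* add a constant 0.5us overhead to every performance type at once */
    ucp_proto_perf_add(perf, ucs_linear_func_make(0.5e-6, 0.0));

    /* the new iterator macro visits SINGLE, MULTI and CPU */
    UCP_PROTO_PERF_TYPE_FOREACH(perf_type) {
        ucs_trace("perf[%d]: %.2e + %.2e * size", perf_type,
                  perf[perf_type].c, perf[perf_type].m);
    }
}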

src/ucp/proto/proto_debug.h

+4 -2 lines changed

@@ -36,10 +36,12 @@
  * of different types. See ucp_proto_perf_type_t */
 #define UCP_PROTO_PERF_FUNC_TYPES_FMT \
     UCP_PROTO_PERF_FUNC_FMT(single) \
-    UCP_PROTO_PERF_FUNC_FMT(multi)
+    UCP_PROTO_PERF_FUNC_FMT(multi) \
+    UCP_PROTO_PERF_FUNC_FMT(cpu)
 #define UCP_PROTO_PERF_FUNC_TYPES_ARG(_perf_func) \
     UCP_PROTO_PERF_FUNC_ARG((&(_perf_func)[UCP_PROTO_PERF_TYPE_SINGLE])), \
-    UCP_PROTO_PERF_FUNC_ARG((&(_perf_func)[UCP_PROTO_PERF_TYPE_MULTI]))
+    UCP_PROTO_PERF_FUNC_ARG((&(_perf_func)[UCP_PROTO_PERF_TYPE_MULTI])), \
+    UCP_PROTO_PERF_FUNC_ARG((&(_perf_func)[UCP_PROTO_PERF_TYPE_CPU]))


 /*

src/ucp/proto/proto_init.c

+60 -28 lines changed

@@ -62,6 +62,9 @@ void ucp_proto_common_add_ppln_range(const ucp_proto_init_params_t *init_params,
     ppln_range->perf[UCP_PROTO_PERF_TYPE_MULTI] =
             frag_range->perf[UCP_PROTO_PERF_TYPE_MULTI];

+    ppln_range->perf[UCP_PROTO_PERF_TYPE_CPU] =
+            frag_range->perf[UCP_PROTO_PERF_TYPE_CPU];
+
     ppln_range->max_length = max_length;

     ucp_proto_perf_range_add_data(ppln_range);
@@ -96,6 +99,8 @@ void ucp_proto_perf_range_add_data(const ucp_proto_perf_range_t *range)
                                  range->perf[UCP_PROTO_PERF_TYPE_SINGLE]);
     ucp_proto_perf_node_add_data(range->node, "mult",
                                  range->perf[UCP_PROTO_PERF_TYPE_MULTI]);
+    ucp_proto_perf_node_add_data(range->node, "cpu",
+                                 range->perf[UCP_PROTO_PERF_TYPE_CPU]);
 }

 ucs_status_t
@@ -183,23 +188,24 @@ ucp_proto_perf_envelope_make(const ucp_proto_perf_list_t *perf_list,
 }

 ucs_status_t
-ucp_proto_init_parallel_stages(const ucp_proto_init_params_t *params,
+ucp_proto_init_parallel_stages(const ucp_proto_common_init_params_t *params,
                                size_t range_start, size_t range_end,
                                size_t frag_size, double bias,
                                const ucp_proto_perf_range_t **stages,
                                unsigned num_stages)
 {
-    ucp_proto_caps_t *caps      = params->caps;
+    ucp_proto_caps_t *caps      = params->super.caps;
     ucs_linear_func_t bias_func = ucs_linear_func_make(0.0, 1.0 - bias);
-    UCS_ARRAY_DEFINE_ONSTACK(stage_list, ucp_proto_perf_list, 4);
-    UCS_ARRAY_DEFINE_ONSTACK(concave, ucp_proto_perf_envelope, 4);
-    const ucs_linear_func_t *single_perf, *multi_perf;
+    UCS_ARRAY_DEFINE_ONSTACK(stage_list, ucp_proto_perf_list, 16);
+    UCS_ARRAY_DEFINE_ONSTACK(concave, ucp_proto_perf_envelope, 16);
+    ucs_linear_func_t perf[UCP_PROTO_PERF_TYPE_LAST];
+    ucs_linear_func_t sum_single_perf, sum_cpu_perf;
     const ucp_proto_perf_range_t **stage_elem;
     ucp_proto_perf_envelope_elem_t *elem;
     ucp_proto_perf_node_t *stage_node;
+    ucp_proto_perf_type_t perf_type;
     ucp_proto_perf_range_t *range;
     ucs_linear_func_t *perf_elem;
-    ucs_linear_func_t sum_perf;
     char frag_size_str[64];
     ucs_status_t status;
     char range_str[64];
@@ -211,18 +217,30 @@ ucp_proto_init_parallel_stages(const ucp_proto_common_init_params_t *params,
               frag_size_str, bias * 100.0);

     ucs_log_indent(1);
-    sum_perf = UCS_LINEAR_FUNC_ZERO;
+    sum_single_perf = UCS_LINEAR_FUNC_ZERO;
+    sum_cpu_perf    = UCS_LINEAR_FUNC_ZERO;
     ucs_carray_for_each(stage_elem, stages, num_stages) {
-        /* Single-fragment is adding overheads and transfer time */
-        single_perf = &(*stage_elem)->perf[UCP_PROTO_PERF_TYPE_SINGLE];
-        ucs_linear_func_add_inplace(&sum_perf, *single_perf);
+        UCP_PROTO_PERF_TYPE_FOREACH(perf_type) {
+            perf[perf_type] = (*stage_elem)->perf[perf_type];
+            /* For multi-fragment protocols, we need to apply the fragment
+             * size to the performance function linear factor.
+             */
+            if (!(params->flags & UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG)) {
+                perf[perf_type].m += perf[perf_type].c / frag_size;
+            }
+        }

-        /* account for the overhead of each fragment of a multi-fragment message */
-        multi_perf = &(*stage_elem)->perf[UCP_PROTO_PERF_TYPE_MULTI];
-        perf_elem  = ucs_array_append(ucp_proto_perf_list, &stage_list,
-                                      status = UCS_ERR_NO_MEMORY; goto out);
-        perf_elem->c = multi_perf->c;
-        perf_elem->m = multi_perf->m + (multi_perf->c / frag_size);
+        /* Summarize single and CPU time */
+        ucs_linear_func_add_inplace(&sum_single_perf,
+                                    perf[UCP_PROTO_PERF_TYPE_SINGLE]);
+        ucs_linear_func_add_inplace(&sum_cpu_perf,
+                                    perf[UCP_PROTO_PERF_TYPE_CPU]);
+
+        /* Add all multi perf ranges to envelope array */
+        perf_elem  = ucs_array_append(ucp_proto_perf_list, &stage_list,
+                                      status = UCS_ERR_NO_MEMORY;
+                                      goto out);
+        *perf_elem = perf[UCP_PROTO_PERF_TYPE_MULTI];

         ucs_trace("stage[%zu] %s " UCP_PROTO_PERF_FUNC_TYPES_FMT
                   UCP_PROTO_PERF_FUNC_FMT(perf_elem),
@@ -232,6 +250,12 @@ ucp_proto_init_parallel_stages(const ucp_proto_common_init_params_t *params,
                   UCP_PROTO_PERF_FUNC_ARG(perf_elem));
     }

+    /* Add CPU time as another parallel stage */
+    perf_elem  = ucs_array_append(ucp_proto_perf_list, &stage_list,
+                                  status = UCS_ERR_NO_MEMORY;
+                                  goto out);
+    *perf_elem = sum_cpu_perf;
+
     /* Multi-fragment is pipelining overheads and network transfer */
     status = ucp_proto_perf_envelope_make(&stage_list, range_start, range_end,
                                           0, &concave);
@@ -242,16 +266,19 @@ ucp_proto_init_parallel_stages(const ucp_proto_common_init_params_t *params,
     ucs_array_for_each(elem, &concave) {
         range             = &caps->ranges[caps->num_ranges];
         range->max_length = elem->max_length;
-        range->node       = ucp_proto_perf_node_new_data(params->proto_name, "");
+        range->node       = ucp_proto_perf_node_new_data(params->super.proto_name,
+                                                         "");

         /* "single" performance estimation is sum of "stages" with the bias */
         range->perf[UCP_PROTO_PERF_TYPE_SINGLE] =
-                ucs_linear_func_compose(bias_func, sum_perf);
+                ucs_linear_func_compose(bias_func, sum_single_perf);

         /* "multiple" performance estimation is concave envelope of "stages" */
-        multi_perf = &ucs_array_elem(&stage_list, elem->index);
-        range->perf[UCP_PROTO_PERF_TYPE_MULTI] =
-                ucs_linear_func_compose(bias_func, *multi_perf);
+        range->perf[UCP_PROTO_PERF_TYPE_MULTI] = ucs_linear_func_compose(
+                bias_func, ucs_array_elem(&stage_list, elem->index));
+
+        /* CPU overhead is the sum of all stages */
+        range->perf[UCP_PROTO_PERF_TYPE_CPU] = sum_cpu_perf;

         ucp_proto_perf_range_add_data(range);

@@ -331,11 +358,13 @@ ucp_proto_common_init_send_perf(const ucp_proto_common_init_params_t *params,
         ucp_proto_perf_node_own_child(send_perf->node, &child_perf_node);
     }

-    /* Add constant CPU overhead */
-    send_overhead.c += tl_perf->send_pre_overhead;
-    send_perf->perf[UCP_PROTO_PERF_TYPE_SINGLE] = send_overhead;
-    send_perf->perf[UCP_PROTO_PERF_TYPE_MULTI] = send_overhead;
-    send_perf->perf[UCP_PROTO_PERF_TYPE_MULTI].c += tl_perf->send_post_overhead;
+    send_overhead.c += tl_perf->send_pre_overhead;
+    send_perf->perf[UCP_PROTO_PERF_TYPE_SINGLE] = send_overhead;
+
+    send_overhead.c += tl_perf->send_post_overhead;
+    send_perf->perf[UCP_PROTO_PERF_TYPE_MULTI] = send_overhead;
+    send_perf->perf[UCP_PROTO_PERF_TYPE_CPU]   = send_overhead;
+
     ucp_proto_perf_range_add_data(send_perf);

     return UCS_OK;
@@ -368,6 +397,7 @@ ucp_proto_common_init_xfer_perf(const ucp_proto_common_init_params_t *params,
     xfer_perf->perf[UCP_PROTO_PERF_TYPE_SINGLE].c += tl_perf->latency +
                                                      tl_perf->sys_latency;
     xfer_perf->perf[UCP_PROTO_PERF_TYPE_MULTI] = xfer_time;
+    xfer_perf->perf[UCP_PROTO_PERF_TYPE_CPU]   = UCS_LINEAR_FUNC_ZERO;

     /*
      * Add the latency of response/ACK back from the receiver.
@@ -448,6 +478,8 @@ ucp_proto_common_init_recv_perf(const ucp_proto_common_init_params_t *params,

     recv_perf->perf[UCP_PROTO_PERF_TYPE_SINGLE] = recv_overhead;
     recv_perf->perf[UCP_PROTO_PERF_TYPE_MULTI]  = recv_overhead;
+    recv_perf->perf[UCP_PROTO_PERF_TYPE_CPU]    = UCS_LINEAR_FUNC_ZERO;
+
     ucp_proto_perf_range_add_data(recv_perf);

     return UCS_OK;
@@ -503,8 +535,8 @@ ucp_proto_common_init_caps(const ucp_proto_common_init_params_t *params,
     parallel_stages[2] = &recv_perf;

     /* Add ranges representing sending single fragment */
-    status = ucp_proto_init_parallel_stages(&params->super, 0, frag_size,
-                                            frag_size, 0.0, parallel_stages, 3);
+    status = ucp_proto_init_parallel_stages(params, 0, frag_size, frag_size,
+                                            0.0, parallel_stages, 3);
     if (status != UCS_OK) {
         goto out_deref_recv_perf;
     }
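The FOREACH loop above folds the constant per-fragment cost into the slope of the linear function, perf[perf_type].m += perf[perf_type].c / frag_size. A standalone sketch of that arithmetic (the values are invented, not taken from the commit) shows that the folded function matches the cost of sending the fragments back-to-back:

/* Standalone arithmetic check (hypothetical values, illustration only) */
#include <stdio.h>

int main(void)
{
    double c         = 2e-6;      /* constant cost per fragment, seconds */
    double m         = 1e-10;     /* cost per byte, seconds/byte */
    double frag_size = 8192.0;
    double msg_size  = 1048576.0; /* a whole number of fragments */
    double num_frags = msg_size / frag_size;

    /* cost of sending every fragment back-to-back */
    double per_frag_total = num_frags * (c + m * frag_size);

    /* same cost expressed as a linear function of the message size */
    double folded_slope = m + c / frag_size;
    double folded_total = folded_slope * msg_size;

    printf("per-fragment: %.3f us, folded: %.3f us\n",
           per_frag_total * 1e6, folded_total * 1e6);
    return 0;
}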

src/ucp/proto/proto_init.h

+1 -1 lines changed

@@ -79,7 +79,7 @@ ucp_proto_perf_envelope_make(const ucp_proto_perf_list_t *perf_list,
  * @param [in] num_stages  Number of parallel stages in the protocol.
  */
 ucs_status_t
-ucp_proto_init_parallel_stages(const ucp_proto_init_params_t *params,
+ucp_proto_init_parallel_stages(const ucp_proto_common_init_params_t *params,
                                size_t range_start, size_t range_end,
                                size_t frag_size, double bias,
                                const ucp_proto_perf_range_t **stages,

src/ucp/proto/proto_reconfig.c

+1 -4 lines changed

@@ -70,7 +70,6 @@ static ucs_status_t
 ucp_proto_reconfig_init(const ucp_proto_init_params_t *init_params)
 {
     ucp_proto_perf_range_t *perf_range = &init_params->caps->ranges[0];
-    ucp_proto_perf_type_t perf_type;

     /* Default reconfiguration protocol is a fallback for any case protocol
      * selection is unsuccessful. The protocol keeps queuing requests until they
@@ -85,9 +84,7 @@ ucp_proto_reconfig_init(const ucp_proto_init_params_t *init_params)

     /* Set the performance estimation as worse than any other protocol */
     perf_range->max_length = SIZE_MAX;
-    for (perf_type = 0; perf_type < UCP_PROTO_PERF_TYPE_LAST; ++perf_type) {
-        perf_range->perf[perf_type] = ucs_linear_func_make(INFINITY, 0);
-    }
+    ucp_proto_perf_set(perf_range->perf, ucs_linear_func_make(INFINITY, 0));

     perf_range->node = ucp_proto_perf_node_new_data("dummy", "");
     return UCS_OK;

src/ucp/rma/amo_sw.c

+2 -1 lines changed

@@ -393,7 +393,8 @@ ucp_proto_amo_sw_init(const ucp_proto_init_params_t *init_params, unsigned flags
         .super.hdr_size   = 0,
         .super.send_op    = UCT_EP_OP_AM_BCOPY,
         .super.memtype_op = UCT_EP_OP_GET_SHORT,
-        .super.flags      = flags | UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE,
+        .super.flags      = flags | UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG |
+                            UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE,
         .lane_type        = UCP_LANE_TYPE_AM,
         .tl_cap_flags     = 0
     };

src/ucp/rndv/proto_rndv.c

+10 -5 lines changed

@@ -275,7 +275,8 @@ ucp_proto_rndv_ctrl_init(const ucp_proto_rndv_ctrl_init_params_t *params)
     ucs_trace("rndv" UCP_PROTO_TIME_FMT(ctrl_latency),
               UCP_PROTO_TIME_ARG(ctrl_latency));
     ctrl_perf.perf[UCP_PROTO_PERF_TYPE_SINGLE] =
-    ctrl_perf.perf[UCP_PROTO_PERF_TYPE_MULTI] = ucs_linear_func_add3(
+    ctrl_perf.perf[UCP_PROTO_PERF_TYPE_MULTI]  =
+    ctrl_perf.perf[UCP_PROTO_PERF_TYPE_CPU]    = ucs_linear_func_add3(
             memreg_time, ucs_linear_func_make(ctrl_latency, 0.0),
             params->unpack_time);
     ucp_proto_perf_range_add_data(&ctrl_perf);
@@ -306,9 +307,9 @@ ucp_proto_rndv_ctrl_init(const ucp_proto_rndv_ctrl_init_params_t *params)

     parallel_stages[0] = &ctrl_perf;
     parallel_stages[1] = &remote_perf;
-    status = ucp_proto_init_parallel_stages(&params->super.super,
-                                            min_length, range_max_length,
-                                            SIZE_MAX, params->perf_bias,
+    status = ucp_proto_init_parallel_stages(&params->super, min_length,
+                                            range_max_length, SIZE_MAX,
+                                            params->perf_bias,
                                             parallel_stages, 2);
     if (status != UCS_OK) {
         goto out_deref_perf_node;
@@ -400,7 +401,9 @@ ucp_proto_rndv_ack_perf(const ucp_proto_init_params_t *init_params,

     ack_perf[UCP_PROTO_PERF_TYPE_SINGLE] =
             ucs_linear_func_make(send_time + receive_time, 0);
-    ack_perf[UCP_PROTO_PERF_TYPE_MULTI] = ucs_linear_func_make(send_time, 0);
+    ack_perf[UCP_PROTO_PERF_TYPE_MULTI] =
+    ack_perf[UCP_PROTO_PERF_TYPE_CPU]   =
+            ucs_linear_func_make(send_time, 0);

     return UCS_OK;
 }
@@ -440,6 +443,8 @@ ucs_status_t ucp_proto_rndv_ack_init(const ucp_proto_init_params_t *init_params,
                                  ack_perf[UCP_PROTO_PERF_TYPE_SINGLE]);
     ucp_proto_perf_node_add_data(ack_perf_node, "mult",
                                  ack_perf[UCP_PROTO_PERF_TYPE_MULTI]);
+    ucp_proto_perf_node_add_data(ack_perf_node, "cpu",
+                                 ack_perf[UCP_PROTO_PERF_TYPE_CPU]);

     /* Copy basic capabilities from bulk protocol */
     init_params->caps->cfg_thresh = bulk_caps->cfg_thresh;
