@@ -62,6 +62,9 @@ void ucp_proto_common_add_ppln_range(const ucp_proto_init_params_t *init_params,
62
62
ppln_range -> perf [UCP_PROTO_PERF_TYPE_MULTI ] =
63
63
frag_range -> perf [UCP_PROTO_PERF_TYPE_MULTI ];
64
64
65
+ ppln_range -> perf [UCP_PROTO_PERF_TYPE_CPU ] =
66
+ frag_range -> perf [UCP_PROTO_PERF_TYPE_CPU ];
67
+
65
68
ppln_range -> max_length = max_length ;
66
69
67
70
ucp_proto_perf_range_add_data (ppln_range );
@@ -96,6 +99,8 @@ void ucp_proto_perf_range_add_data(const ucp_proto_perf_range_t *range)
96
99
range -> perf [UCP_PROTO_PERF_TYPE_SINGLE ]);
97
100
ucp_proto_perf_node_add_data (range -> node , "mult" ,
98
101
range -> perf [UCP_PROTO_PERF_TYPE_MULTI ]);
102
+ ucp_proto_perf_node_add_data (range -> node , "cpu" ,
103
+ range -> perf [UCP_PROTO_PERF_TYPE_CPU ]);
99
104
}
100
105
101
106
ucs_status_t
@@ -183,23 +188,24 @@ ucp_proto_perf_envelope_make(const ucp_proto_perf_list_t *perf_list,
183
188
}
184
189
185
190
ucs_status_t
186
- ucp_proto_init_parallel_stages (const ucp_proto_init_params_t * params ,
191
+ ucp_proto_init_parallel_stages (const ucp_proto_common_init_params_t * params ,
187
192
size_t range_start , size_t range_end ,
188
193
size_t frag_size , double bias ,
189
194
const ucp_proto_perf_range_t * * stages ,
190
195
unsigned num_stages )
191
196
{
192
- ucp_proto_caps_t * caps = params -> caps ;
197
+ ucp_proto_caps_t * caps = params -> super . caps ;
193
198
ucs_linear_func_t bias_func = ucs_linear_func_make (0.0 , 1.0 - bias );
194
- UCS_ARRAY_DEFINE_ONSTACK (stage_list , ucp_proto_perf_list , 4 );
195
- UCS_ARRAY_DEFINE_ONSTACK (concave , ucp_proto_perf_envelope , 4 );
196
- const ucs_linear_func_t * single_perf , * multi_perf ;
199
+ UCS_ARRAY_DEFINE_ONSTACK (stage_list , ucp_proto_perf_list , 16 );
200
+ UCS_ARRAY_DEFINE_ONSTACK (concave , ucp_proto_perf_envelope , 16 );
201
+ ucs_linear_func_t perf [UCP_PROTO_PERF_TYPE_LAST ];
202
+ ucs_linear_func_t sum_single_perf , sum_cpu_perf ;
197
203
const ucp_proto_perf_range_t * * stage_elem ;
198
204
ucp_proto_perf_envelope_elem_t * elem ;
199
205
ucp_proto_perf_node_t * stage_node ;
206
+ ucp_proto_perf_type_t perf_type ;
200
207
ucp_proto_perf_range_t * range ;
201
208
ucs_linear_func_t * perf_elem ;
202
- ucs_linear_func_t sum_perf ;
203
209
char frag_size_str [64 ];
204
210
ucs_status_t status ;
205
211
char range_str [64 ];
@@ -211,18 +217,30 @@ ucp_proto_init_parallel_stages(const ucp_proto_init_params_t *params,
211
217
frag_size_str , bias * 100.0 );
212
218
213
219
ucs_log_indent (1 );
214
- sum_perf = UCS_LINEAR_FUNC_ZERO ;
220
+ sum_single_perf = UCS_LINEAR_FUNC_ZERO ;
221
+ sum_cpu_perf = UCS_LINEAR_FUNC_ZERO ;
215
222
ucs_carray_for_each (stage_elem , stages , num_stages ) {
216
- /* Single-fragment is adding overheads and transfer time */
217
- single_perf = & (* stage_elem )-> perf [UCP_PROTO_PERF_TYPE_SINGLE ];
218
- ucs_linear_func_add_inplace (& sum_perf , * single_perf );
223
+ UCP_PROTO_PERF_TYPE_FOREACH (perf_type ) {
224
+ perf [perf_type ] = (* stage_elem )-> perf [perf_type ];
225
+ /* For multi-fragment protocols, we need to apply the fragment
226
+ * size to the performance function linear factor.
227
+ */
228
+ if (!(params -> flags & UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG )) {
229
+ perf [perf_type ].m += perf [perf_type ].c / frag_size ;
230
+ }
231
+ }
219
232
220
- /* account for the overhead of each fragment of a multi-fragment message */
221
- multi_perf = & (* stage_elem )-> perf [UCP_PROTO_PERF_TYPE_MULTI ];
222
- perf_elem = ucs_array_append (ucp_proto_perf_list , & stage_list ,
223
- status = UCS_ERR_NO_MEMORY ; goto out );
224
- perf_elem -> c = multi_perf -> c ;
225
- perf_elem -> m = multi_perf -> m + (multi_perf -> c / frag_size );
233
+ /* Summarize single and CPU time */
234
+ ucs_linear_func_add_inplace (& sum_single_perf ,
235
+ perf [UCP_PROTO_PERF_TYPE_SINGLE ]);
236
+ ucs_linear_func_add_inplace (& sum_cpu_perf ,
237
+ perf [UCP_PROTO_PERF_TYPE_CPU ]);
238
+
239
+ /* Add all multi perf ranges to envelope array */
240
+ perf_elem = ucs_array_append (ucp_proto_perf_list , & stage_list ,
241
+ status = UCS_ERR_NO_MEMORY ;
242
+ goto out );
243
+ * perf_elem = perf [UCP_PROTO_PERF_TYPE_MULTI ];
226
244
227
245
ucs_trace ("stage[%zu] %s " UCP_PROTO_PERF_FUNC_TYPES_FMT
228
246
UCP_PROTO_PERF_FUNC_FMT (perf_elem ),
@@ -232,6 +250,12 @@ ucp_proto_init_parallel_stages(const ucp_proto_init_params_t *params,
232
250
UCP_PROTO_PERF_FUNC_ARG (perf_elem ));
233
251
}
234
252
253
+ /* Add CPU time as another parallel stage */
254
+ perf_elem = ucs_array_append (ucp_proto_perf_list , & stage_list ,
255
+ status = UCS_ERR_NO_MEMORY ;
256
+ goto out );
257
+ * perf_elem = sum_cpu_perf ;
258
+
235
259
/* Multi-fragment is pipelining overheads and network transfer */
236
260
status = ucp_proto_perf_envelope_make (& stage_list , range_start , range_end ,
237
261
0 , & concave );
@@ -242,16 +266,19 @@ ucp_proto_init_parallel_stages(const ucp_proto_init_params_t *params,
242
266
ucs_array_for_each (elem , & concave ) {
243
267
range = & caps -> ranges [caps -> num_ranges ];
244
268
range -> max_length = elem -> max_length ;
245
- range -> node = ucp_proto_perf_node_new_data (params -> proto_name , "" );
269
+ range -> node = ucp_proto_perf_node_new_data (params -> super .proto_name ,
270
+ "" );
246
271
247
272
/* "single" performance estimation is sum of "stages" with the bias */
248
273
range -> perf [UCP_PROTO_PERF_TYPE_SINGLE ] =
249
- ucs_linear_func_compose (bias_func , sum_perf );
274
+ ucs_linear_func_compose (bias_func , sum_single_perf );
250
275
251
276
/* "multiple" performance estimation is concave envelope of "stages" */
252
- multi_perf = & ucs_array_elem (& stage_list , elem -> index );
253
- range -> perf [UCP_PROTO_PERF_TYPE_MULTI ] =
254
- ucs_linear_func_compose (bias_func , * multi_perf );
277
+ range -> perf [UCP_PROTO_PERF_TYPE_MULTI ] = ucs_linear_func_compose (
278
+ bias_func , ucs_array_elem (& stage_list , elem -> index ));
279
+
280
+ /* CPU overhead is the sum of all stages */
281
+ range -> perf [UCP_PROTO_PERF_TYPE_CPU ] = sum_cpu_perf ;
255
282
256
283
ucp_proto_perf_range_add_data (range );
257
284
@@ -331,11 +358,13 @@ ucp_proto_common_init_send_perf(const ucp_proto_common_init_params_t *params,
331
358
ucp_proto_perf_node_own_child (send_perf -> node , & child_perf_node );
332
359
}
333
360
334
- /* Add constant CPU overhead */
335
- send_overhead .c += tl_perf -> send_pre_overhead ;
336
- send_perf -> perf [UCP_PROTO_PERF_TYPE_SINGLE ] = send_overhead ;
337
- send_perf -> perf [UCP_PROTO_PERF_TYPE_MULTI ] = send_overhead ;
338
- send_perf -> perf [UCP_PROTO_PERF_TYPE_MULTI ].c += tl_perf -> send_post_overhead ;
361
+ send_overhead .c += tl_perf -> send_pre_overhead ;
362
+ send_perf -> perf [UCP_PROTO_PERF_TYPE_SINGLE ] = send_overhead ;
363
+
364
+ send_overhead .c += tl_perf -> send_post_overhead ;
365
+ send_perf -> perf [UCP_PROTO_PERF_TYPE_MULTI ] = send_overhead ;
366
+ send_perf -> perf [UCP_PROTO_PERF_TYPE_CPU ] = send_overhead ;
367
+
339
368
ucp_proto_perf_range_add_data (send_perf );
340
369
341
370
return UCS_OK ;
@@ -368,6 +397,7 @@ ucp_proto_common_init_xfer_perf(const ucp_proto_common_init_params_t *params,
368
397
xfer_perf -> perf [UCP_PROTO_PERF_TYPE_SINGLE ].c += tl_perf -> latency +
369
398
tl_perf -> sys_latency ;
370
399
xfer_perf -> perf [UCP_PROTO_PERF_TYPE_MULTI ] = xfer_time ;
400
+ xfer_perf -> perf [UCP_PROTO_PERF_TYPE_CPU ] = UCS_LINEAR_FUNC_ZERO ;
371
401
372
402
/*
373
403
* Add the latency of response/ACK back from the receiver.
@@ -448,6 +478,8 @@ ucp_proto_common_init_recv_perf(const ucp_proto_common_init_params_t *params,
448
478
449
479
recv_perf -> perf [UCP_PROTO_PERF_TYPE_SINGLE ] = recv_overhead ;
450
480
recv_perf -> perf [UCP_PROTO_PERF_TYPE_MULTI ] = recv_overhead ;
481
+ recv_perf -> perf [UCP_PROTO_PERF_TYPE_CPU ] = UCS_LINEAR_FUNC_ZERO ;
482
+
451
483
ucp_proto_perf_range_add_data (recv_perf );
452
484
453
485
return UCS_OK ;
@@ -503,8 +535,8 @@ ucp_proto_common_init_caps(const ucp_proto_common_init_params_t *params,
503
535
parallel_stages [2 ] = & recv_perf ;
504
536
505
537
/* Add ranges representing sending single fragment */
506
- status = ucp_proto_init_parallel_stages (& params -> super , 0 , frag_size ,
507
- frag_size , 0.0 , parallel_stages , 3 );
538
+ status = ucp_proto_init_parallel_stages (params , 0 , frag_size , frag_size ,
539
+ 0.0 , parallel_stages , 3 );
508
540
if (status != UCS_OK ) {
509
541
goto out_deref_recv_perf ;
510
542
}
0 commit comments