    "  <distr> is the distribution to use. Choices are:\n"              \
    "    \t runif[0,1] runif[-1,1] runif[-1000,1000] rsubn\n"           \
    "  <topology> is a string for logging, best used with SimGrid\n"    \
-   "  <algorithm> is a stirng for logging, best used with SimGrid\n")
+   "  <algorithm> is a string for logging, best used with SimGrid\n")

#include <cstdio>
#include <string>
#include <mpi.h>
#include <stdlib.h>
#include <stdbool.h>
#include <boost/multiprecision/mpfr.hpp>
+#include <boost/multiprecision/number.hpp>

-#include "rand.hxx"
#include "assoc.hxx"
+#include "error_semantics.hxx"
#include "mpi_op.hxx"
+#include "rand.hxx"
#include "util.hxx"

#define FLOAT_T double
using namespace boost::multiprecision;
+using namespace boost::math;

/* Note: it would be more robust to use ACCUMULATOR().operator()(a,b) instead
 * of a ACC_OP b, but this doesn't work for mpfr values */
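/* Aside (illustration only, not part of the patch): why the code uses
 * `a ACC_OP b` rather than ACCUMULATOR().operator()(a,b). std::plus<FLOAT_T>
 * takes both operands as FLOAT_T (double), and boost::multiprecision numbers
 * convert to double only explicitly, so the functor form does not compile for
 * an mpfr accumulator, while the plain operator mixes types at full precision. */
#include <functional>
#include <boost/multiprecision/mpfr.hpp>

void functor_vs_operator_sketch()
{
    boost::multiprecision::mpfr_float_1000 acc = 0;
    double x = 0.1;
    acc = acc + x;                         // OK: mpfr operator+ keeps precision
    // acc = std::plus<double>()(acc, x);  // ill-formed: no implicit mpfr->double
}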
@@ -48,21 +51,28 @@ using namespace boost::multiprecision;

const bool is_sum = std::is_same<std::plus<FLOAT_T>, ACCUMULATOR>::value;
const bool is_prod = std::is_same<std::multiplies<FLOAT_T>, ACCUMULATOR>::value;
-mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len);
+/* Fill in random vectors and do dot product according to MPI canonical ordering */
+FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum);
+/* Left-associative dot product */
+FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b);
+/* Left-associative dot product with MPFR accumulator */
+mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len);

int main(int argc, char* argv[])
{
    int taskid, numtasks;
-   long i, j, chunk, rc = 0;
+   long i, chunk, rc = 0;
    long long len, height;
    MPI_Op nc_sum_op;
    std::string distr, topo, algo;
    FLOAT_T *a, *b, *as, *bs, *rank_sum;
-   FLOAT_T mysum, nc_sum, par_sum, can_mpi_sum, rand_sum;
-   FLOAT_T starttime, endtime, ptime;
+   FLOAT_T localsum, nc_sum, par_sum, can_mpi_sum, rand_sum, serial_sum;
+   FLOAT_T starttime, endtime, ptime, ctime, stime, mpfrtime, randtreetime;
    FLOAT_T (*rand_flt_a)(); // Function to generate a random float
    FLOAT_T (*rand_flt_b)(); // Function to generate a random float
    mpfr_float_1000 mpfr_acc;
+   mpfr_float_1000 error, result;
+   FLOAT_T magnitude = 0.0;
    union udouble {
        double d;
        unsigned long u;
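/* Aside (illustration only, not part of the patch): the udouble union declared
 * here is what lets the prints below emit each sum's raw bit pattern (the
 * 0x%lx column), assuming unsigned long is 64 bits as the format implies. */
#include <cstdio>

void print_bits_sketch(double d)
{
    union udouble { double d; unsigned long u; } pv;
    pv.d = d;
    printf("%.15f\t%a\t0x%lx\n", pv.d, pv.d, pv.u); // decimal, hex-float, raw bits
}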
@@ -94,8 +104,8 @@ int main(int argc, char* argv[])
    }
    /* Select distribution for random floating point numbers */
    distr = argv[2];
-   rc = parse_distr<FLOAT_T>(distr, &rand_flt_a)
-       || parse_distr<FLOAT_T>(distr, &rand_flt_b);
+   rc = parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_a)
+       || parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_b);
    if (rc != 0) {
        if (taskid == 0) {
            fprintf(stderr, "Unrecognized distribution:\n%s", USAGE);
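/* Aside (hypothetical sketch, not part of the patch): rand.hxx is not shown in
 * this diff, so this is only an assumption about parse_distr's shape. It maps
 * the <distr> string to a generator and, after this change, also reports the
 * distribution's magnitude bound, which dot_e consumes later for the error bound. */
#include <cstdlib>
#include <string>

static double runif01_sketch() { return std::rand() / (double)RAND_MAX; }

int parse_distr_sketch(const std::string& s, double* magnitude, double (**gen)())
{
    if (s == "runif[0,1]") { *gen = runif01_sketch; *magnitude = 1.0; return 0; }
    return 1; // unrecognized distribution -> nonzero rc, as main() expects
}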
@@ -117,11 +127,13 @@ int main(int argc, char* argv[])

    /* Assign storage for dot product vectors
     * We do extra here for simplicity and so rank 0 has enough room */
+   // XXX: Do not need to malloc as and bs on each MPI process, just on rank 0.
+   // This is a bit trickier to make sure the rand is consistent though.
    a  = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    b  = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    as = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    bs = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
-   rank_sum = (FLOAT_T *) malloc(numtasks*sizeof(FLOAT_T));
+   rank_sum = (FLOAT_T*) malloc(numtasks*sizeof(FLOAT_T));

    /* Initialize dot product vectors */
    chunk = len/numtasks;
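/* Aside (illustration only, not part of the patch): chunk = len/numtasks
 * truncates, so when numtasks does not divide len the trailing len % numtasks
 * elements end up in no rank's partial sum; e.g. len = 10 on 4 ranks covers
 * only 8 elements. */
#include <cstdio>

void chunk_coverage_sketch(long long len, int numtasks)
{
    long chunk = len / numtasks;
    printf("covered %lld of %lld elements\n", (long long)chunk * numtasks, len);
}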
@@ -132,74 +144,81 @@ int main(int argc, char* argv[])
        b[i] = rand_flt_b();
    }

-   /* Perform the dot product */
+   /* Perform the dot product in parallel */
    starttime = MPI_Wtime();
-   mysum = 0.0;
+   localsum = 0.0;
    for (i = chunk*taskid; i < chunk*taskid + chunk; i++) {
-       mysum += a[i] * b[i];
+       localsum += a[i] * b[i];
    }

    /* After the dot product, perform a summation of results on each node */
-   MPI_Reduce(&mysum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-   MPI_Reduce(&mysum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
+   MPI_Reduce(&localsum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    endtime = MPI_Wtime();
+   MPI_Reduce(&localsum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
    ptime = endtime - starttime;

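/* Aside (illustration only, not part of the patch): the reason for comparing
 * MPI_SUM against the rank-ordered nc_sum_op at all is that floating-point
 * addition is not associative, so the reduction tree's shape changes the bits: */
#include <cstdio>

void nonassociativity_sketch()
{
    double big = 1e16, small = 1.0;
    double left  = (big + small) + small; // each small is absorbed: 1e16
    double right = big + (small + small); // the 2.0 survives: 1e16 + 2
    printf("%.17g vs %.17g (equal: %d)\n", left, right, left == right);
}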
-   /* Now, task 0 does all the work to check. The canonical ordering
-    * is increasing taskid */
+   /* Now, task 0 does all the work to check. The canonical ordering is increasing taskid */
    set_seed(ASSOC_SEED, 0);
    srand(ASSOC_SEED);
    if (taskid == 0) {
-       mysum = 0.0;
-       for (i = 0; i < numtasks; i++) {
-           rank_sum[i] = 0.0;
-           for (j = chunk*i; j < chunk * i + chunk; j++) {
-               as[j] = rand_flt_a();
-               bs[j] = rand_flt_b();
-               /* // Debug
-               if (as[j] != a[j] || bs[j] != b[j]) {
-                   fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
-                           as[j], a[j], bs[j], b[j]);
-               }
-               */
-               rank_sum[i] += as[j] * bs[j];
-               mysum += as[j] * bs[j];
-           }
-       }
-       can_mpi_sum = 0.0;
-       for (i = 0; i < numtasks; i++) {
-           can_mpi_sum += rank_sum[i];
-       }
+       // Do the canonical MPI dot product summation
+       starttime = MPI_Wtime();
+       can_mpi_sum = can_mpi_dot(numtasks, len, as, bs, rand_flt_a, rand_flt_b, rank_sum);
+       endtime = MPI_Wtime();
+       ctime = endtime - starttime;
+
+       // Do the serial sum
+       starttime = MPI_Wtime();
+       serial_sum = dot(len, a, b);
+       endtime = MPI_Wtime();
+       stime = endtime - starttime;

-       // Generate a random summation
+       // Generate a random dot product on the MPI ranks
+       starttime = MPI_Wtime();
        rand_sum = associative_accumulate_rand<FLOAT_T>(numtasks, rank_sum, is_sum, &height);
+       endtime = MPI_Wtime();
+       randtreetime = endtime - starttime;
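/* Aside (hypothetical sketch, not part of the patch): assoc.hxx is not shown
 * here, so this is an assumption about associative_accumulate_rand. It
 * plausibly sums the per-rank partials under a randomly chosen association
 * order and reports the height of the binary tree it happened to build: */
#include <cstdlib>
#include <algorithm>
#include <vector>

double accumulate_rand_sketch(std::vector<double> v, long long* height)
{
    if (v.empty()) { *height = 0; return 0.0; }
    std::vector<long long> h(v.size(), 0);       // height of each subtree
    while (v.size() > 1) {
        size_t i = std::rand() % (v.size() - 1); // pick a random adjacent pair
        v[i] = v[i] + v[i + 1];                  // combine the pair
        h[i] = 1 + std::max(h[i], h[i + 1]);     // new subtree height
        v.erase(v.begin() + i + 1);
        h.erase(h.begin() + i + 1);
    }
    *height = h[0];
    return v[0];
}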

-       // MPFR
+       // MPFR dot product
+       starttime = MPI_Wtime();
        mpfr_acc = mpfr_dot(as, bs, len);
+       endtime = MPI_Wtime();
+       mpfrtime = endtime - starttime;
+
+       // Error analysis
+       error = dot_e(magnitude, len, 0.0);
+
+       // TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
+       // TODO: Add in timings for MPFR and serial summations.
193
180
194
// Print header then different dot products
181
195
printf (" numtasks\t veclen\t topology\t distribution\t reduction algorithm\t order\t height\t time\t FP (decimal)\t FP (%%a)\t FP (hex)\n " );
182
- pv.d = mysum ;
196
+ pv.d = serial_sum ;
183
197
printf (" %d\t %lld\t %s\t %s\t %s\t Left assoc\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
184
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), len-1 , nan ( " " ), mysum, mysum , pv.u );
198
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), len-1 , stime, serial_sum, serial_sum , pv.u );
185
199
pv.d = rand_sum;
186
200
printf (" %d\t %lld\t %s\t %s\t %s\t Random assoc\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
187
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), height, ptime, rand_sum, rand_sum, pv.u );
188
- // TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
189
- // TODO: Add in timings for MPFR and serial summations.
201
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), height, randtreetime, rand_sum, rand_sum, pv.u );
190
202
pv.d = par_sum;
191
203
printf (" %d\t %lld\t %s\t %s\t %s\t MPI Reduce\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
192
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), 0LL , ptime, par_sum, par_sum, pv.u );
204
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), ( long long ) ceil ( log2 (numtasks)) , ptime, par_sum, par_sum, pv.u );
193
205
pv.d = nc_sum;
194
206
printf (" %d\t %lld\t %s\t %s\t %s\t MPI noncomm sum\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
195
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), 0LL , ptime, nc_sum, nc_sum, pv.u );
207
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), ( long long ) numtasks- 1 , ptime, nc_sum, nc_sum, pv.u );
196
208
pv.d = can_mpi_sum;
197
209
printf (" %d\t %lld\t %s\t %s\t %s\t Canonical MPI\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
198
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), (long long ) numtasks-1 , nan ( " " ) , can_mpi_sum, can_mpi_sum, pv.u );
210
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), (long long ) numtasks-1 , ctime , can_mpi_sum, can_mpi_sum, pv.u );
199
211
mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t MPFR(%d) left assoc\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
200
212
numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
201
213
std::numeric_limits<mpfr_float_1000>::digits, // Precision of MPFR
202
- len-1 , nan (" " ), mpfr_acc, mpfr_acc, mpfr_acc);
214
+ len - 1 , mpfrtime, mpfr_acc, mpfr_acc, mpfr_acc);
215
+ mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t Predicted error\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
216
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
217
+ len-1 , nan (" " ), error, error, error);
218
+ result = serial_sum - mpfr_acc;
219
+ mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t Left assoc error\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
220
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
221
+ len - 1 , nan (" " ), result, result, result);
203
222
}
204
223
205
224
free (a);
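/* Aside (illustration only, not part of the patch): the new height column.
 * A left-associated sum of k terms is a chain of height k-1 (hence len-1
 * serially and numtasks-1 for the rank-ordered sums), while
 * ceil(log2(numtasks)) assumes MPI_Reduce combines partials in a balanced
 * binary tree, a common implementation choice not mandated by the standard. */
#include <cmath>
#include <cstdio>

void height_sketch(int numtasks)
{
    long long chain = numtasks - 1;                     // rank-ordered sum
    long long tree  = (long long) ceil(log2(numtasks)); // balanced reduction tree
    printf("P=%d: chain height=%lld, tree height=%lld\n", numtasks, chain, tree);
}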
@@ -214,12 +233,48 @@ int main(int argc, char* argv[])
    return rc;
}

-mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len)
+FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum)
+{
+    int i, j;
+    int chunk = len/numtasks;
+    FLOAT_T can_mpi_sum = 0.0;
+    FLOAT_T localsum = 0.0;
+    for (i = 0; i < numtasks; i++) {
+        rank_sum[i] = 0.0;
+        for (j = chunk*i; j < chunk * i + chunk; j++) {
+            as[j] = rand_a();
+            bs[j] = rand_b();
+            /* // Debug
+            if (as[j] != a[j] || bs[j] != b[j]) {
+                fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
+                        as[j], a[j], bs[j], b[j]);
+            }
+            */
+            rank_sum[i] += as[j] * bs[j];
+            localsum += as[j] * bs[j];
+        }
+    }
+    for (i = 0; i < numtasks; i++) {
+        can_mpi_sum += rank_sum[i];
+    }
+    return (can_mpi_sum);
+}
+
+FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b)
+{
+    FLOAT_T acc = 0.0;
+    for (long long i = 0; i < len; i++) {
+        acc = acc + a[i] * b[i];
+    }
+    return acc;
+}
+
+mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len)
{
    mpfr_float_1000 acc;
    acc = 0.0;
    for (long long i = 0; i < len; i++) {
-       acc = acc + as[i] * bs[i];
+       acc = acc + a[i] * b[i];
    }
    return (acc);
}
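/* Aside (illustration only, not part of the patch): how the helpers factored
 * out above compose. Assuming the dot and mpfr_dot defined in this file, the
 * signed rounding error of the plain double dot product falls out directly,
 * which is what the new "Left assoc error" row prints: */
#include <iomanip>
#include <iostream>
#include <boost/multiprecision/mpfr.hpp>

double dot(long long len, double* a, double* b);                                      // defined above
boost::multiprecision::mpfr_float_1000 mpfr_dot(double* a, double* b, long long len); // defined above

void report_error_sketch(double* a, double* b, long long len)
{
    double d = dot(len, a, b);                                        // left-associative double sum
    boost::multiprecision::mpfr_float_1000 ref = mpfr_dot(a, b, len); // high-precision reference
    boost::multiprecision::mpfr_float_1000 err = d - ref;             // signed rounding error
    std::cout << "error = " << std::setprecision(20) << err << "\n";
}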