Skip to content

Commit bad58cf

Browse files
author
Sam Pollard
committed
Added theoretical error analysis for dot product
1 parent 5617a77 commit bad58cf

7 files changed

+173
-69
lines changed

README.md

-4
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,3 @@ tar -a -cvf datasets.tar.bz2 README.md nekbone.tsv subn.tsv openmpi talapas simg
106106
Happy reproducing
107107
-Sam
108108

109-
## Acknowledgments
110-
This work is partially funded by Sandia National Laboratories.
111-
112-
Sandia National Laboratories is a multimission laboratory managed and operated by National Technology and Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International, Inc., for the U.S. Department of Energy's National Nuclear Security Administration under contract DE-NA-0003525.

src/Makefile

+11-11
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,28 @@ USE_MPI ?= 1
77
MPICXX ?= smpicxx
88
#MPICXX = mpicxx
99

10+
# If using a different install location for simgrid, specify that here
11+
LDFLAGS += -L$${HOME}/.local/simgrid/lib -Wl,-rpath=$${HOME}/.local/simgrid/lib
12+
1013
# Experiment Parameters
11-
EXP_DIR = experiments/post-acceptance
14+
EXP_DIR = experiments/2021
1215
RAND_TRIALS = 1000000
1316
VECLEN_RAND_QUICK = 100000
1417
VECLEN_RAND_BIG = 2000000
1518
RAND_TRIALS_DEEP = 5000000
1619
VECLEN_RAND_DEEP = 256
1720

18-
EXTRA_SOURCES = assoc.cxx mpi_op.cxx rand.cxx
19-
HEADERS = assoc.hxx mpi_op.hxx rand.hxx util.hxx
21+
EXTRA_SOURCES = assoc.cxx error_semantics.cxx mpi_op.cxx rand.cxx
22+
HEADERS = assoc.hxx error_semantics.hxx mpi_op.hxx rand.hxx util.hxx
2023
ifeq ($(USE_MPI), 1)
2124
TARGETS = mpi_pi_reduce dotprod_mpi
2225
else
2326
TARGETS = assoc_test gen_random
2427
endif
2528

26-
# SimGrid options
27-
LDFLAGS += -L$${HOME}/.local/simgrid/lib -Wl,-rpath=$${HOME}/.local/simgrid/lib
28-
LIBS += -lmpfr -lgmp
29-
CXXFLAGS += -Wall -g
30-
3129
# Shouldn't need to change
30+
LIBS += -lmpfr -lgmp
31+
CXXFLAGS += -Wall -g -std=c++14
3232
OBJECTS = $(EXTRA_SOURCES:.cxx=.o)
3333
TARGET_OBJS = $(TARGETS:=.o)
3434

@@ -41,12 +41,11 @@ else
4141
$(CXX) $(CXXFLAGS) -c $^
4242
endif
4343

44-
4544
# MPI targets
4645
ifeq ($(USE_MPI),1)
4746
mpi_pi_reduce: mpi_pi_reduce.o rand.o
4847
$(MPICXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
49-
dotprod_mpi : $(OBJECTS) rand.o mpi_op.o dotprod_mpi.o
48+
dotprod_mpi : dotprod_mpi.o assoc.o error_semantics.o mpi_op.o rand.o
5049
$(MPICXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
5150
else
5251
# Non-MPI targets
@@ -115,7 +114,8 @@ clean :
115114
# Dependency lists
116115
assoc.o : assoc.hxx
117116
assoc_test.o : assoc.hxx rand.hxx util.hxx
118-
dotprod_mpi.o : rand.hxx assoc.hxx mpi_op.hxx
117+
error_semantics.o : error_semantics.hxx
118+
dotprod_mpi.o : error_semantics.hxx rand.hxx assoc.hxx mpi_op.hxx util.hxx
119119
gen_random.o : rand.hxx
120120
mpi_op.o : mpi_op.hxx
121121
mpi_pi_reduce.o : rand.hxx

src/dotprod_mpi.cxx

+104-49
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,25 @@
2222
"<distr> is the distribution to use. Choices are:\n"\
2323
"\trunif[0,1] runif[-1,1] runif[-1000,1000] rsubn\n"\
2424
"<topology> is a string for logging, best used with SimGrid\n"\
25-
"<algorithm> is a stirng for logging, best used with SimGrid\n")
25+
"<algorithm> is a string for logging, best used with SimGrid\n")
2626

2727
#include <cstdio>
2828
#include <string>
2929
#include <mpi.h>
3030
#include <stdlib.h>
3131
#include <stdbool.h>
3232
#include <boost/multiprecision/mpfr.hpp>
33+
#include <boost/multiprecision/number.hpp>
3334

34-
#include "rand.hxx"
3535
#include "assoc.hxx"
36+
#include "error_semantics.hxx"
3637
#include "mpi_op.hxx"
38+
#include "rand.hxx"
3739
#include "util.hxx"
3840

3941
#define FLOAT_T double
4042
using namespace boost::multiprecision;
43+
using namespace boost::math;
4144

4245
/* Note: it would be more robust to use ACCUMULATOR().operator()(a,b) instead
4346
* of a ACC_OP b, but this doesn't work for mpfr values */
@@ -48,21 +51,28 @@ using namespace boost::multiprecision;
4851

4952
const bool is_sum = std::is_same<std::plus<FLOAT_T>, ACCUMULATOR>::value;
5053
const bool is_prod = std::is_same<std::multiplies<FLOAT_T>, ACCUMULATOR>::value;
51-
mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len);
54+
/* Fill in random vectors and do dot product according to MPI canonical ordering */
55+
FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum);
56+
/* Left-associative dot product */
57+
FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b);
58+
/* Left-associative dot product with MPFR accumulator */
59+
mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len);
5260

5361
int main (int argc, char* argv[])
5462
{
5563
int taskid, numtasks;
56-
long i, j, chunk, rc=0;
64+
long i, chunk, rc=0;
5765
long long len, height;
5866
MPI_Op nc_sum_op;
5967
std::string distr, topo, algo;
6068
FLOAT_T *a, *b, *as, *bs, *rank_sum;
61-
FLOAT_T mysum, nc_sum, par_sum, can_mpi_sum, rand_sum;
62-
FLOAT_T starttime, endtime, ptime;
69+
FLOAT_T localsum, nc_sum, par_sum, can_mpi_sum, rand_sum, serial_sum;
70+
FLOAT_T starttime, endtime, ptime, ctime, stime, mpfrtime, randtreetime;
6371
FLOAT_T (*rand_flt_a)(); // Function to generate a random float
6472
FLOAT_T (*rand_flt_b)(); // Function to generate a random float
6573
mpfr_float_1000 mpfr_acc;
74+
mpfr_float_1000 error, result;
75+
FLOAT_T magnitude = 0.0;
6676
union udouble {
6777
double d;
6878
unsigned long u;
@@ -94,8 +104,8 @@ int main (int argc, char* argv[])
94104
}
95105
/* Select distribution for random floating point numbers */
96106
distr = argv[2];
97-
rc = parse_distr<FLOAT_T>(distr, &rand_flt_a)
98-
|| parse_distr<FLOAT_T>(distr, &rand_flt_b);
107+
rc = parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_a)
108+
|| parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_b);
99109
if (rc != 0) {
100110
if (taskid == 0) {
101111
fprintf(stderr, "Unrecognized distribution:\n%s", USAGE);
@@ -117,11 +127,13 @@ int main (int argc, char* argv[])
117127

118128
/* Assign storage for dot product vectors
119129
* We do extra here for simplicity and so rank 0 has enough room */
130+
// XXX: Do not need to malloc as and bs on each MPI process, just on rank 0
131+
// This is a bit trickier to make sure the rand is consistent though.
120132
a = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
121133
b = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
122134
as = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
123135
bs = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
124-
rank_sum = (FLOAT_T *) malloc (numtasks*sizeof(FLOAT_T));
136+
rank_sum = (FLOAT_T*) malloc (numtasks*sizeof(FLOAT_T));
125137

126138
/* Initialize dot product vectors */
127139
chunk = len/numtasks;
@@ -132,74 +144,81 @@ int main (int argc, char* argv[])
132144
b[i] = rand_flt_b();
133145
}
134146

135-
/* Perform the dot product */
147+
/* Perform the dot product in parallel */
136148
starttime = MPI_Wtime();
137-
mysum = 0.0;
149+
localsum = 0.0;
138150
for (i = chunk*taskid; i < chunk*taskid + chunk; i++) {
139-
mysum += a[i] * b[i];
151+
localsum += a[i] * b[i];
140152
}
141153

142154
/* After the dot product, perform a summation of results on each node */
143-
MPI_Reduce(&mysum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
144-
MPI_Reduce(&mysum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
155+
MPI_Reduce(&localsum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
145156
endtime = MPI_Wtime();
157+
MPI_Reduce(&localsum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
146158
ptime = endtime - starttime;
147159

148-
/* Now, task 0 does all the work to check. The canonical ordering
149-
* is increasing taskid */
160+
/* Now, task 0 does all the work to check. The canonical ordering * is increasing taskid */
150161
set_seed(ASSOC_SEED, 0);
151162
srand(ASSOC_SEED);
152163
if (taskid == 0) {
153-
mysum = 0.0;
154-
for (i = 0; i < numtasks; i++) {
155-
rank_sum[i] = 0.0;
156-
for (j = chunk*i; j < chunk * i + chunk; j++) {
157-
as[j] = rand_flt_a();
158-
bs[j] = rand_flt_b();
159-
/* // Debug
160-
if (as[j] != a[j] || bs[j] != b[j]) {
161-
fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
162-
as[j], a[j], bs[j], b[j]);
163-
}
164-
*/
165-
rank_sum[i] += as[j] * bs[j];
166-
mysum += as[j] * bs[j];
167-
}
168-
}
169-
can_mpi_sum = 0.0;
170-
for (i = 0; i < numtasks; i++) {
171-
can_mpi_sum += rank_sum[i];
172-
}
164+
// Do the canonical MPI dot product summation
165+
starttime = MPI_Wtime();
166+
can_mpi_sum = can_mpi_dot(numtasks, len, as, bs, rand_flt_a, rand_flt_b, rank_sum);
167+
endtime = MPI_Wtime();
168+
ctime = endtime - starttime;
169+
170+
// Do the serial sum
171+
starttime = MPI_Wtime();
172+
serial_sum = dot(len, a, b);
173+
endtime = MPI_Wtime();
174+
stime = endtime - starttime;
173175

174-
// Generate a random summation
176+
// Generate a random dot product on the MPI ranks
177+
starttime = MPI_Wtime();
175178
rand_sum = associative_accumulate_rand<FLOAT_T>(numtasks, rank_sum, is_sum, &height);
179+
endtime = MPI_Wtime();
180+
randtreetime = endtime - starttime;
176181

177-
// MPFR
182+
// MPFR dot product
183+
starttime = MPI_Wtime();
178184
mpfr_acc = mpfr_dot(as, bs, len);
185+
endtime = MPI_Wtime();
186+
mpfrtime = endtime - starttime;
187+
188+
// Error analysis
189+
error = dot_e(magnitude, len, 0.0);
190+
191+
// TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
192+
// TODO: Add in timings for MPFR and serial summations.
179193

180194
// Print header then different dot products
181195
printf("numtasks\tveclen\ttopology\tdistribution\treduction algorithm\torder\theight\ttime\tFP (decimal)\tFP (%%a)\tFP (hex)\n");
182-
pv.d = mysum;
196+
pv.d = serial_sum;
183197
printf("%d\t%lld\t%s\t%s\t%s\tLeft assoc\t%lld\t%f\t%.15f\t%a\t0x%lx\n",
184-
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), len-1, nan(""), mysum, mysum, pv.u);
198+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), len-1, stime, serial_sum, serial_sum, pv.u);
185199
pv.d = rand_sum;
186200
printf("%d\t%lld\t%s\t%s\t%s\tRandom assoc\t%lld\t%f\t%.15f\t%a\t0x%lx\n",
187-
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), height, ptime, rand_sum, rand_sum, pv.u);
188-
// TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
189-
// TODO: Add in timings for MPFR and serial summations.
201+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), height, randtreetime, rand_sum, rand_sum, pv.u);
190202
pv.d = par_sum;
191203
printf("%d\t%lld\t%s\t%s\t%s\tMPI Reduce\t%lld\t%f\t%.15f\t%a\t0x%lx\n",
192-
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), 0LL, ptime, par_sum, par_sum, pv.u);
204+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), (long long) ceil(log2(numtasks)), ptime, par_sum, par_sum, pv.u);
193205
pv.d = nc_sum;
194206
printf("%d\t%lld\t%s\t%s\t%s\tMPI noncomm sum\t%lld\t%f\t%.15f\t%a\t0x%lx\n",
195-
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), 0LL, ptime, nc_sum, nc_sum, pv.u);
207+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), (long long) numtasks-1, ptime, nc_sum, nc_sum, pv.u);
196208
pv.d = can_mpi_sum;
197209
printf("%d\t%lld\t%s\t%s\t%s\tCanonical MPI\t%lld\t%f\t%.15f\t%a\t0x%lx\n",
198-
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), (long long) numtasks-1, nan(""), can_mpi_sum, can_mpi_sum, pv.u);
210+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(), (long long) numtasks-1, ctime, can_mpi_sum, can_mpi_sum, pv.u);
199211
mpfr_printf("%d\t%lld\t%s\t%s\t%s\tMPFR(%d) left assoc\t%lld\t%f\t%.20RNf\t%.20RNa\t%RNa\n",
200212
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(),
201213
std::numeric_limits<mpfr_float_1000>::digits, // Precision of MPFR
202-
len-1, nan(""), mpfr_acc, mpfr_acc, mpfr_acc);
214+
len - 1, mpfrtime, mpfr_acc, mpfr_acc, mpfr_acc);
215+
mpfr_printf("%d\t%lld\t%s\t%s\t%s\tPredicted error\t%lld\t%f\t%.20RNf\t%.20RNa\t%RNa\n",
216+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(),
217+
len-1, nan(""), error, error, error);
218+
result = serial_sum - mpfr_acc;
219+
mpfr_printf("%d\t%lld\t%s\t%s\t%s\tLeft assoc error\t%lld\t%f\t%.20RNf\t%.20RNa\t%RNa\n",
220+
numtasks, len, topo.c_str(), distr.c_str(), algo.c_str(),
221+
len - 1, nan(""), result, result, result);
203222
}
204223

205224
free(a);
@@ -214,12 +233,48 @@ int main (int argc, char* argv[])
214233
return rc;
215234
}
216235

217-
mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len)
236+
FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum)
237+
{
238+
int i, j;
239+
int chunk = len/numtasks;
240+
FLOAT_T can_mpi_sum = 0.0;
241+
FLOAT_T localsum = 0.0;
242+
for (i = 0; i < numtasks; i++) {
243+
rank_sum[i] = 0.0;
244+
for (j = chunk*i; j < chunk * i + chunk; j++) {
245+
as[j] = rand_a();
246+
bs[j] = rand_b();
247+
/* // Debug
248+
if (as[j] != a[j] || bs[j] != b[j]) {
249+
fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
250+
as[j], a[j], bs[j], b[j]);
251+
}
252+
*/
253+
rank_sum[i] += as[j] * bs[j];
254+
localsum += as[j] * bs[j];
255+
}
256+
}
257+
for (i = 0; i < numtasks; i++) {
258+
can_mpi_sum += rank_sum[i];
259+
}
260+
return(can_mpi_sum);
261+
}
262+
263+
FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b)
264+
{
265+
FLOAT_T acc = 0.0;
266+
for (long long i = 0; i < len; i++) {
267+
acc = acc + a[i] * b[i];
268+
}
269+
return acc;
270+
}
271+
272+
mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len)
218273
{
219274
mpfr_float_1000 acc;
220275
acc = 0.0;
221276
for (long long i = 0; i < len; i++) {
222-
acc = acc + as[i] * bs[i];
277+
acc = acc + a[i] * b[i];
223278
}
224279
return(acc);
225280
}

src/error_semantics.cxx

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#ifndef ERROR_SEMANTICS_CXX
2+
#define ERROR_SEMANTICS_CXX
3+
4+
#include "error_semantics.hxx"
5+
6+
using namespace boost::multiprecision;
7+
8+
template <typename FLOAT_T>
9+
mpfr_float_1000 gamma(long long int n)
10+
{
11+
return n * mach_eps<FLOAT_T> / (1.0 - n * mach_eps<FLOAT_T>);
12+
}
13+
14+
mpfr_float_1000 dot_e(double mag, long long int n, mpfr_float_1000 e)
15+
{
16+
mpfr_float_1000 rv;
17+
/* e * |x| . |y| * \gamma_n + nd(1 + \theta_{n-1}) */
18+
rv = e + mag * mag * n * gamma<double>(n)
19+
+ n * mach_del_dbl * (1.0 + gamma<double>(n-1));
20+
return(rv);
21+
}
22+
23+
#endif

src/error_semantics.hxx

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/* Semantics of floating-point error.
2+
* Given an operation, return the error bounds.
3+
*/
4+
#ifndef ERROR_SEMANTICS_HXX
5+
#define ERROR_SEMANTICS_HXX
6+
7+
#include <limits>
8+
#include <boost/multiprecision/mpfr.hpp>
9+
#include <boost/multiprecision/number.hpp>
10+
11+
using namespace boost::multiprecision;
12+
using namespace boost::math;
13+
14+
template <typename FLOAT_T>
15+
const mpfr_float_1000 mach_eps = std::numeric_limits<FLOAT_T>::epsilon();
16+
17+
const mpfr_float_1000 mach_eps_flt = mach_eps<float>;
18+
const mpfr_float_1000 mach_eps_dbl = mach_eps<double>;
19+
20+
const mpfr_float_1000 mach_del_flt = pow(2, std::numeric_limits<float>::min_exponent - 1)*mach_eps<float>;
21+
const mpfr_float_1000 mach_del_dbl = pow(2, std::numeric_limits<double>::min_exponent - 1)*mach_eps<double>;
22+
23+
mpfr_float_1000 dot_e(double mag, long long int n, mpfr_float_1000 e);
24+
25+
#endif

0 commit comments

Comments
 (0)