    "  <distr> is the distribution to use. Choices are:\n"              \
    "    \t runif[0,1] runif[-1,1] runif[-1000,1000] rsubn\n"           \
    "  <topology> is a string for logging, best used with SimGrid\n"    \
-   "  <algorithm> is a stirng for logging, best used with SimGrid\n")
+   "  <algorithm> is a string for logging, best used with SimGrid\n")

#include <cstdio>
#include <string>
#include <mpi.h>
#include <stdlib.h>
#include <stdbool.h>
#include <boost/multiprecision/mpfr.hpp>
+#include <boost/multiprecision/number.hpp>

-#include "rand.hxx"
#include "assoc.hxx"
+#include "error_semantics.hxx"
#include "mpi_op.hxx"
+#include "rand.hxx"
#include "util.hxx"

#define FLOAT_T double
using namespace boost::multiprecision;
+using namespace boost::math;

/* Note: it would be more robust to use ACCUMULATOR().operator()(a,b) instead
 * of a ACC_OP b, but this doesn't work for mpfr values */
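/* Aside (illustration only, not part of the patch): why the code uses
 * `a ACC_OP b` rather than ACCUMULATOR().operator()(a,b). std::plus<FLOAT_T>
 * takes both operands as FLOAT_T (double), and boost::multiprecision numbers
 * convert to double only explicitly, so the functor form does not compile for
 * an mpfr accumulator, while the plain operator mixes types at full precision. */
#include <functional>
#include <boost/multiprecision/mpfr.hpp>

void functor_vs_operator_sketch()
{
    boost::multiprecision::mpfr_float_1000 acc = 0;
    double x = 0.1;
    acc = acc + x;                         // OK: mpfr operator+ keeps precision
    // acc = std::plus<double>()(acc, x);  // ill-formed: no implicit mpfr->double
}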
@@ -48,21 +51,28 @@ using namespace boost::multiprecision;

const bool is_sum = std::is_same<std::plus<FLOAT_T>, ACCUMULATOR>::value;
const bool is_prod = std::is_same<std::multiplies<FLOAT_T>, ACCUMULATOR>::value;
-mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len);
+/* Fill in random vectors and do dot product according to MPI canonical ordering */
+FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum);
+/* Left-associative dot product */
+FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b);
+/* Left-associative dot product with MPFR accumulator */
+mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len);

int main(int argc, char* argv[])
{
    int taskid, numtasks;
-   long i, j, chunk, rc = 0;
+   long i, chunk, rc = 0;
    long long len, height;
    MPI_Op nc_sum_op;
    std::string distr, topo, algo;
    FLOAT_T *a, *b, *as, *bs, *rank_sum;
-   FLOAT_T mysum, nc_sum, par_sum, can_mpi_sum, rand_sum;
-   FLOAT_T starttime, endtime, ptime;
+   FLOAT_T localsum, nc_sum, par_sum, can_mpi_sum, rand_sum, serial_sum;
+   FLOAT_T starttime, endtime, ptime, ctime, stime, mpfrtime, randtreetime;
    FLOAT_T (*rand_flt_a)(); // Function to generate a random float
    FLOAT_T (*rand_flt_b)(); // Function to generate a random float
    mpfr_float_1000 mpfr_acc;
+   mpfr_float_1000 error, result;
+   FLOAT_T magnitude = 0.0;
    union udouble {
        double d;
        unsigned long u;
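/* Aside (illustration only, not part of the patch): the udouble union declared
 * here is what lets the prints below emit each sum's raw bit pattern (the
 * 0x%lx column), assuming unsigned long is 64 bits as the format implies. */
#include <cstdio>

void print_bits_sketch(double d)
{
    union udouble { double d; unsigned long u; } pv;
    pv.d = d;
    printf("%.15f\t%a\t0x%lx\n", pv.d, pv.d, pv.u); // decimal, hex-float, raw bits
}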
@@ -94,8 +104,8 @@ int main(int argc, char* argv[])
    }
    /* Select distribution for random floating point numbers */
    distr = argv[2];
-   rc = parse_distr<FLOAT_T>(distr, &rand_flt_a)
-       || parse_distr<FLOAT_T>(distr, &rand_flt_b);
+   rc = parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_a)
+       || parse_distr<FLOAT_T>(distr, &magnitude, &rand_flt_b);
    if (rc != 0) {
        if (taskid == 0) {
            fprintf(stderr, "Unrecognized distribution:\n%s", USAGE);
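/* Aside (hypothetical sketch, not part of the patch): rand.hxx is not shown in
 * this diff, so this is only an assumption about parse_distr's shape. It maps
 * the <distr> string to a generator and, after this change, also reports the
 * distribution's magnitude bound, which dot_e consumes later for the error bound. */
#include <cstdlib>
#include <string>

static double runif01_sketch() { return std::rand() / (double)RAND_MAX; }

int parse_distr_sketch(const std::string& s, double* magnitude, double (**gen)())
{
    if (s == "runif[0,1]") { *gen = runif01_sketch; *magnitude = 1.0; return 0; }
    return 1; // unrecognized distribution -> nonzero rc, as main() expects
}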
@@ -117,11 +127,13 @@ int main(int argc, char* argv[])

    /* Assign storage for dot product vectors
     * We do extra here for simplicity and so rank 0 has enough room */
+   // XXX: Do not need to malloc as and bs on each MPI process, just on rank 0.
+   // This is a bit trickier to make sure the rand is consistent though.
    a  = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    b  = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    as = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
    bs = (FLOAT_T*) malloc(len*sizeof(FLOAT_T));
-   rank_sum = (FLOAT_T *) malloc(numtasks*sizeof(FLOAT_T));
+   rank_sum = (FLOAT_T*) malloc(numtasks*sizeof(FLOAT_T));

    /* Initialize dot product vectors */
    chunk = len/numtasks;
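/* Aside (illustration only, not part of the patch): chunk = len/numtasks
 * truncates, so when numtasks does not divide len the trailing len % numtasks
 * elements end up in no rank's partial sum; e.g. len = 10 on 4 ranks covers
 * only 8 elements. */
#include <cstdio>

void chunk_coverage_sketch(long long len, int numtasks)
{
    long chunk = len / numtasks;
    printf("covered %lld of %lld elements\n", (long long)chunk * numtasks, len);
}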
@@ -132,74 +144,81 @@ int main(int argc, char* argv[])
        b[i] = rand_flt_b();
    }

-   /* Perform the dot product */
+   /* Perform the dot product in parallel */
    starttime = MPI_Wtime();
-   mysum = 0.0;
+   localsum = 0.0;
    for (i = chunk*taskid; i < chunk*taskid + chunk; i++) {
-       mysum += a[i] * b[i];
+       localsum += a[i] * b[i];
    }

    /* After the dot product, perform a summation of results on each node */
-   MPI_Reduce(&mysum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-   MPI_Reduce(&mysum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
+   MPI_Reduce(&localsum, &par_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    endtime = MPI_Wtime();
+   MPI_Reduce(&localsum, &nc_sum, 1, MPI_DOUBLE, nc_sum_op, 0, MPI_COMM_WORLD);
    ptime = endtime - starttime;

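/* Aside (illustration only, not part of the patch): the reason for comparing
 * MPI_SUM against the rank-ordered nc_sum_op at all is that floating-point
 * addition is not associative, so the reduction tree's shape changes the bits: */
#include <cstdio>

void nonassociativity_sketch()
{
    double big = 1e16, small = 1.0;
    double left  = (big + small) + small; // each small is absorbed: 1e16
    double right = big + (small + small); // the 2.0 survives: 1e16 + 2
    printf("%.17g vs %.17g (equal: %d)\n", left, right, left == right);
}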
-   /* Now, task 0 does all the work to check. The canonical ordering
-    * is increasing taskid */
+   /* Now, task 0 does all the work to check. The canonical ordering is increasing taskid */
    set_seed(ASSOC_SEED, 0);
    srand(ASSOC_SEED);
    if (taskid == 0) {
-       mysum = 0.0;
-       for (i = 0; i < numtasks; i++) {
-           rank_sum[i] = 0.0;
-           for (j = chunk*i; j < chunk * i + chunk; j++) {
-               as[j] = rand_flt_a();
-               bs[j] = rand_flt_b();
-               /* // Debug
-               if (as[j] != a[j] || bs[j] != b[j]) {
-                   fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
-                           as[j], a[j], bs[j], b[j]);
-               }
-               */
-               rank_sum[i] += as[j] * bs[j];
-               mysum += as[j] * bs[j];
-           }
-       }
-       can_mpi_sum = 0.0;
-       for (i = 0; i < numtasks; i++) {
-           can_mpi_sum += rank_sum[i];
-       }
+       // Do the canonical MPI dot product summation
+       starttime = MPI_Wtime();
+       can_mpi_sum = can_mpi_dot(numtasks, len, as, bs, rand_flt_a, rand_flt_b, rank_sum);
+       endtime = MPI_Wtime();
+       ctime = endtime - starttime;
+
+       // Do the serial sum
+       starttime = MPI_Wtime();
+       serial_sum = dot(len, a, b);
+       endtime = MPI_Wtime();
+       stime = endtime - starttime;

-       // Generate a random summation
+       // Generate a random dot product on the MPI ranks
+       starttime = MPI_Wtime();
        rand_sum = associative_accumulate_rand<FLOAT_T>(numtasks, rank_sum, is_sum, &height);
+       endtime = MPI_Wtime();
+       randtreetime = endtime - starttime;
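/* Aside (hypothetical sketch, not part of the patch): assoc.hxx is not shown
 * here, so this is an assumption about associative_accumulate_rand. It
 * plausibly sums the per-rank partials under a randomly chosen association
 * order and reports the height of the binary tree it happened to build: */
#include <cstdlib>
#include <algorithm>
#include <vector>

double accumulate_rand_sketch(std::vector<double> v, long long* height)
{
    if (v.empty()) { *height = 0; return 0.0; }
    std::vector<long long> h(v.size(), 0);       // height of each subtree
    while (v.size() > 1) {
        size_t i = std::rand() % (v.size() - 1); // pick a random adjacent pair
        v[i] = v[i] + v[i + 1];                  // combine the pair
        h[i] = 1 + std::max(h[i], h[i + 1]);     // new subtree height
        v.erase(v.begin() + i + 1);
        h.erase(h.begin() + i + 1);
    }
    *height = h[0];
    return v[0];
}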

-       // MPFR
+       // MPFR dot product
+       starttime = MPI_Wtime();
        mpfr_acc = mpfr_dot(as, bs, len);
+       endtime = MPI_Wtime();
+       mpfrtime = endtime - starttime;
+
+       // Error analysis
+       error = dot_e(magnitude, len, 0.0);
+
+       // TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
+       // TODO: Add in timings for MPFR and serial summations.
193
180
194
// Print header then different dot products
181
195
printf (" numtasks\t veclen\t topology\t distribution\t reduction algorithm\t order\t height\t time\t FP (decimal)\t FP (%%a)\t FP (hex)\n " );
182
- pv.d = mysum ;
196
+ pv.d = serial_sum ;
183
197
printf (" %d\t %lld\t %s\t %s\t %s\t Left assoc\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
184
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), len-1 , nan ( " " ), mysum, mysum , pv.u );
198
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), len-1 , stime, serial_sum, serial_sum , pv.u );
185
199
pv.d = rand_sum;
186
200
printf (" %d\t %lld\t %s\t %s\t %s\t Random assoc\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
187
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), height, ptime, rand_sum, rand_sum, pv.u );
188
- // TODO: Figure out the height of MPI Reduce and MPI noncommutative sum, and canonical MPI sum
189
- // TODO: Add in timings for MPFR and serial summations.
201
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), height, randtreetime, rand_sum, rand_sum, pv.u );
190
202
pv.d = par_sum;
191
203
printf (" %d\t %lld\t %s\t %s\t %s\t MPI Reduce\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
192
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), 0LL , ptime, par_sum, par_sum, pv.u );
204
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), ( long long ) ceil ( log2 (numtasks)) , ptime, par_sum, par_sum, pv.u );
193
205
pv.d = nc_sum;
194
206
printf (" %d\t %lld\t %s\t %s\t %s\t MPI noncomm sum\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
195
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), 0LL , ptime, nc_sum, nc_sum, pv.u );
207
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), ( long long ) numtasks- 1 , ptime, nc_sum, nc_sum, pv.u );
196
208
pv.d = can_mpi_sum;
197
209
printf (" %d\t %lld\t %s\t %s\t %s\t Canonical MPI\t %lld\t %f\t %.15f\t %a\t 0x%lx\n " ,
198
- numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), (long long ) numtasks-1 , nan ( " " ) , can_mpi_sum, can_mpi_sum, pv.u );
210
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (), (long long ) numtasks-1 , ctime , can_mpi_sum, can_mpi_sum, pv.u );
199
211
mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t MPFR(%d) left assoc\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
200
212
numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
201
213
std::numeric_limits<mpfr_float_1000>::digits, // Precision of MPFR
202
- len-1 , nan (" " ), mpfr_acc, mpfr_acc, mpfr_acc);
214
+ len - 1 , mpfrtime, mpfr_acc, mpfr_acc, mpfr_acc);
215
+ mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t Predicted error\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
216
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
217
+ len-1 , nan (" " ), error, error, error);
218
+ result = serial_sum - mpfr_acc;
219
+ mpfr_printf (" %d\t %lld\t %s\t %s\t %s\t Left assoc error\t %lld\t %f\t %.20RNf\t %.20RNa\t %RNa\n " ,
220
+ numtasks, len, topo.c_str (), distr.c_str (), algo.c_str (),
221
+ len - 1 , nan (" " ), result, result, result);
203
222
}
204
223
205
224
free (a);
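/* Aside (illustration only, not part of the patch): the new height column.
 * A left-associated sum of k terms is a chain of height k-1 (hence len-1
 * serially and numtasks-1 for the rank-ordered sums), while
 * ceil(log2(numtasks)) assumes MPI_Reduce combines partials in a balanced
 * binary tree, a common implementation choice not mandated by the standard. */
#include <cmath>
#include <cstdio>

void height_sketch(int numtasks)
{
    long long chain = numtasks - 1;                     // rank-ordered sum
    long long tree  = (long long) ceil(log2(numtasks)); // balanced reduction tree
    printf("P=%d: chain height=%lld, tree height=%lld\n", numtasks, chain, tree);
}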
@@ -214,12 +233,48 @@ int main(int argc, char* argv[])
    return rc;
}

-mpfr_float_1000 mpfr_dot(FLOAT_T *as, FLOAT_T *bs, long long len)
+FLOAT_T can_mpi_dot(int numtasks, long long len, FLOAT_T* as, FLOAT_T* bs, FLOAT_T (*rand_a)(), FLOAT_T (*rand_b)(), FLOAT_T *rank_sum)
+{
+    int i, j;
+    int chunk = len/numtasks;
+    FLOAT_T can_mpi_sum = 0.0;
+    FLOAT_T localsum = 0.0;
+    for (i = 0; i < numtasks; i++) {
+        rank_sum[i] = 0.0;
+        for (j = chunk*i; j < chunk * i + chunk; j++) {
+            as[j] = rand_a();
+            bs[j] = rand_b();
+            /* // Debug
+            if (as[j] != a[j] || bs[j] != b[j]) {
+                fprintf(stderr, "Results differ: (%a != %a, %a != %a)\n",
+                        as[j], a[j], bs[j], b[j]);
+            }
+            */
+            rank_sum[i] += as[j] * bs[j];
+            localsum += as[j] * bs[j];
+        }
+    }
+    for (i = 0; i < numtasks; i++) {
+        can_mpi_sum += rank_sum[i];
+    }
+    return (can_mpi_sum);
+}
+
+FLOAT_T dot(long long len, FLOAT_T* a, FLOAT_T* b)
+{
+    FLOAT_T acc = 0.0;
+    for (long long i = 0; i < len; i++) {
+        acc = acc + a[i] * b[i];
+    }
+    return acc;
+}
+
+mpfr_float_1000 mpfr_dot(FLOAT_T *a, FLOAT_T *b, long long len)
{
    mpfr_float_1000 acc;
    acc = 0.0;
    for (long long i = 0; i < len; i++) {
-       acc = acc + as[i] * bs[i];
+       acc = acc + a[i] * b[i];
    }
    return (acc);
}
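/* Aside (illustration only, not part of the patch): how the helpers factored
 * out above compose. Assuming the dot and mpfr_dot defined in this file, the
 * signed rounding error of the plain double dot product falls out directly,
 * which is what the new "Left assoc error" row prints: */
#include <iomanip>
#include <iostream>
#include <boost/multiprecision/mpfr.hpp>

double dot(long long len, double* a, double* b);                                      // defined above
boost::multiprecision::mpfr_float_1000 mpfr_dot(double* a, double* b, long long len); // defined above

void report_error_sketch(double* a, double* b, long long len)
{
    double d = dot(len, a, b);                                        // left-associative double sum
    boost::multiprecision::mpfr_float_1000 ref = mpfr_dot(a, b, len); // high-precision reference
    boost::multiprecision::mpfr_float_1000 err = d - ref;             // signed rounding error
    std::cout << "error = " << std::setprecision(20) << err << "\n";
}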