/* fft_par.c */
#define _XOPEN_SOURCE 500
#include <alloca.h>
#include <complex.h>
#include <math.h>
#include <stdlib.h>
#include <fftw3.h>
#include "fft_par.h"
#include "fft_ser.h"
struct fft_par_plan_s {
void *src;
void *srcbuffer;
void *dstbuffer;
void *dst;
/* Communication Displacements */
int *cycle_senddispls;
int *cycle_recvdispls;
int *swizzle_senddispls;
int *swizzle_recvdispls;
/* Communication Types */
MPI_Datatype *cycle_sendtypes;
MPI_Datatype *cycle_recvtypes;
MPI_Datatype *swizzle_sendtypes;
MPI_Datatype *swizzle_recvtypes;
/* Persistent communication requests for asynchronous communication */
MPI_Request *swizzle_sendreqs_fwd;
MPI_Request *swizzle_recvreqs_fwd;
MPI_Request *swizzle_sendreqs_rev;
MPI_Request *swizzle_recvreqs_rev;
/* FFT Plans */
fftw_plan innerplan_fwd;
fftw_plan outerplan_fwd;
fftw_plan innerplan_rev;
fftw_plan outerplan_rev;
int *ones;
int *map; /* [procs][ndims] location of each processor */
int *outerjobs; /* [procs][ndims] Number of outer jobs on each processor */
int *nelems; /* [ndims] Number of elements in each direction */
int *nprocs; /* [ndims] Number of processors in each direction */
int ndims; /* How many dimensions there are */
int veclen; /* The length of each element (parallel transforms) */
int procid;
MPI_Comm comm;
};
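/* Rough data flow of the forward transform (mirrors the stages in
 * fft_par_execute_fwd below):
 *   src --cycle Alltoallw--> srcbuffer --inner r2c FFT--> dstbuffer
 *       --swizzle sends/recvs--> srcbuffer --twiddle factors, outer FFT-->
 *       dstbuffer --reverse swizzle Alltoallw--> dst
 * The reverse transform walks the same pipeline in the opposite order.
 */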
static inline int max(int a, int b)
{
return (a > b ? a : b);
}
//! Compute the send and receive offsets and types for a cyclic permutation.
/*! Compute the send and receive offsets and types for a given processor which
* will be used by <tt>MPI_Alltoallw</tt> to perform a cyclic data permutation.
 * The comm, map, nprocs, ndims, nelems, and veclen members of the plan should
 * be set to the values for the communication.
*
 * \param plan The plan to write the cycle_{send,recv}{displs,types} members
* of. This function makes no attempt to avoid overwriting these values in case
* of failure.
* \return The MPI return code
*/
int cycle_data_parameter_fill(fft_par_plan plan);
//! Compute the send and receive offsets and types for swizzling data.
/*! Compute the send and receive offsets and types for a given processor which
* will be used by <tt>MPI_Alltoallw</tt> to perform second stage data
 * swizzling. The comm, map, nprocs, ndims, nelems, veclen, and outerjobs
 * members of the plan should be set to the values for the communication;
 * this routine allocates the swizzle_{send,recv}{displs,types} arrays itself.
*
 * \param plan The plan to write the swizzle_{send,recv}{displs,types} members of.
* \return The MPI return code
*/
int swizzle_data_parameter_fill(fft_par_plan plan);
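/* Both parameter-fill routines target the same MPI_Alltoallw call shape:
 * every count is 1 (the plan's ones array), the per-rank datatypes encode the
 * strided element selection, and the displacements are byte offsets into the
 * buffers. Illustrative sketch of such a call, matching the one made later in
 * fft_par_execute_fwd:
 *
 *   MPI_Alltoallw(plan->src, plan->ones, plan->cycle_senddispls,
 *       plan->cycle_sendtypes, plan->srcbuffer, plan->ones,
 *       plan->cycle_recvdispls, plan->cycle_recvtypes, plan->comm);
 */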
//! Initialize the persistent send/recv requests for second stage communication
/*! Initialize the persistent send and receive requests for asynchronous second
 * stage communication. Assumes that the swizzle_{send,recv}reqs_{fwd,rev}
 * buffers of the plan have been allocated and that the swizzle types and
 * displacements have already been filled in.
*
* \param plan The plan in which the swizzle_{send,recv}reqs members will be
* written.
* \return The MPI return code
*/
int fft_par_init_swizzle_reqs(fft_par_plan plan);
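/* The persistent requests built here follow the standard MPI lifecycle; as a
 * sketch (not tied to any particular plan member):
 *
 *   MPI_Recv_init(buf, 1, type, peer, tag, comm, &req);  // create once
 *   MPI_Start(&req);                                     // each transform
 *   MPI_Wait(&req, MPI_STATUS_IGNORE);                   // each transform
 *   MPI_Request_free(&req);                              // at plan destroy
 *
 * The execute routines below drive whole arrays of these requests with
 * MPI_Startall/MPI_Waitall (and MPI_Start/MPI_Waitany where they overlap
 * communication with the twiddle computation).
 */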
//! Apply the twiddle factors to the intermediate data within a plan.
/*! In order to compose two smaller-length Fourier transforms into a Fourier
 * transform of a larger composite data set, we need to multiply twiddle
 * factors onto the result of the inner Fourier transform before performing
 * the outer Fourier transform. This routine multiplies the data in the
 * srcbuffer array by the appropriate complex values.
 *
 * The process proceeds independently along each dimension, and operates on
 * the data coming from a single specified processor.
 *
 * \param plan The plan owning the srcbuffer array to apply twiddle factors to.
 * \param p The elements coming from processor p have the twiddle factors
 * applied.
 * \param dir Whether to apply the forward or the reverse (conjugate) twiddle
 * factors.
 * \return the MPI error code, if any
 */
typedef enum {
TWIDDLE_FORWARD,
TWIDDLE_REVERSE
} twiddle_dir;
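/* Per dimension d this is the standard Cooley-Tukey splitting of a transform
 * of length N = elems[d]*procs[d] into inner transforms of length elems[d]
 * and outer transforms of length procs[d]. Writing the input index as
 * j = procs[d]*j1 + j2 and the output index as k = k1 + elems[d]*k2,
 *
 *   X[k1 + elems[d]*k2] =
 *     sum_{j2} e^{-2 pi i j2 k1 / N} * e^{-2 pi i j2 k2 / procs[d]}
 *       * ( sum_{j1} x[procs[d]*j1 + j2] * e^{-2 pi i j1 k1 / elems[d]} )
 *
 * The factor e^{-2 pi i j2 k1 / N} is the twiddle factor applied below, with
 * j2 = pidx[d] and k1 = wu[d]*procs[d] + loc[d]; the reverse direction uses
 * the conjugate. (This is a reading of the code, offered as a sketch rather
 * than taken from the original comments.)
 */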
int apply_twiddle_factors(fft_par_plan plan, int p, twiddle_dir dir)
{
int procid;
int err = MPI_Comm_rank(plan->comm, &procid);
if(err != MPI_SUCCESS) goto fail_immed;
const int dims = plan->ndims;
const int *const procs = plan->nprocs;
const int *const elems = plan->nelems;
const int *const ojobs = plan->outerjobs + procid*dims;
const int *const loc = plan->map + procid*dims;
const int vl = plan->veclen;
/* Multi-index over the work unit */
int *wu = alloca(dims*sizeof(int));
/* Length of a work unit */
int wulen = vl;
for(int d = 0; d < dims; ++d) { wulen *= procs[d]; }
/* Zero the multi-index */
for(int d = 0; d < dims; ++d) { wu[d] = 0; }
/* The processor index */
const int *const pidx = plan->map + p*dims;
/* The displacement of this processor's element within a workunit (in
* multiples of veclen)
*/
int displ = 0;
for(int d = 0; d < dims; ++d) {
displ = displ*procs[d] + pidx[d];
}
/* The current target element we are applying the twiddle factor to */
double complex *tgt = (double complex *)plan->srcbuffer + displ*vl;
/* Root of unity in each direction */
double complex *w0 = alloca(dims*sizeof(complex double));
switch(dir) {
case TWIDDLE_FORWARD:
for(int d = 0; d < dims; ++d) {
w0[d] = cexp(-2*M_PI*I/elems[d]/procs[d]);
}
break;
case TWIDDLE_REVERSE:
default:
for(int d = 0; d < dims; ++d) {
w0[d] = cexp(2*M_PI*I/elems[d]/procs[d]);
}
break;
}
/* The factor to multiply the twiddle factor by on each iteration: the ratio
 * of twiddle factors between successive work units.
 */
double complex *wprod = alloca(dims*sizeof(double complex));
for(int d = 0; d < dims; ++d) {
wprod[d] = cpow(w0[d], pidx[d]*procs[d]);
}
/* The current twiddle factor */
double complex *w = alloca(dims*sizeof(double complex));
for(int d = 0; d < dims; ++d) {
w[d] = cpow(w0[d], pidx[d]*loc[d]);
}
int cont = 1; /* Continue iteration over work units */
for(int d = 0; d < dims; ++d) { cont = cont && wu[d] < ojobs[d]; }
while(cont) {
/* Apply twiddle factors */
for(int j = 0; j < vl; ++j) {
for(int d = 0; d < dims; ++d) {
/* We essentially want to perform this operation
* tgt[j] *= cpow(w0[d], pidx[d]*(wu[d]*procs[d] + loc[d]));
*/
tgt[j] *= w[d];
}
}
/* Increment the work unit's outermost indices, and spill backwards.
* Increment the twiddle factor as we go
*/
++wu[dims - 1];
w[dims - 1] *= wprod[dims - 1];
for(int d = dims - 1; d >= 1; --d) {
if(wu[d] >= ojobs[d]) {
wu[d] = 0;
/* Reset twiddle factor in this dimension */
w[d] = cpow(w0[d], pidx[d]*loc[d]);
++wu[d-1];
w[d-1] *= wprod[d-1];
}
}
cont = wu[0] < ojobs[0];
/* Increment the target */
tgt += wulen;
}
fail_immed:
return err;
}
fft_par_plan fft_par_plan_r2c_nc(MPI_Comm comm, int ndims, const int *proc_dim,
const int vl, const int *map, const int *size, double *src,
complex double *dst, int *errloc);
fft_par_plan fft_par_plan_r2c_1d(MPI_Comm comm, int const nelems, int const vl,
double * const src, double complex * const dst, int * const errloc)
{
fft_par_plan plan = NULL;
int size;
int err = MPI_Comm_size(comm, &size);
if(err != MPI_SUCCESS) goto fail_immed;
/* Create a trivial map for the one dimensional case */
int *const map = malloc(size*sizeof(int));
for(int i = 0; i < size; ++i) { map[i] = i; }
plan = fft_par_plan_r2c_nc(comm, 1, &size, vl, map, &nelems,
src, dst, &err);
free(map);
fail_immed:
if(errloc != NULL) *errloc = err;
return plan;
}
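/* Example use of the 1-D interface (a sketch: it assumes each rank supplies
 * nelems*vl doubles in src and room for nelems*vl complex values in dst,
 * which is how the plan appears to use these buffers, and that fft_par.h
 * declares the public entry points used here):
 *
 *   #include <mpi.h>
 *   #include <complex.h>
 *   #include "fft_par.h"
 *
 *   int main(int argc, char **argv)
 *   {
 *     MPI_Init(&argc, &argv);
 *     const int nelems = 64, vl = 1;
 *     double src[64];
 *     double complex dst[64];
 *     for(int i = 0; i < nelems; ++i) src[i] = i;  // any real data
 *     int err;
 *     fft_par_plan plan = fft_par_plan_r2c_1d(MPI_COMM_WORLD, nelems, vl,
 *         src, dst, &err);
 *     if(plan != NULL && err == MPI_SUCCESS) {
 *       err = fft_par_execute_fwd(plan);  // dst holds this rank's share of
 *                                         // the spectrum afterwards
 *       fft_par_plan_destroy(plan);
 *     }
 *     MPI_Finalize();
 *     return 0;
 *   }
 */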
fft_par_plan fft_par_plan_r2c(MPI_Comm comm, const int *const size,
const int vl, double *const src, double complex *const dst,
int *const errloc)
{
fft_par_plan plan = NULL;
int ndims;
int err = MPI_Cartdim_get(comm, &ndims);
if(err != MPI_SUCCESS) {
goto fail_immed;
}
int *proc_dim = malloc(ndims*sizeof(int));
if(proc_dim == NULL) goto fail_immed;
int *period = malloc(ndims*sizeof(int));
if(period == NULL) goto fail_free_proc_dim;
int *loc = malloc(ndims*sizeof(int));
if(loc == NULL) goto fail_free_period;
err = MPI_Cart_get(comm, ndims, proc_dim, period, loc);
if(err != MPI_SUCCESS) goto fail_free_loc;
int procs;
err = MPI_Comm_size(comm, &procs);
if(err != MPI_SUCCESS) goto fail_free_loc;
int *map = malloc(sizeof(int)*ndims*procs);
if(map == NULL) goto fail_free_loc;
/* Fill in the processor map */
for(int p = 0; p < procs; ++p) {
err = MPI_Cart_coords(comm, p, ndims, map + p*ndims);
if(err != MPI_SUCCESS) goto fail_free_map;
}
// Call this helper function to do the actual initialization; it does not
// assume Cartesian topology information is attached to the communicator.
plan = fft_par_plan_r2c_nc(comm, ndims, proc_dim, vl, map, size,
src, dst, &err);
/* Resources are freed below if this function fails */
fail_free_map:
free(map);
fail_free_loc:
free(loc);
fail_free_period:
free(period);
fail_free_proc_dim:
free(proc_dim);
fail_immed:
if(errloc != NULL) *errloc = err;
return plan;
}
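/* fft_par_plan_r2c requires a communicator with a Cartesian topology attached
 * (it calls MPI_Cartdim_get and MPI_Cart_get). A sketch of setting one up for
 * a 2-D decomposition; the dims, sizes, and vl below are illustrative, and
 * the per-process interpretation of size[] follows the comments in
 * fft_par_plan_r2c_nc below:
 *
 *   int dims[2] = {0, 0}, periods[2] = {0, 0}, nprocs, err;
 *   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 *   MPI_Dims_create(nprocs, 2, dims);
 *   MPI_Comm cart;
 *   MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);
 *   int size[2] = {32, 32};   // elements per process in each direction
 *   const int vl = 1;
 *   // src and dst are per-rank buffers, sized as in the 1-D example above
 *   fft_par_plan plan = fft_par_plan_r2c(cart, size, vl, src, dst, &err);
 */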
fft_par_plan fft_par_plan_r2c_nc(MPI_Comm comm, const int ndims,
const int *const proc_dim, const int vl, const int *const map, const int
*const size, double * const src, double complex * const dst, int *errloc)
{
int rank;
int err = MPI_Comm_rank(comm, &rank);
if(err != MPI_SUCCESS) goto fail_immed;
fft_par_plan plan = malloc(sizeof(struct fft_par_plan_s));
if(plan == NULL) goto fail_immed;
plan->comm = comm;
plan->ndims = ndims;
plan->src = src;
plan->dst = dst;
plan->procid = rank;
plan->veclen = vl;
/* The total number of data elements on each processor */
int netlen = 1; for(int i = 0; i < ndims; ++i) netlen *= size[i];
/* The total number of processors on the grid */
int nprocs = 1; for(int i = 0; i < ndims; ++i) nprocs *= proc_dim[i];
/* Compute the number of second stage work units in each direction for each
* processor.
*/
plan->outerjobs = malloc(nprocs*ndims*sizeof(int));
if(plan->outerjobs == NULL) goto fail_free_plan;
for(int p = 0; p < nprocs; ++p) {
const int * const p_loc = map + p*ndims; // The coordinates of processor p
for(int d = 0; d < ndims; ++d) {
*(plan->outerjobs + p*ndims + d) = size[d]/proc_dim[d];
if(p_loc[d] < size[d] % proc_dim[d]) {
++*(plan->outerjobs + p*ndims + d);
}
}
}
/* For convenience, cache the total number of outer jobs for this processor */
int ojobs = 1;
for(int d = 0; d < ndims; ++d) ojobs *= *(plan->outerjobs + rank*ndims + d);
/* Copy the processor map */
plan->map = malloc(nprocs*ndims*sizeof(int));
if(plan->map == NULL) goto fail_free_outerjobs;
/* Copy the map element by element; memcpy would also work here (its source
 * parameter is const void *), but a plain loop keeps the intent obvious.
 */
for(int i = 0; i < nprocs*ndims; ++i) plan->map[i] = map[i];
/* Copy the number of elements in each direction */
plan->nelems = malloc(ndims*sizeof(int));
if(plan->nelems == NULL) goto fail_free_map;
for(int d = 0; d < ndims; ++d) plan->nelems[d] = size[d];
/* Copy the number of processors in each direction */
plan->nprocs = malloc(ndims*sizeof(int));
if(plan->nprocs == NULL) goto fail_free_nelems;
for(int d = 0; d < ndims; ++d) plan->nprocs[d] = proc_dim[d];
/* Allocate the storage arrays */
const int maxlen = max(netlen, ojobs*nprocs);
plan->srcbuffer = fftw_malloc(vl*maxlen*sizeof(double complex));
if(plan->srcbuffer == NULL) goto fail_free_nprocs;
plan->dstbuffer = fftw_malloc(vl*maxlen*sizeof(double complex));
if(plan->dstbuffer == NULL) goto fail_free_srcbuffer;
/* Allocate an array of ones to use in MPI_Alltoallw as the send count. The
* types encode all the information.
*/
plan->ones = malloc(nprocs*sizeof(int));
if(plan->ones == NULL) goto fail_free_dstbuffer;
for(int i = 0; i < nprocs; ++i) plan->ones[i] = 1;
/* Array of MPI requests to be used in send/recv operations for the second
* stage communication
*/
plan->swizzle_sendreqs_fwd = malloc(nprocs*sizeof(MPI_Request));
if(plan->swizzle_sendreqs_fwd == NULL) goto fail_free_ones;
plan->swizzle_recvreqs_fwd = malloc(nprocs*sizeof(MPI_Request));
if(plan->swizzle_recvreqs_fwd == NULL) goto fail_free_swizzle_sendreqs_fwd;
plan->swizzle_sendreqs_rev = malloc(nprocs*sizeof(MPI_Request));
if(plan->swizzle_sendreqs_rev == NULL) goto fail_free_swizzle_recvreqs_fwd;
plan->swizzle_recvreqs_rev = malloc(nprocs*sizeof(MPI_Request));
if(plan->swizzle_recvreqs_rev == NULL) goto fail_free_swizzle_sendreqs_rev;
/* Initialize the cyclic communication types */
err = cycle_data_parameter_fill(plan);
if(err != MPI_SUCCESS) goto fail_free_swizzle_recvreqs_rev;
/* Initialize the swizzle communication types */
err = swizzle_data_parameter_fill(plan);
if(err != MPI_SUCCESS) goto fail_free_cycle_params;
plan->innerplan_fwd = fftw_plan_many_dft_r2c(ndims, size, vl,
plan->srcbuffer, size, vl, 1, plan->dstbuffer, size, vl, 1,
FFTW_ESTIMATE);
if(plan->innerplan_fwd == NULL) goto fail_free_swizzle_params;
plan->innerplan_rev = fftw_plan_many_dft_c2r(ndims, size, vl,
plan->dstbuffer, size, vl, 1, plan->srcbuffer, size, vl, 1,
FFTW_ESTIMATE);
if(plan->innerplan_rev == NULL) goto fail_free_innerplan_fwd;
/* Initialize the outer plans */
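/* The guru interface gets two loop ("howmany") dimensions -- one over the
 * ojobs work units and one over the veclen vector elements -- plus ndims
 * transform dimensions of length nprocs[d], laid out in row-major order over
 * the processor coordinates with the vector index fastest.
 */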
fftw_iodim howmany[2] = {
{ .n = ojobs, .is = vl*nprocs, .os = vl*nprocs },
{ .n = vl, .is = 1, .os = 1}
};
fftw_iodim *outerdims = malloc(ndims*sizeof(fftw_iodim));
if(outerdims == NULL) goto fail_free_innerplan_rev;
outerdims[ndims - 1].n = plan->nprocs[ndims - 1];
outerdims[ndims - 1].is = vl;
outerdims[ndims - 1].os = vl;
for(int d = ndims - 2; d >= 0; --d) {
outerdims[d].n = plan->nprocs[d];
outerdims[d].is = plan->nprocs[d+1]*outerdims[d+1].is;
outerdims[d].os = plan->nprocs[d+1]*outerdims[d+1].os;
}
plan->outerplan_fwd = fftw_plan_guru_dft(ndims, outerdims, 2, howmany,
plan->srcbuffer, plan->dstbuffer, FFTW_FORWARD, FFTW_ESTIMATE);
if(plan->outerplan_fwd == NULL) goto fail_free_outerdims;
plan->outerplan_rev = fftw_plan_guru_dft(ndims, outerdims, 2, howmany,
plan->dstbuffer, plan->srcbuffer, FFTW_BACKWARD, FFTW_ESTIMATE);
if(plan->outerplan_rev == NULL) goto fail_free_outerplan_fwd;
free(outerdims);
/* Initialize persistent communication requests */
err = fft_par_init_swizzle_reqs(plan);
if(err != MPI_SUCCESS) goto fail_free_outerplan_rev;
if(errloc != NULL) *errloc = err;
return plan;
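/* Everything below is the error-handling chain, reached only through the goto
 * labels. The request-freeing loop immediately below has no label and follows
 * an unconditional return, so it is currently unreachable; it appears to be a
 * leftover stub for cleaning up after a failure later than
 * fft_par_init_swizzle_reqs.
 */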
for(int i = 0; i < nprocs; ++i) {
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_fwd + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_fwd + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_rev + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_rev + i);
}
}
fail_free_outerplan_rev:
fftw_destroy_plan(plan->outerplan_rev);
fail_free_outerplan_fwd:
fftw_destroy_plan(plan->outerplan_fwd);
fail_free_outerdims:
free(outerdims);
fail_free_innerplan_rev:
fftw_destroy_plan(plan->innerplan_rev);
fail_free_innerplan_fwd:
fftw_destroy_plan(plan->innerplan_fwd);
fail_free_swizzle_params:
for(int i = 0; i < nprocs; ++i) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->swizzle_sendtypes + i);
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->swizzle_recvtypes + i);
}
free(plan->swizzle_sendtypes);
free(plan->swizzle_recvtypes);
free(plan->swizzle_senddispls);
free(plan->swizzle_recvdispls);
fail_free_cycle_params:
for(int i = 0; i < nprocs; ++i) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_sendtypes + i);
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_recvtypes + i);
}
free(plan->cycle_sendtypes);
free(plan->cycle_recvtypes);
free(plan->cycle_senddispls);
free(plan->cycle_recvdispls);
fail_free_swizzle_recvreqs_rev:
free(plan->swizzle_recvreqs_rev);
fail_free_swizzle_sendreqs_rev:
free(plan->swizzle_sendreqs_rev);
fail_free_swizzle_recvreqs_fwd:
free(plan->swizzle_recvreqs_fwd);
fail_free_swizzle_sendreqs_fwd:
free(plan->swizzle_sendreqs_fwd);
fail_free_ones:
free(plan->ones);
fail_free_dstbuffer:
fftw_free(plan->dstbuffer);
fail_free_srcbuffer:
fftw_free(plan->srcbuffer);
fail_free_nprocs:
free(plan->nprocs);
fail_free_nelems:
free(plan->nelems);
fail_free_map:
free(plan->map);
fail_free_outerjobs:
free(plan->outerjobs);
fail_free_plan:
free(plan);
fail_immed:
if(errloc != NULL) *errloc = err;
return NULL;
}
int fft_par_plan_destroy(fft_par_plan plan)
{
int err = MPI_SUCCESS;
int procs = 1, ojobs = 1;
for(int d = 0; d < plan->ndims; ++d) {
procs *= plan->nprocs[d];
ojobs *= plan->outerjobs[plan->procid*plan->ndims + d];
}
/* Free up MPI resources */
for(int i = 0; i < procs; ++i) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_sendtypes + i);
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_recvtypes + i);
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->swizzle_sendtypes + i);
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->swizzle_recvtypes + i);
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_fwd + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_fwd + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_rev + i);
}
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_rev + i);
}
}
/* Free up memory buffers */
fftw_destroy_plan(plan->outerplan_rev);
fftw_destroy_plan(plan->innerplan_rev);
fftw_destroy_plan(plan->outerplan_fwd);
fftw_destroy_plan(plan->innerplan_fwd);
free(plan->swizzle_recvdispls);
free(plan->swizzle_recvtypes);
free(plan->swizzle_senddispls);
free(plan->swizzle_sendtypes);
free(plan->cycle_recvdispls);
free(plan->cycle_recvtypes);
free(plan->cycle_senddispls);
free(plan->cycle_sendtypes);
free(plan->swizzle_recvreqs_rev);
free(plan->swizzle_sendreqs_rev);
free(plan->swizzle_recvreqs_fwd);
free(plan->swizzle_sendreqs_fwd);
free(plan->ones);
fftw_free(plan->dstbuffer);
fftw_free(plan->srcbuffer);
free(plan->nprocs);
free(plan->nelems);
free(plan->map);
free(plan->outerjobs);
free(plan);
return err;
}
int fft_par_init_swizzle_reqs(fft_par_plan plan)
{
int err = MPI_SUCCESS;
int procs = 1;
for(int d = 0; d < plan->ndims; ++d) procs *= plan->nprocs[d];
/* Queue up recvs */
int rc = -1; /* Index of the last receive request successfully initialized */
for(int p = 0; p < procs; ++p) {
err = MPI_Recv_init(
(char *)plan->srcbuffer + plan->swizzle_recvdispls[p],
1, plan->swizzle_recvtypes[p], p, 0, plan->comm,
plan->swizzle_recvreqs_fwd + p);
if(err != MPI_SUCCESS) goto fail_cancel_recvreqs;
rc = p;
}
/* Queue up sends */
int sc = -1; /* Index of the last send request successfully initialized. */
for(int p = 0; p < procs; ++p) {
err = MPI_Send_init(
(char *)plan->dstbuffer + plan->swizzle_senddispls[p],
1, plan->swizzle_sendtypes[p], p, 0, plan->comm,
plan->swizzle_sendreqs_fwd + p);
if(err != MPI_SUCCESS) goto fail_cancel_sendreqs;
sc = p;
}
/* Queue up reverse recvs */
int rrc = -1; /* Index of the last reverse receive request successfully initialized */
for(int p = 0; p < procs; ++p) {
err = MPI_Recv_init(
(char *)plan->dstbuffer + plan->swizzle_senddispls[p],
1, plan->swizzle_sendtypes[p], p, 0, plan->comm,
plan->swizzle_recvreqs_rev + p);
if(err != MPI_SUCCESS) goto fail_cancel_revrecvreqs;
rrc = p;
}
/* Queue up reverse sends */
int rsc = -1; /* Index of the last reverse send request successfully initialized. */
for(int p = 0; p < procs; ++p) {
err = MPI_Send_init(
(char *)plan->srcbuffer + plan->swizzle_recvdispls[p],
1, plan->swizzle_recvtypes[p], p, 0, plan->comm,
plan->swizzle_sendreqs_rev + p);
if(err != MPI_SUCCESS) goto fail_cancel_revsendreqs;
rsc = p;
}
/* Success */
return err;
/* Error handling -- this currently does nothing because if we get here, err
* is not MPI_SUCCESS. It's here as a stub for better error handling.
*/
fail_cancel_revsendreqs:
for(int p = rsc; p >= 0; --p) {
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_rev + p);
}
}
fail_cancel_revrecvreqs:
for(int p = rrc; p >= 0; --p) {
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_rev + p);
}
}
fail_cancel_sendreqs:
for(int p = sc; p >= 0; --p) {
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_sendreqs_fwd + p);
}
}
fail_cancel_recvreqs:
for(int p = rc; p >= 0; --p) {
if(err == MPI_SUCCESS) {
err = MPI_Request_free(plan->swizzle_recvreqs_fwd + p);
}
}
return err;
}
int fft_par_execute_fwd(fft_par_plan plan)
{
int err = MPI_SUCCESS;
int procs = 1; for(int d = 0; d < plan->ndims; ++d) procs *= plan->nprocs[d];
// Stage one: cyclic permutation.
err = MPI_Alltoallw(plan->src, plan->ones, plan->cycle_senddispls,
plan->cycle_sendtypes, plan->srcbuffer, plan->ones, plan->cycle_recvdispls,
plan->cycle_recvtypes, plan->comm);
if (err != MPI_SUCCESS) goto fail_immed;
// Stage two: inner fft
fftw_execute(plan->innerplan_fwd);
// Stage 2.5: finish the Fourier transform
fft_r2c_finish_unpacked_vec(plan->dstbuffer, plan->ndims, plan->nelems,
plan->veclen);
// Stage three: swizzle data
/* Actually, we just start the swizzle communication. In stage four, we'll
* wait for a receive to complete and then apply twiddle factors for that
* processor. Before stage 5, we ensure all sends complete.
*/
err = MPI_Startall(procs, plan->swizzle_recvreqs_fwd);
if(err != MPI_SUCCESS) goto fail_immed;
err = MPI_Startall(procs, plan->swizzle_sendreqs_fwd);
if(err != MPI_SUCCESS) goto fail_immed;
//Stage four: twiddle factors
for(int procs_remain = procs; procs_remain > 0; --procs_remain) {
int idx = MPI_UNDEFINED;
err = MPI_Waitany(procs, plan->swizzle_recvreqs_fwd, &idx,
MPI_STATUS_IGNORE);
if(err != MPI_SUCCESS) goto fail_immed;
err = apply_twiddle_factors(plan, idx, TWIDDLE_FORWARD);
if(err != MPI_SUCCESS) goto fail_immed;
}
err = MPI_Waitall(procs, plan->swizzle_sendreqs_fwd, MPI_STATUSES_IGNORE);
if(err != MPI_SUCCESS) goto fail_immed;
// Stage five: outer fft
fftw_execute(plan->outerplan_fwd);
// Stage six: rearrange data back to destination
// This is the reverse of the previous swizzling step
err = MPI_Alltoallw(plan->dstbuffer, plan->ones, plan->swizzle_recvdispls,
plan->swizzle_recvtypes, plan->dst, plan->ones, plan->swizzle_senddispls,
plan->swizzle_sendtypes, plan->comm);
fail_immed:
return err;
}
int fft_par_execute_rev(fft_par_plan plan)
{
int err = MPI_SUCCESS;
int procs = 1; for(int d = 0; d < plan->ndims; ++d) procs *= plan->nprocs[d];
/* We need to do the forward transform, but in reverse */
/* Stage Six: bring the data from the destination array back to the dstbuffer
*/
err = MPI_Alltoallw(plan->dst, plan->ones, plan->swizzle_senddispls,
plan->swizzle_sendtypes, plan->dstbuffer, plan->ones,
plan->swizzle_recvdispls, plan->swizzle_recvtypes, plan->comm);
if(err != MPI_SUCCESS) goto fail_immed;
/* Stage five: outer fft */
fftw_execute(plan->outerplan_rev);
/* Stage four: twiddle factors
* Actually, we kick off the second stage sends as we finish the computations
* in order to hide the computational cost of twiddling in the latency of the
* communication
*/
MPI_Startall(procs, plan->swizzle_recvreqs_rev);
for(int p = 0; p < procs; ++p) {
err = apply_twiddle_factors(plan, p, TWIDDLE_REVERSE);
if(err != MPI_SUCCESS) goto fail_immed;
err = MPI_Start(plan->swizzle_sendreqs_rev + p);
if(err != MPI_SUCCESS) goto fail_immed;
}
/* Stage three: swizzle communication
 * This was already kicked off in the last stage; now wait for it to complete.
 * We also need to wait for the sends to finish before the inner fft clobbers
 * the source buffer (srcbuffer) they are sending from.
*/
err = MPI_Waitall(procs, plan->swizzle_recvreqs_rev, MPI_STATUSES_IGNORE);
if(err != MPI_SUCCESS) goto fail_immed;
err = MPI_Waitall(procs, plan->swizzle_sendreqs_rev, MPI_STATUSES_IGNORE);
if(err != MPI_SUCCESS) goto fail_immed;
/* Stage two: inner fft
* No finishing of the incomplete data is necessary because the data comes
* from a complete complex plan.
*/
fftw_execute(plan->innerplan_rev);
/* Stage one: cyclic permutation */
err = MPI_Alltoallw(plan->srcbuffer, plan->ones, plan->cycle_recvdispls,
plan->cycle_recvtypes, plan->src, plan->ones, plan->cycle_senddispls,
plan->cycle_sendtypes, plan->comm);
fail_immed:
return err;
}
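/* Note on normalization: the inner and outer transforms are plain FFTW plans,
 * which are unnormalized, so a forward transform followed by the reverse one
 * presumably scales each value by the overall transform length (the product
 * over dimensions of elems[d]*procs[d]); callers would need to divide that
 * factor out themselves.
 */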
int cycle_data_parameter_fill(const fft_par_plan plan)
{
/* We only read these fields of plan */
const int *const elems = plan->nelems;
const int *const procs = plan->nprocs;
const int dims = plan->ndims;
const MPI_Comm comm = plan->comm;
const int *const map = plan->map;
MPI_Aint lb, ext;
int err = MPI_Type_get_extent(MPI_DOUBLE, &lb, &ext);
if(err != MPI_SUCCESS) goto fail_immed;
int nprocs;
err = MPI_Comm_size(comm, &nprocs);
if(err != MPI_SUCCESS) goto fail_immed;
int procid;
err = MPI_Comm_rank(comm, &procid);
if(err != MPI_SUCCESS) goto fail_immed;
/* Allocate buffers for the types and offsets to use when sending */
plan->cycle_sendtypes = malloc(nprocs*sizeof(MPI_Datatype));
if(plan->cycle_sendtypes == NULL) goto fail_immed;
plan->cycle_senddispls = malloc(nprocs*sizeof(int));
if(plan->cycle_senddispls == NULL) goto fail_free_cycle_sendtypes;
/* Allocate buffers for the types and offsets to use when receiving */
plan->cycle_recvtypes = malloc(nprocs*sizeof(MPI_Datatype));
if(plan->cycle_recvtypes == NULL) goto fail_free_cycle_senddispls;
plan->cycle_recvdispls = malloc(nprocs*sizeof(int));
if(plan->cycle_recvdispls == NULL) goto fail_free_cycle_recvtypes;
const int *const loc = map + procid*dims;
// Compute offsets to use when sending
for(int r = 0; r < nprocs; ++r) {
const int *const rloc = map + r*dims;
int displ = 0;
// Combine the cyclic-shift offset formula with the row-major indexing
// formula to find the displacement.
for(int d = 0; d < dims; ++d) {
displ = elems[d]*displ +
((rloc[d] - loc[d]*elems[d]) % procs[d] + procs[d]) % procs[d];
}
plan->cycle_senddispls[r] = displ*ext*plan->veclen;
}
// Compute offsets to use when receiving
for(int s = 0; s < nprocs; ++s) {
int displ = 0;
for(int d = 0; d < dims; ++d) {
const int *const sloc = map + s*dims;
displ *= elems[d];
for(int ss = 0; ss < sloc[d]; ++ss) {
displ += elems[d]/procs[d];
if(((loc[d] - ss*elems[d])%procs[d] + procs[d])%procs[d] <
elems[d] % procs[d]) {
++displ;
}
}
}
plan->cycle_recvdispls[s] = displ*ext*plan->veclen;
}
int rc = -1; /* The last sendtype index with a type that needs to be freed */
for(int r = 0; r < nprocs; ++r) {
const int *const rloc = map + r*dims;
/* initialize type */
err = MPI_Type_vector(1, plan->veclen, plan->veclen, MPI_DOUBLE,
plan->cycle_sendtypes + r);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
rc = r;
int stride = ext*plan->veclen;
/* Work downward along the dimensions, thus handling the fast indices first */
for(int d = dims - 1; d >= 0; --d) {
// Compute how many indices in this direction
int count = elems[d]/procs[d];
if(((rloc[d] - loc[d]*elems[d])%procs[d] + procs[d])%procs[d] <
elems[d] % procs[d]) {
++count;
}
MPI_Datatype t,s;
err = MPI_Type_hvector(count, 1, procs[d]*stride, plan->cycle_sendtypes[r],
&t);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
// Swap the new and old data types. This way, cycle_sendtypes[r] contains
// a valid type which needs to be deleted if a calamity occurs. Thus, we
// can skip out of the loop and free cycle_sendtypes[r] if we hit a
// problem.
s = t; t = plan->cycle_sendtypes[r]; plan->cycle_sendtypes[r] = s;
// Delete the old data type
err = MPI_Type_free(&t);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
stride *= elems[d];
}
err = MPI_Type_commit(plan->cycle_sendtypes + r);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
}
int sc = -1;
for(int s = 0; s < nprocs; ++s) {
const int *const sloc = map + s*dims;
/* initialize receive type */
err = MPI_Type_vector(1, plan->veclen, plan->veclen, MPI_DOUBLE,
plan->cycle_recvtypes + s);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
sc = s;
int stride = ext*plan->veclen;
/* Work down through indices, from fastest to slowest */
for(int d = dims - 1; d >= 0; --d) {
int count = elems[d]/procs[d];
if(((loc[d] - sloc[d]*elems[d])%procs[d] + procs[d])%procs[d] <
elems[d] % procs[d]) {
++count;
}
MPI_Datatype t, q;
err = MPI_Type_hvector(count, 1, stride, plan->cycle_recvtypes[s], &t);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
/* As before, if t is a valid type, we want to swap it into recvtypes
* before freeing the old type.
*/
q = t; t = plan->cycle_recvtypes[s]; plan->cycle_recvtypes[s] = q;
/* Delete the old data type */
err = MPI_Type_free(&t);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
stride *= elems[d];
}
err = MPI_Type_commit(plan->cycle_recvtypes + s);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
}
return err;
fail_free_recvtypes:
while(sc >= 0) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_recvtypes + sc);
sc--;
}
fail_free_sendtypes:
while(rc >= 0) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->cycle_sendtypes + rc);
rc--;
}
free(plan->cycle_recvdispls);
fail_free_cycle_recvtypes:
free(plan->cycle_recvtypes);
fail_free_cycle_senddispls:
free(plan->cycle_senddispls);
fail_free_cycle_sendtypes:
free(plan->cycle_sendtypes);
fail_immed:
return err;
}
int swizzle_data_parameter_fill(const fft_par_plan plan)
{
/* We only read these members of plan */
MPI_Comm comm = plan->comm;
const int dims = plan->ndims;
const int *const map = plan->map;
const int *const elems = plan->nelems;
const int *const procs = plan->nprocs;
const int *const ojobs = plan->outerjobs;
MPI_Aint lb, ext;
int err = MPI_Type_get_extent(MPI_C_DOUBLE_COMPLEX, &lb, &ext);
if(err != MPI_SUCCESS) goto fail_immed;
int nprocs;
err = MPI_Comm_size(comm, &nprocs);
if(err != MPI_SUCCESS) goto fail_immed;
int procid;
err = MPI_Comm_rank(comm, &procid);
if(err != MPI_SUCCESS) goto fail_immed;
/* Allocate buffers for the types and offsets to use when sending */
plan->swizzle_sendtypes = malloc(nprocs*sizeof(MPI_Datatype));
if(plan->swizzle_sendtypes == NULL) goto fail_immed;
plan->swizzle_senddispls = malloc(nprocs*sizeof(int));
if(plan->swizzle_senddispls == NULL) goto fail_free_swizzle_sendtypes;
/* Allocate buffers for the types and offsets to use when receiving */
plan->swizzle_recvtypes = malloc(nprocs*sizeof(MPI_Datatype));
if(plan->swizzle_recvtypes == NULL) goto fail_free_swizzle_senddispls;
plan->swizzle_recvdispls = malloc(nprocs*sizeof(int));
if(plan->swizzle_recvdispls == NULL) goto fail_free_swizzle_recvtypes;
// Compute offsets to use when sending
for(int r = 0; r < nprocs; ++r) {
const int *const rloc = map + r*dims;
int displ = 0;
// Apply the same offset formula in conjunction with the row-major access
// formula to find the offset.
for(int d = 0; d < dims; ++d) {
displ = elems[d]*displ + rloc[d];
}
plan->swizzle_senddispls[r] = displ*ext*plan->veclen;
}
// Compute offsets to use when receiving
for(int s = 0; s < nprocs; ++s) {
const int *const sloc = map + s*dims;
int displ = 0;
// Apply the same offset formula in conjunction with the row-major access
// formula to find the offset.
for(int d = 0; d < dims; ++d) {
displ = procs[d]*displ + sloc[d];
}
plan->swizzle_recvdispls[s] = displ*ext*plan->veclen;
}
int rc = -1; /* The last sendtype index with a type that needs to be freed */
for(int r = 0; r < nprocs; ++r) {
/* initialize type */
err = MPI_Type_vector(1, plan->veclen, plan->veclen, MPI_C_DOUBLE_COMPLEX,
plan->swizzle_sendtypes + r);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
rc = r;
int stride = ext*plan->veclen;
/* Work downward along the dimensions, thus handling the fast indices first */
for(int d = dims - 1; d >= 0; --d) {
// Compute how many indices in this direction
int count = ojobs[r*dims + d];
MPI_Datatype t,s;
err = MPI_Type_hvector(count, 1, procs[d]*stride, plan->swizzle_sendtypes[r],
&t);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
// Swap the new and old data types. This way, swizzle_sendtypes[r] contains
// a valid type which needs to be deleted if a calamity occurs. Thus, we
// can skip out of the loop and free swizzle_sendtypes[r] if we hit a
// problem.
s = t; t = plan->swizzle_sendtypes[r]; plan->swizzle_sendtypes[r] = s;
// Delete the old data type
err = MPI_Type_free(&t);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
stride *= elems[d];
}
err = MPI_Type_commit(plan->swizzle_sendtypes + r);
if(err != MPI_SUCCESS) goto fail_free_sendtypes;
}
int sc = -1; /* The last recvtype index with a type that needs to be freed */
for(int s = 0; s < nprocs; ++s) {
/* Compute the length of a work unit */
int wulen = 1;
for(int d = 0; d < dims; ++d) { wulen *= procs[d]; }
/* Compute the number of work units */
int nwu = 1;
for(int d = 0; d < dims; ++d) { nwu *= ojobs[procid*dims + d]; }
/* Initialize type */
err = MPI_Type_vector(nwu, plan->veclen, wulen*plan->veclen,
MPI_C_DOUBLE_COMPLEX, plan->swizzle_recvtypes + s);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
sc = s;
err = MPI_Type_commit(plan->swizzle_recvtypes + s);
if(err != MPI_SUCCESS) goto fail_free_recvtypes;
}
return err;
fail_free_recvtypes:
while(sc >= 0) {
if(err == MPI_SUCCESS) err = MPI_Type_free(plan->swizzle_recvtypes + sc);
--sc;