|
| 1 | +// from https://github.com/olcf-tutorials/vector_addition_cuda/blob/master/vector_addition.cu |
| 2 | + |
| 3 | +#include <stdio.h> |
| 4 | + |
| 5 | +// Size of array |
| 6 | +#define N 1048576 |
| 7 | + |
// Kernel: element-wise vector addition, c[i] = a[i] + b[i].
//
// Each thread handles at most one element, selected by its flat global index
// computed from the x-dimension of the block and thread coordinates (only the
// .x components are used, so a 1D launch is expected). Threads whose index is
// N or greater do nothing, so the grid may contain more threads than elements.
//
//   a, b : input arrays, read at indices [0, N)
//   c    : output array, written at indices [0, N)
__global__ void add_vectors(double *a, double *b, double *c)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block fall outside the arrays; bail out early.
    if (idx >= N)
    {
        return;
    }

    c[idx] = a[idx] + b[idx];
}
| 14 | + |
// Report a failed CUDA runtime call and terminate the program.
//   status : return code of the CUDA call being checked
//   what   : short description of the call, included in the error message
// Exiting the process on failure releases any host/device memory still held.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Main program
//
// Fills two host arrays of N doubles with 1.0 and 2.0, adds them on the device
// with add_vectors, copies the result back and verifies every element is 3.0.
// Returns 0 on success and 1 if a host allocation or the verification fails;
// a failing CUDA call terminates the program with exit status 1.
int main()
{
    // Number of bytes to allocate for N doubles
    const size_t bytes = N * sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL)
    {
        fprintf(stderr, "Error: failed to allocate %zu bytes per host array\n", bytes);
        // free(NULL) is a no-op, so it is safe to release all three unconditionally.
        free(A);
        free(B);
        free(C);
        return 1;
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A = NULL, *d_B = NULL, *d_C = NULL;
    check_cuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    check_cuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    // Fill host arrays A and B
    for (int i = 0; i < N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    check_cuda(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A -> d_A)");
    check_cuda(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");

    // Set execution configuration parameters
    //      thr_per_blk: number of CUDA threads per grid block
    //      blk_in_grid: number of blocks in grid
    // Integer ceiling division is exact; going through float would lose
    // precision once N no longer fits in a float mantissa.
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;

    // Launch kernel
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    check_cuda(cudaGetLastError(), "add_vectors launch");

    // Copy data from device array d_C to host array C
    // (errors raised while the kernel was running are reported by this call)
    check_cuda(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> C)");

    // Verify results
    const double tolerance = 1.0e-14;
    int mismatch = 0;
    for (int i = 0; i < N; i++)
    {
        // Written as !(x <= tol) so that a NaN result is reported as an error
        // instead of slipping through a (x > tol) comparison.
        if (!(fabs(C[i] - 3.0) <= tolerance))
        {
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            mismatch = 1;
            break;
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    check_cuda(cudaFree(d_A), "cudaFree(d_A)");
    check_cuda(cudaFree(d_B), "cudaFree(d_B)");
    check_cuda(cudaFree(d_C), "cudaFree(d_C)");

    if (mismatch)
    {
        return 1;
    }

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}
0 commit comments