Skip to content

Commit f2fcdac

Browse files
authored
Create vector_addition.cu
1 parent e436a5e commit f2fcdac

File tree

1 file changed

+85
-0
lines changed

1 file changed

+85
-0
lines changed

Diff for: examples/cuda/vector_addition.cu

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// from https://github.com/olcf-tutorials/vector_addition_cuda/blob/master/vector_addition.cu
2+
3+
#include <stdio.h>
4+
5+
// Size of array
6+
#define N 1048576
7+
8+
// Kernel
9+
// Element-wise vector sum: the thread with global index i stores a[i] + b[i]
// into c[i]. The index is built from the x components of the block and thread
// coordinates only. Threads whose index is N or greater return without
// touching memory, so the launch may contain more threads than elements.
__global__ void add_vectors(double *a, double *b, double *c)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads in the final block have nothing to do.
    if (idx >= N)
    {
        return;
    }

    c[idx] = a[idx] + b[idx];
}
14+
15+
// Main program
16+
// Abort with a diagnostic when a CUDA runtime call reports anything other
// than cudaSuccess. `what` names the operation for the error message.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Main program: fills two host arrays of N doubles with 1.0 and 2.0, adds
// them on the device with add_vectors, copies the result back and checks that
// every element equals 3.0 within a small tolerance.
// Returns 0 on success; exits with status 1 on any allocation, CUDA, or
// verification failure.
int main()
{
    // Number of bytes to allocate for N doubles
    size_t bytes = N * sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL)
    {
        fprintf(stderr, "Error: host allocation of %llu bytes failed\n",
                (unsigned long long)bytes);
        free(A);
        free(B);
        free(C);
        exit(1);
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    check_cuda(cudaMalloc(&d_A, bytes), "cudaMalloc d_A");
    check_cuda(cudaMalloc(&d_B, bytes), "cudaMalloc d_B");
    check_cuda(cudaMalloc(&d_C, bytes), "cudaMalloc d_C");

    // Fill host arrays A and B
    for (int i = 0; i < N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    check_cuda(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "copy A to device");
    check_cuda(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "copy B to device");

    // Set execution configuration parameters
    //      thr_per_blk: number of CUDA threads per grid block
    //      blk_in_grid: number of blocks in grid
    // Integer ceil-division avoids the rounding error a float division
    // would introduce for large N.
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;

    // Launch kernel; a launch does not return a status, so query it explicitly
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    check_cuda(cudaGetLastError(), "add_vectors launch");

    // Copy data from device array d_C to host array C. This blocking copy
    // also reports any error raised while the kernel was executing.
    check_cuda(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "copy C to host");

    // Verify results
    double tolerance = 1.0e-14;
    for (int i = 0; i < N; i++)
    {
        if (fabs(C[i] - 3.0) > tolerance)
        {
            // C[i] is a double, so it must be printed with %f (not %d)
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    check_cuda(cudaFree(d_A), "cudaFree d_A");
    check_cuda(cudaFree(d_B), "cudaFree d_B");
    check_cuda(cudaFree(d_C), "cudaFree d_C");

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N                 = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid    = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}

0 commit comments

Comments
 (0)