-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_addition.cu
59 lines (47 loc) · 1.53 KB
/
vector_addition.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include <cstdlib>
#include <iostream>
#include <vector>
// Kernel for element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < size.
//
// Parameters:
//   A, B  - device pointers to the input vectors (read only), at least `size` floats each.
//   C     - device pointer to the output vector, at least `size` floats.
//   size  - number of elements to process; values <= 0 make the kernel a no-op.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so any grid
// size (including a single block) processes the whole vector. No shared
// memory is used.
__global__ void vectorAdd(const float* A, const float* B, float* C, int size) {
    // Clamp a non-positive size to zero so the unsigned comparison below is safe.
    const size_t count = size > 0 ? static_cast<size_t>(size) : 0;
    // Index arithmetic is widened to size_t before multiplying so that large
    // grids cannot wrap around a 32-bit value.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        C[i] = A[i] + B[i];
    }
}
// Reports a failed CUDA runtime call and terminates the program.
// `status` is the value returned by the call; `what` names the call for the
// error message. Does nothing when `status` is cudaSuccess.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Host driver: adds two N-element vectors on the GPU and prints the result.
//
// Fills the host inputs (A[i] = i, B[i] = 2 * i), copies them to the device,
// launches vectorAdd, copies the sum back, releases the device buffers and
// prints every element followed by a space, then a newline.
// Every CUDA runtime call and the kernel launch are checked; the first
// failure is reported on stderr and the process exits with EXIT_FAILURE.
// Returns 0 on success.
int main() {
    const int N = 1024;  // Size of vectors
    const int threadsPerBlock = 256;
    // Ceiling division so that a partial final block is still launched.
    const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = N * sizeof(float);

    // Host vectors; std::vector releases its storage automatically.
    std::vector<float> h_A(N), h_B(N), h_C(N);

    // Initialize vectors with some values (you can modify this)
    for (int i = 0; i < N; ++i) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(2 * i);
    }

    // Allocate memory for vectors on the device (GPU)
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    checkCuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_A -> d_A)");
    checkCuda(cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_B -> d_B)");

    // Launch the kernel. A launch does not return a status itself, so
    // configuration errors are picked up with cudaGetLastError().
    vectorAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Copy result back to host. This blocking copy waits for the kernel to
    // finish, so errors raised while the kernel ran are reported here.
    checkCuda(cudaMemcpy(h_C.data(), d_C, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_C -> h_C)");

    // Clean up
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Print the result (you can modify this)
    for (int i = 0; i < N; ++i) {
        std::cout << h_C[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}