-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvector_add.cu
More file actions
75 lines (61 loc) · 1.95 KB
/
Copy pathvector_add.cu
File metadata and controls
75 lines (61 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/**
* vector_add.cu - Vector Addition Example
*
* Compile: nvcc -O3 -arch=sm_90 -o vector_add vector_add.cu
* Run: ./vector_add
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/**
 * CUDA_CHECK(call) - evaluate a CUDA runtime call and abort on failure.
 *
 * `call` must be an expression yielding a cudaError_t. When the result is
 * not cudaSuccess, the macro prints the source location and the runtime's
 * error string to stderr and terminates the process with exit status 1.
 *
 * The argument is parenthesized and the temporary carries a distinctive
 * name so that a caller-side variable called `err` (e.g. CUDA_CHECK(err))
 * is evaluated correctly instead of being shadowed by the macro's local.
 * The do { } while (0) wrapper makes the macro behave as one statement.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        const cudaError_t cuda_check_err_ = (call);                        \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                   \
                    __FILE__, __LINE__,                                    \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(1);                                                       \
        }                                                                  \
    } while (0)
/**
 * vectorAdd - element-wise sum of two float arrays: c[i] = a[i] + b[i].
 *
 * Parameters:
 *   a, b  input arrays, read at indices [0, n)
 *   c     output array, written at indices [0, n)
 *   n     element count; n <= 0 makes the kernel a no-op
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Each thread starts at its flat global index and advances by the total
 * number of launched threads, so any grid with at least one block covers
 * all n elements. With a ceil(n / blockDim.x) grid each thread handles at
 * most one element. No shared memory is used.
 */
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    if (n <= 0) {
        return;
    }

    // Index math is done in size_t: the 32-bit product blockIdx.x * blockDim.x
    // can exceed INT_MAX on large launches, and narrowing it to int would
    // produce a negative index that slips past an `i < n` guard.
    const size_t count = (size_t)n;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}
/**
 * Entry point: adds two 1M-element float vectors on the GPU and verifies
 * the result against a host-side reference.
 *
 * Returns EXIT_SUCCESS when every element matches and EXIT_FAILURE when any
 * element differs or a host allocation fails. CUDA API failures terminate
 * the process through CUDA_CHECK.
 */
int main() {
    const int N = 1 << 20;  // 1M elements
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t bytes = (size_t)N * sizeof(float);

    // Host allocation
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_a); free(h_b); free(h_c);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }

    // Initialize. i, 2*i and their sum 3*i all stay below 2^24, so each is
    // exactly representable in float and the exact comparison below is valid.
    for (int i = 0; i < N; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    // Device allocation
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // Copy to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Launch kernel: one thread per element, grid size rounded up.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;
    vectorAdd<<<blocks, threads>>>(d_a, d_b, d_c, N);
    CUDA_CHECK(cudaGetLastError());  // reports launch-configuration errors

    // Copy result back. This blocking copy is also where a fault raised
    // while the kernel was executing would be reported.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Verify
    int errors = 0;
    for (int i = 0; i < N; i++) {
        if (h_c[i] != h_a[i] + h_b[i]) errors++;
    }
    printf("Vector Addition: %d elements, %d errors\n", N, errors);
    printf("%s\n", errors == 0 ? "[PASS]" : "[FAIL]");

    // Cleanup
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a); free(h_b); free(h_c);

    // Do not return the raw count: only the low 8 bits of an exit status
    // reach the parent process, so a count that is a multiple of 256 would
    // be reported as success.
    return errors == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}