diff --git a/Modern C++/Chapter01/CudaContainer.h b/Modern C++/Chapter01/CudaContainer.h
new file mode 100644
index 0000000..781ccae
--- /dev/null
+++ b/Modern C++/Chapter01/CudaContainer.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// Owning handle for a device buffer of `size` elements of type T.
+// The constructor and destructor are declared here and defined in
+// vector_addition.cu, so only that translation unit can instantiate them.
+template <typename T>
+class CudaContainer
+{
+public:
+    int size;          // Element count (not bytes) passed to the constructor.
+    T* data = nullptr; // Device pointer; null until cudaMalloc fills it in.
+
+    // Allocates room for `size` elements of T on the device.
+    CudaContainer(int size);
+    // Releases the device buffer with cudaFree.
+    ~CudaContainer();
+
+    // This object owns `data`: a copy would leave two owners calling cudaFree
+    // on the same pointer, so copying is disabled.
+    CudaContainer(const CudaContainer&) = delete;
+    CudaContainer& operator=(const CudaContainer&) = delete;
+};
diff --git a/Modern C++/Chapter01/vector_addition.cu b/Modern C++/Chapter01/vector_addition.cu
new file mode 100644
index 0000000..559b83d
--- /dev/null
+++ b/Modern C++/Chapter01/vector_addition.cu
@@ -0,0 +1,104 @@
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <cuda_runtime.h> // Needed, otherwise __global__ is not recognised.
+#include "device_launch_parameters.h" // Built-in variable identifiers (blockIdx, threadIdx, ...).
+#include "CudaContainer.h"
+
+// Prints file, line and the CUDA error string, then aborts, when a runtime
+// call fails, so the failure is reported where it happens.
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t cuda_check_err_ = (call); \
+        if (cuda_check_err_ != cudaSuccess) { \
+            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                         cudaGetErrorString(cuda_check_err_)); \
+            std::abort(); \
+        } \
+    } while (0)
+
+const int SIZE = 256;
+const int THREADS_PER_BLOCK = 4;
+// Rounded up so every element is covered even when SIZE is not a multiple of
+// THREADS_PER_BLOCK; device_add ignores the surplus threads.
+const int NO_OF_BLOCKS = (SIZE + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+__global__ void device_add(int* a, int* b, int* c);
+void fill_array(const std::shared_ptr<std::array<int, SIZE>>& out);
+
+int main()
+{
+    std::cout << "Hello" << std::endl;
+
+    // Host memory allocation.
+    std::shared_ptr<std::array<int, SIZE>> a = std::make_shared<std::array<int, SIZE>>();
+    std::shared_ptr<std::array<int, SIZE>> b = std::make_shared<std::array<int, SIZE>>();
+    std::shared_ptr<std::array<int, SIZE>> c = std::make_shared<std::array<int, SIZE>>();
+    // Device memory allocation.
+    std::shared_ptr<CudaContainer<int>> d_a = std::make_shared<CudaContainer<int>>(SIZE);
+    std::shared_ptr<CudaContainer<int>> d_b = std::make_shared<CudaContainer<int>>(SIZE);
+    std::shared_ptr<CudaContainer<int>> d_c = std::make_shared<CudaContainer<int>>(SIZE);
+
+    fill_array(a);
+    fill_array(b);
+
+    const size_t bytes = SIZE * sizeof(int);
+    CUDA_CHECK(cudaMemcpy(d_a->data, a->data(), bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_b->data, b->data(), bytes, cudaMemcpyHostToDevice));
+
+    device_add<<<NO_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a->data, d_b->data, d_c->data);
+    // The launch itself returns no status: cudaGetLastError reports a rejected
+    // launch, and the synchronize reports faults raised while the kernel ran.
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy(c->data(), d_c->data, bytes, cudaMemcpyDeviceToHost));
+
+    // No manual free or cudaFree is needed: each shared_ptr destroys its object
+    // when main returns, and ~CudaContainer calls cudaFree.
+
+    for (int i = 0; i < SIZE; i++)
+        std::cout << (*c)[i] << std::endl;
+
+    return 0;
+}
+
+// Allocates room for size elements of T on the device; aborts if cudaMalloc fails.
+template <typename T>
+CudaContainer<T>::CudaContainer(int size)
+{
+    this->size = size;
+    CUDA_CHECK(cudaMalloc(&data, size * sizeof(T)));
+}
+
+// Releases the device buffer.
+template <typename T>
+CudaContainer<T>::~CudaContainer()
+{
+    // Report a failed free but keep going: aborting inside a destructor would
+    // skip the cleanup of the objects still waiting to be destroyed.
+    cudaError_t err = cudaFree(data);
+    if (err != cudaSuccess)
+        std::fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
+}
+
+// Element-wise c[i] = a[i] + b[i] for the first SIZE ints of each device buffer.
+// Expects a 1D launch; every thread handles the element at its global index,
+// and threads whose index is SIZE or more do nothing.
+__global__ void device_add(int* a, int* b, int* c)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= SIZE)
+        return;
+    printf("Setting index %d to %d + %d\n", index, a[index], b[index]);
+    c[index] = a[index] + b[index];
+}
+
+// Fills the host array with 0, 1, ..., SIZE - 1.
+void fill_array(const std::shared_ptr<std::array<int, SIZE>>& out)
+{
+    for (int i = 0; i < SIZE; i++)
+        (*out)[i] = i;
+}