diff --git a/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx b/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx new file mode 100644 index 0000000..be9a94e Binary files /dev/null and b/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx differ diff --git a/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx b/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx new file mode 100644 index 0000000..0c9f8f6 Binary files /dev/null and b/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx differ diff --git a/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx b/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx new file mode 100644 index 0000000..dff3dce Binary files /dev/null and b/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx differ diff --git a/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx b/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx new file mode 100644 index 0000000..bb70170 Binary files /dev/null and b/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx differ diff --git a/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx b/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx new file mode 100644 index 0000000..a6661eb Binary files /dev/null and b/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx differ diff --git a/.cache/clangd/index/main.hpp.120473D743DD0E61.idx b/.cache/clangd/index/main.hpp.120473D743DD0E61.idx new file mode 100644 index 0000000..6336ee5 Binary files /dev/null and b/.cache/clangd/index/main.hpp.120473D743DD0E61.idx differ diff --git a/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx b/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx new file mode 100644 index 0000000..edb3a4f Binary files /dev/null and b/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx differ diff --git a/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx b/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx new file mode 100644 index 0000000..6e0125f Binary files /dev/null and b/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx differ diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..e295476 --- /dev/null +++ b/.clang-format @@ -0,0 +1,213 @@ +# 语言: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto +Language: Cpp +# BasedOnStyle: LLVM + +# 访问说明符(public、private等)的偏移 +AccessModifierOffset: -4 + +# 开括号(开圆括号、开尖括号、开方括号)后的对齐: Align, DontAlign, AlwaysBreak(总是在开括号后换行) +AlignAfterOpenBracket: Align + +# 连续赋值时,对齐所有等号 +AlignConsecutiveAssignments: false + +# 连续声明时,对齐所有声明的变量名 +AlignConsecutiveDeclarations: false + +# 右对齐逃脱换行(使用反斜杠换行)的反斜杠 +AlignEscapedNewlines: Right + +# 水平对齐二元和三元表达式的操作数 +AlignOperands: true + +# 对齐连续的尾随的注释 +AlignTrailingComments: true + +# 不允许函数声明的所有参数在放在下一行 +AllowAllParametersOfDeclarationOnNextLine: false + +# 不允许短的块放在同一行 +AllowShortBlocksOnASingleLine: true + +# 允许短的case标签放在同一行 +AllowShortCaseLabelsOnASingleLine: true + +# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All +AllowShortFunctionsOnASingleLine: None + +# 允许短的if语句保持在同一行 +AllowShortIfStatementsOnASingleLine: true + +# 允许短的循环保持在同一行 +AllowShortLoopsOnASingleLine: true + +# 总是在返回类型后换行: None, All, TopLevel(顶级函数,不包括在类中的函数), +# AllDefinitions(所有的定义,不包括声明), TopLevelDefinitions(所有的顶级函数的定义) +AlwaysBreakAfterReturnType: None + +# 总是在多行string字面量前换行 +AlwaysBreakBeforeMultilineStrings: false + +# 总是在template声明后换行 +AlwaysBreakTemplateDeclarations: true + +# false表示函数实参要么都在同一行,要么都各自一行 +BinPackArguments: true + +# false表示所有形参要么都在同一行,要么都各自一行 +BinPackParameters: true + +# 大括号换行,只有当BreakBeforeBraces设置为Custom时才有效 +BraceWrapping: + # class定义后面 + AfterClass: false + # 控制语句后面 + AfterControlStatement: false + # enum定义后面 + AfterEnum: false + # 函数定义后面 + AfterFunction: false + # 命名空间定义后面 + AfterNamespace: false + # struct定义后面 + AfterStruct: false + # union定义后面 + AfterUnion: false + # extern之后 + AfterExternBlock: false + # catch之前 + BeforeCatch: false + # else之前 + BeforeElse: false + # 缩进大括号 + IndentBraces: false + # 分离空函数 + SplitEmptyFunction: false + # 分离空语句 + SplitEmptyRecord: false + # 分离空命名空间 + SplitEmptyNamespace: false + +# 在二元运算符前换行: None(在操作符后换行), NonAssignment(在非赋值的操作符前换行), All(在操作符前换行) +BreakBeforeBinaryOperators: NonAssignment + +# 在大括号前换行: Attach(始终将大括号附加到周围的上下文), Linux(除函数、命名空间和类定义,与Attach类似), +# Mozilla(除枚举、函数、记录定义,与Attach类似), Stroustrup(除函数定义、catch、else,与Attach类似), +# Allman(总是在大括号前换行), GNU(总是在大括号前换行,并对于控制语句的大括号增加额外的缩进), WebKit(在函数前换行), Custom +# 注:这里认为语句块也属于函数 +BreakBeforeBraces: Custom + +# 在三元运算符前换行 +BreakBeforeTernaryOperators: false + +# 在构造函数的初始化列表的冒号后换行 +BreakConstructorInitializers: AfterColon + +#BreakInheritanceList: AfterColon + +BreakStringLiterals: false + +# 每行字符的限制,0表示没有限制 +ColumnLimit: 0 + +CompactNamespaces: true + +# 构造函数的初始化列表要么都在同一行,要么都各自一行 +ConstructorInitializerAllOnOneLineOrOnePerLine: false + +# 构造函数的初始化列表的缩进宽度 +ConstructorInitializerIndentWidth: 4 + +# 延续的行的缩进宽度 +ContinuationIndentWidth: 4 + +# 去除C++11的列表初始化的大括号{后和}前的空格 +Cpp11BracedListStyle: true + +# 继承最常用的指针和引用的对齐方式 +DerivePointerAlignment: false + +# 固定命名空间注释 +FixNamespaceComments: true + +# 缩进case标签 +IndentCaseLabels: false + +IndentPPDirectives: None + +# 缩进宽度 +IndentWidth: 4 + +# 函数返回类型换行时,缩进函数声明或函数定义的函数名 +IndentWrappedFunctionNames: false + +# 保留在块开始处的空行 +KeepEmptyLinesAtTheStartOfBlocks: false + +# 连续空行的最大数量 +MaxEmptyLinesToKeep: 1 + +# 命名空间的缩进: None, Inner(缩进嵌套的命名空间中的内容), All +NamespaceIndentation: None + +# 指针和引用的对齐: Left, Right, Middle +PointerAlignment: Right + +# 允许重新排版注释 +ReflowComments: true + +# 允许排序#include +SortIncludes: false + +# 允许排序 using 声明 +SortUsingDeclarations: false + +# 在C风格类型转换后添加空格 +SpaceAfterCStyleCast: false + +# 在Template 关键字后面添加空格 +SpaceAfterTemplateKeyword: true + +# 在赋值运算符之前添加空格 +SpaceBeforeAssignmentOperators: true + +# SpaceBeforeCpp11BracedList: true + +# SpaceBeforeCtorInitializerColon: true + +# SpaceBeforeInheritanceColon: true + +# 开圆括号之前添加一个空格: Never, ControlStatements, Always +SpaceBeforeParens: ControlStatements + +# SpaceBeforeRangeBasedForLoopColon: true + +# 在空的圆括号中添加空格 +SpaceInEmptyParentheses: false + +# 在尾随的评论前添加的空格数(只适用于//) +SpacesBeforeTrailingComments: 1 + +# 在尖括号的<后和>前添加空格 +SpacesInAngles: false + +# 在C风格类型转换的括号中添加空格 +SpacesInCStyleCastParentheses: false + +# 在容器(ObjC和JavaScript的数组和字典等)字面量中添加空格 +SpacesInContainerLiterals: true + +# 在圆括号的(后和)前添加空格 +SpacesInParentheses: false + +# 在方括号的[后和]前添加空格,lamda表达式和未指明大小的数组的声明不受影响 +SpacesInSquareBrackets: false + +# 标准: Cpp03, Cpp11, Auto +Standard: Cpp11 + +# tab宽度 +TabWidth: 4 + +# 使用tab字符: Never, ForIndentation, ForContinuationAndIndentation, Always +UseTab: Never \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..8645ef1 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug", + "program": "${command:cmake.launchTargetPath}", + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/NoteBook.pdf b/NoteBook.pdf new file mode 100644 index 0000000..402b97c Binary files /dev/null and b/NoteBook.pdf differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..c33ed2f 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -5,36 +5,36 @@ #include #include "utilityCore.hpp" #include "kernel.h" +#include // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax -#define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) +#define imax(a, b) (((a) > (b)) ? (a) : (b)) #endif #ifndef imin -#define imin( a, b ) ( ((a) < (b)) ? (a) : (b) ) +#define imin(a, b) (((a) < (b)) ? (a) : (b)) #endif #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) /** -* Check for CUDA errors; print and exit if there was a problem. -*/ + * Check for CUDA errors; print and exit if there was a problem. + */ void checkCUDAError(const char *msg, int line = -1) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - if (line >= 0) { - fprintf(stderr, "Line %d: ", line); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + if (line >= 0) { + fprintf(stderr, "Line %d: ", line); + } + fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } - fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } } - /***************** -* Configuration * -*****************/ + * Configuration * + *****************/ /*! Block size used for CUDA kernel launch. */ #define blockSize 128 @@ -55,8 +55,8 @@ void checkCUDAError(const char *msg, int line = -1) { #define scene_scale 100.0f /*********************************************** -* Kernel state (pointers are device pointers) * -***********************************************/ + * Kernel state (pointers are device pointers) * + ***********************************************/ int numObjects; dim3 threadsPerBlock(blockSize); @@ -75,7 +75,7 @@ glm::vec3 *dev_vel2; // For efficient sorting and the uniform grid. These should always be parallel. int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? -int *dev_particleGridIndices; // What grid cell is this particle in? +int *dev_particleGridIndices; // What grid cell is this particle in? // needed for use with thrust thrust::device_ptr dev_thrust_particleArrayIndices; thrust::device_ptr dev_thrust_particleGridIndices; @@ -85,6 +85,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_sortedPos; +glm::vec3 *dev_sortedVel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -95,181 +97,219 @@ float gridInverseCellWidth; glm::vec3 gridMinimum; /****************** -* initSimulation * -******************/ + * initSimulation * + ******************/ __host__ __device__ unsigned int hash(unsigned int a) { - a = (a + 0x7ed55d16) + (a << 12); - a = (a ^ 0xc761c23c) ^ (a >> 19); - a = (a + 0x165667b1) + (a << 5); - a = (a + 0xd3a2646c) ^ (a << 9); - a = (a + 0xfd7046c5) + (a << 3); - a = (a ^ 0xb55a4f09) ^ (a >> 16); - return a; + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; } /** -* LOOK-1.2 - this is a typical helper function for a CUDA kernel. -* Function for generating a random vec3. -*/ + * LOOK-1.2 - this is a typical helper function for a CUDA kernel. + * Function for generating a random vec3. + */ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { - thrust::default_random_engine rng(hash((int)(index * time))); - thrust::uniform_real_distribution unitDistrib(-1, 1); + thrust::default_random_engine rng(hash((int)(index * time))); + thrust::uniform_real_distribution unitDistrib(-1, 1); - return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); + return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); } /** -* LOOK-1.2 - This is a basic CUDA kernel. -* CUDA kernel for generating boids with a specified mass randomly around the star. -*/ -__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - glm::vec3 rand = generateRandomVec3(time, index); - arr[index].x = scale * rand.x; - arr[index].y = scale * rand.y; - arr[index].z = scale * rand.z; - } + * LOOK-1.2 - This is a basic CUDA kernel. + * CUDA kernel for generating boids with a specified mass randomly around the star. + */ +__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 *arr, float scale) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + glm::vec3 rand = generateRandomVec3(time, index); + arr[index].x = scale * rand.x; + arr[index].y = scale * rand.y; + arr[index].z = scale * rand.z; + } } /** -* Initialize memory, update some globals -*/ + * Initialize memory, update some globals + */ void Boids::initSimulation(int N) { - numObjects = N; - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - // LOOK-1.2 - This is basic CUDA memory management and error checking. - // Don't forget to cudaFree in Boids::endSimulation. - cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); - - cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); - - cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); - - // LOOK-1.2 - This is a typical CUDA kernel invocation. - kernGenerateRandomPosArray<<>>(1, numObjects, - dev_pos, scene_scale); - checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); - - // LOOK-2.1 computing grid params - gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); - int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; - gridSideCount = 2 * halfSideCount; - - gridCellCount = gridSideCount * gridSideCount * gridSideCount; - gridInverseCellWidth = 1.0f / gridCellWidth; - float halfGridWidth = gridCellWidth * halfSideCount; - gridMinimum.x -= halfGridWidth; - gridMinimum.y -= halfGridWidth; - gridMinimum.z -= halfGridWidth; - - // TODO-2.1 TODO-2.3 - Allocate additional buffers here. - cudaDeviceSynchronize(); -} + numObjects = N; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // LOOK-1.2 - This is basic CUDA memory management and error checking. + // Don't forget to cudaFree in Boids::endSimulation. + cudaMalloc((void **)&dev_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); + + cudaMalloc((void **)&dev_vel1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); + + cudaMalloc((void **)&dev_vel2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + + // LOOK-1.2 - This is a typical CUDA kernel invocation. + kernGenerateRandomPosArray<<>>(1, numObjects, + dev_pos, scene_scale); + checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + + // LOOK-2.1 computing grid params + gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; + gridSideCount = 2 * halfSideCount; + + gridCellCount = gridSideCount * gridSideCount * gridSideCount; + gridInverseCellWidth = 1.0f / gridCellWidth; + float halfGridWidth = gridCellWidth * halfSideCount; + gridMinimum.x -= halfGridWidth; + gridMinimum.y -= halfGridWidth; + gridMinimum.z -= halfGridWidth; + + // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void **)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("Malloc dev_particleArrayIndices failed!\n"); + + cudaMalloc((void **)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("Malloc dev_particleGridIndices failed!\n"); + cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("Malloc dev_gridCellStartIndices failed!\n"); + + cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("Malloc dev_gridCellEndIndices failed!\n"); + + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + cudaMalloc((void **)&dev_sortedPos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("Malloc dev_sortedPos failed!\n"); + + cudaMalloc((void **)&dev_sortedVel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("Malloc dev_sortedVel failed!\n"); + + cudaDeviceSynchronize(); +} /****************** -* copyBoidsToVBO * -******************/ + * copyBoidsToVBO * + ******************/ /** -* Copy the boid positions into the VBO so that they can be drawn by OpenGL. -*/ + * Copy the boid positions into the VBO so that they can be drawn by OpenGL. + */ __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); + int index = threadIdx.x + (blockIdx.x * blockDim.x); - float c_scale = -1.0f / s_scale; + float c_scale = -1.0f / s_scale; - if (index < N) { - vbo[4 * index + 0] = pos[index].x * c_scale; - vbo[4 * index + 1] = pos[index].y * c_scale; - vbo[4 * index + 2] = pos[index].z * c_scale; - vbo[4 * index + 3] = 1.0f; - } + if (index < N) { + vbo[4 * index + 0] = pos[index].x * c_scale; + vbo[4 * index + 1] = pos[index].y * c_scale; + vbo[4 * index + 2] = pos[index].z * c_scale; + vbo[4 * index + 3] = 1.0f; + } } __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); - - if (index < N) { - vbo[4 * index + 0] = vel[index].x + 0.3f; - vbo[4 * index + 1] = vel[index].y + 0.3f; - vbo[4 * index + 2] = vel[index].z + 0.3f; - vbo[4 * index + 3] = 1.0f; - } + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + vbo[4 * index + 0] = vel[index].x + 0.3f; + vbo[4 * index + 1] = vel[index].y + 0.3f; + vbo[4 * index + 2] = vel[index].z + 0.3f; + vbo[4 * index + 3] = 1.0f; + } } /** -* Wrapper for call to the kernCopyboidsToVBO CUDA kernel. -*/ + * Wrapper for call to the kernCopyboidsToVBO CUDA kernel. + */ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { - dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO<<>>(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO<<>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale); - checkCUDAErrorWithLine("copyBoidsToVBO failed!"); + checkCUDAErrorWithLine("copyBoidsToVBO failed!"); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } - /****************** -* stepSimulation * -******************/ + * stepSimulation * + ******************/ /** -* LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. -* __device__ code can be called from a __global__ context -* Compute the new velocity on the body with index `iSelf` due to the `N` boids -* in the `pos` and `vel` arrays. -*/ + * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. + * __device__ code can be called from a __global__ context + * Compute the new velocity on the body with index `iSelf` due to the `N` boids + * in the `pos` and `vel` arrays. + */ + __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + // Rule 2: boids try to stay a distance d away from each other + // Rule 3: boids try to match the speed of surrounding boids + glm::vec3 boidPos = pos[iSelf]; + glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f); + int n1 = 0, n3 = 0; + for (int i = 0; i < N; i++) { + if (i == iSelf) continue; + float distance = glm::distance(boidPos, pos[i]); + if (distance < rule1Distance) perceived_center += pos[i], n1++; + if (distance < rule2Distance) c -= pos[i] - boidPos; + if (distance < rule3Distance) perceived_velocity += vel[i], n3++; + } + if (n1 > 0) perceived_center /= static_cast(n1), ret += (perceived_center - boidPos) * rule1Scale; + ret += c * rule2Scale; + if (n3 > 0) perceived_velocity /= static_cast(n3), ret += perceived_velocity * rule3Scale; + return ret; } /** -* TODO-1.2 implement basic flocking -* For each of the `N` bodies, update its position based on its current velocity. -*/ + * TODO-1.2 implement basic flocking + * For each of the `N` bodies, update its position based on its current velocity. + */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + glm::vec3 *vel1, glm::vec3 *vel2) { + // Compute a new velocity based on pos and vel1 + // Clamp the speed + // Record the new velocity into vel2. Question: why NOT vel1? + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + glm::vec3 deltaVel = computeVelocityChange(N, index, pos, vel1); + glm::vec3 curVel = vel1[index]; + vel2[index] = glm::clamp(curVel + deltaVel, -maxSpeed, maxSpeed); } /** -* LOOK-1.2 Since this is pretty trivial, we implemented it for you. -* For each of the `N` bodies, update its position based on its current velocity. -*/ + * LOOK-1.2 Since this is pretty trivial, we implemented it for you. + * For each of the `N` bodies, update its position based on its current velocity. + */ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { - // Update position by velocity - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index >= N) { - return; - } - glm::vec3 thisPos = pos[index]; - thisPos += vel[index] * dt; + // Update position by velocity + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 thisPos = pos[index]; + thisPos += vel[index] * dt; - // Wrap the boids around so we don't lose them - thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; - thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; - thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; + // Wrap the boids around so we don't lose them + thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; + thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; + thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; - thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x; - thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; - thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; + thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x; + thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; + thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; - pos[index] = thisPos; + pos[index] = thisPos; } // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. @@ -279,179 +319,342 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { // for(y) // for(z)? Or some other order? __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; + return x + y * gridResolution + z * gridResolution * gridResolution; } __global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3 *pos, int *indices, int *gridIndices) { // TODO-2.1 // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int boidIndex = blockIdx.x * blockDim.x + threadIdx.x; // boid index + if (boidIndex >= N) return; + glm::vec3 offset = pos[boidIndex] - gridMin; + float xIndex = offset.x * inverseCellWidth; + float yIndex = offset.y * inverseCellWidth; + float zIndex = offset.z * inverseCellWidth; + int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution); + // key + indices[boidIndex] = boidIndex; + // value + gridIndices[boidIndex] = gridIndex; } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + intBuffer[index] = value; + } } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + int *gridCellStartIndices, int *gridCellEndIndices) { + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + + // We must do this after sort + int boidIndex = (blockIdx.x * blockDim.x) + threadIdx.x; + if (boidIndex >= N) return; + int gridIndex = particleGridIndices[boidIndex]; + if (boidIndex > 0) { + int preGridIndex = particleGridIndices[boidIndex - 1]; + if (preGridIndex != gridIndex) { + gridCellStartIndices[gridIndex] = boidIndex; + gridCellEndIndices[preGridIndex] = boidIndex; + } + if (preGridIndex == N - 1) { + gridCellEndIndices[gridIndex] = boidIndex; + } + } else { + gridCellStartIndices[gridIndex] = boidIndex; + } } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + int boidIndex = blockDim.x * blockIdx.x + threadIdx.x; + if (boidIndex >= N) return; + glm::vec3 curPos = pos[boidIndex]; + glm::vec3 offset = curPos - gridMin; + float xIndex = offset.x * inverseCellWidth; + float yIndex = offset.y * inverseCellWidth; + float zIndex = offset.z * inverseCellWidth; + int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution); + + // find the bounding box of A GRID + int X_MAX = imin(xIndex + 1, gridResolution - 1); + int X_MIN = imax(xIndex - 1, 0); + int Y_MAX = imin(yIndex + 1, gridResolution - 1); + int Y_MIN = imax(yIndex - 1, 0); + int Z_MAX = imin(zIndex + 1, gridResolution - 1); + int Z_MIN = imax(zIndex - 1, 0); + + glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f); + int n1 = 0, n3 = 0; + + for (int x = X_MIN; x <= X_MAX; x++) { + for (int y = Y_MIN; y <= Y_MAX; y++) { + for (int z = Z_MIN; z <= Z_MAX; z++) { + int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution); + int neighborGridStart = gridCellStartIndices[neighborGridIndex]; + int neighborGridEnd = gridCellEndIndices[neighborGridIndex]; + for (int i = neighborGridStart; i <= neighborGridEnd; i++) { + int this_boidIndex = particleArrayIndices[i]; + if (this_boidIndex != boidIndex) { + float distance = glm::distance(pos[this_boidIndex], curPos); + if (distance < rule1Distance) perceived_center += pos[this_boidIndex], n1++; + if (distance < rule2Distance) c -= pos[this_boidIndex] - curPos; + if (distance < rule3Distance) perceived_velocity += vel1[this_boidIndex], n3++; + } + } + } + } + } + if (n1 > 0) perceived_center /= static_cast(n1), ret += (perceived_center - curPos) * rule1Scale; + ret += c * rule2Scale; + if (n3 > 0) perceived_velocity /= static_cast(n3), ret += perceived_velocity * rule3Scale; + + vel2[boidIndex] = glm::clamp(vel1[boidIndex] + ret, -maxSpeed, maxSpeed); } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + int boidIndex = blockDim.x * blockIdx.x + threadIdx.x; + if (boidIndex >= N) return; + glm::vec3 curPos = pos[boidIndex]; + glm::vec3 offset = curPos - gridMin; + float xIndex = offset.x * inverseCellWidth; + float yIndex = offset.y * inverseCellWidth; + float zIndex = offset.z * inverseCellWidth; + int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution); + + // find the bounding box of A GRID + int X_MAX = imin(xIndex + 1, gridResolution - 1); + int X_MIN = imax(xIndex - 1, 0); + int Y_MAX = imin(yIndex + 1, gridResolution - 1); + int Y_MIN = imax(yIndex - 1, 0); + int Z_MAX = imin(zIndex + 1, gridResolution - 1); + int Z_MIN = imax(zIndex - 1, 0); + + glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f); + int n1 = 0, n3 = 0; + + for (int x = X_MIN; x <= X_MAX; x++) { + for (int y = Y_MIN; y <= Y_MAX; y++) { + for (int z = Z_MIN; z <= Z_MAX; z++) { + int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution); + int neighborGridStart = gridCellStartIndices[neighborGridIndex]; + int neighborGridEnd = gridCellEndIndices[neighborGridIndex]; + for (int i = neighborGridStart; i <= neighborGridEnd; i++) { + if (i != boidIndex) { + float distance = glm::distance(pos[i], curPos); + if (distance < rule1Distance) perceived_center += pos[i], n1++; + if (distance < rule2Distance) c -= pos[i] - curPos; + if (distance < rule3Distance) perceived_velocity += vel1[i], n3++; + } + } + } + } + } + if (n1 > 0) perceived_center /= static_cast(n1), ret += (perceived_center - curPos) * rule1Scale; + ret += c * rule2Scale; + if (n3 > 0) perceived_velocity /= static_cast(n3), ret += perceived_velocity * rule3Scale; + + vel2[boidIndex] = glm::clamp(vel1[boidIndex] + ret, -maxSpeed, maxSpeed); } /** -* Step the entire N-body simulation by `dt` seconds. -*/ + * Step the entire N-body simulation by `dt` seconds. + */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); // 分块,每块的大小是[n / blockSize],向上取整,所以(numObjects + blockSize - 1) / blockSize + kernUpdateVelocityBruteForce<<>>(numObjects, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel2); + // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + // TODO-2.1 + // Uniform Grid Neighbor search using Thrust sort. + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + // - Perform velocity updates using neighbor search + // - Update positions + // - Ping-pong buffers as needed + dim3 blocksPerGridBoids((numObjects + blockSize - 1) / blockSize); + kernComputeIndices<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_particleArrayIndices); + + dim3 blocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd<<>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernUpdateVelNeighborSearchScattered<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, + dev_pos, dev_vel1, dev_vel2); + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel2); + std::swap(dev_vel1, dev_vel2); +} + +__global__ void kernSortPosAndVel(int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *sortedPos, glm::vec3 *vel, glm::vec3 *sortedVel) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= N) return; + int boidIndex = particleArrayIndices[index]; + sortedPos[index] = pos[boidIndex]; + sortedVel[index] = vel[boidIndex]; } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + // - Label each particle with its array index as well as its grid index. + // Use 2x width grids + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + // - Perform velocity updates using neighbor search + // - Update positions + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + dim3 blocksPerGridBoids((numObjects + blockSize - 1) / blockSize); + kernComputeIndices<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_particleArrayIndices); + + dim3 blocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd<<>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernSortPosAndVel<<>>(numObjects, dev_particleArrayIndices, dev_pos, dev_sortedPos, dev_vel1, dev_sortedVel); + + kernUpdateVelNeighborSearchCoherent<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_sortedPos, dev_sortedVel, dev_vel2); + kernUpdatePos<<>>(numObjects, dt, dev_sortedPos, dev_vel2); + std::swap(dev_vel1, dev_vel2); + std::swap(dev_pos, dev_sortedPos); } void Boids::endSimulation() { - cudaFree(dev_vel1); - cudaFree(dev_vel2); - cudaFree(dev_pos); - - // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_vel1); + cudaFree(dev_vel2); + cudaFree(dev_pos); + + // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_sortedPos); + cudaFree(dev_sortedVel); } void Boids::unitTest() { - // LOOK-1.2 Feel free to write additional tests here. - - // test unstable sort - int *dev_intKeys; - int *dev_intValues; - int N = 10; - - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); - - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // How to copy data to the GPU - cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - - // Wrap device vectors in thrust iterators for use with thrust. - thrust::device_ptr dev_thrust_keys(dev_intKeys); - thrust::device_ptr dev_thrust_values(dev_intValues); - // LOOK-2.1 Example for using thrust::sort_by_key - thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); - - // How to copy data back to the CPU side from the GPU - cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); - cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); - checkCUDAErrorWithLine("memcpy back failed!"); - - std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // cleanup - cudaFree(dev_intKeys); - cudaFree(dev_intValues); - checkCUDAErrorWithLine("cudaFree failed!"); - return; + // LOOK-1.2 Feel free to write additional tests here. + + // test unstable sort + int *dev_intKeys; + int *dev_intValues; + int N = 10; + + std::unique_ptr intKeys{new int[N]}; + std::unique_ptr intValues{new int[N]}; + + intKeys[0] = 0; intValues[0] = 0; + intKeys[1] = 1; intValues[1] = 1; + intKeys[2] = 0; intValues[2] = 2; + intKeys[3] = 3; intValues[3] = 3; + intKeys[4] = 0; intValues[4] = 4; + intKeys[5] = 2; intValues[5] = 5; + intKeys[6] = 2; intValues[6] = 6; + intKeys[7] = 0; intValues[7] = 7; + intKeys[8] = 5; intValues[8] = 8; + intKeys[9] = 6; intValues[9] = 9; + + cudaMalloc((void **)&dev_intKeys, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); + + cudaMalloc((void **)&dev_intValues, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + std::cout << "before unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // How to copy data to the GPU + cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + + // Wrap device vectors in thrust iterators for use with thrust. + thrust::device_ptr dev_thrust_keys(dev_intKeys); + thrust::device_ptr dev_thrust_values(dev_intValues); + // LOOK-2.1 Example for using thrust::sort_by_key + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + + // How to copy data back to the CPU side from the GPU + cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("memcpy back failed!"); + + std::cout << "after unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // cleanup + cudaFree(dev_intKeys); + cudaFree(dev_intValues); + checkCUDAErrorWithLine("cudaFree failed!"); + return; } diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..f720880 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,8 +14,8 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000; @@ -24,16 +24,16 @@ const float DT = 0.2f; /** * C main function. */ -int main(int argc, char* argv[]) { - projectName = "565 CUDA Intro: Boids"; - - if (init(argc, argv)) { - mainLoop(); - Boids::endSimulation(); - return 0; - } else { - return 1; - } +int main(int argc, char *argv[]) { + projectName = "565 CUDA Intro: Boids"; + + if (init(argc, argv)) { + mainLoop(); + Boids::endSimulation(); + return 0; + } else { + return 1; + } } //------------------------------- @@ -47,143 +47,141 @@ GLFWwindow *window; * Initialization of CUDA and GLFW. */ bool init(int argc, char **argv) { - // Set window title to "Student Name: [SM 2.0] GPU Name" - cudaDeviceProp deviceProp; - int gpuDevice = 0; - int device_count = 0; - cudaGetDeviceCount(&device_count); - if (gpuDevice > device_count) { - std::cout - << "Error: GPU device number is greater than the number of devices!" - << " Perhaps a CUDA-capable GPU is not installed?" - << std::endl; - return false; - } - cudaGetDeviceProperties(&deviceProp, gpuDevice); - int major = deviceProp.major; - int minor = deviceProp.minor; - - std::ostringstream ss; - ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]"; - deviceName = ss.str(); - - // Window setup stuff - glfwSetErrorCallback(errorCallback); - - if (!glfwInit()) { - std::cout - << "Error: Could not initialize GLFW!" - << " Perhaps OpenGL 3.3 isn't available?" - << std::endl; - return false; - } - - glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); - glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); - glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); - glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); - - window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL); - if (!window) { - glfwTerminate(); - return false; - } - glfwMakeContextCurrent(window); - glfwSetKeyCallback(window, keyCallback); - glfwSetCursorPosCallback(window, mousePositionCallback); - glfwSetMouseButtonCallback(window, mouseButtonCallback); + // Set window title to "Student Name: [SM 2.0] GPU Name" + cudaDeviceProp deviceProp; + int gpuDevice = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + if (gpuDevice > device_count) { + std::cout + << "Error: GPU device number is greater than the number of devices!" + << " Perhaps a CUDA-capable GPU is not installed?" + << std::endl; + return false; + } + cudaGetDeviceProperties(&deviceProp, gpuDevice); + int major = deviceProp.major; + int minor = deviceProp.minor; + + std::ostringstream ss; + ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]"; + deviceName = ss.str(); + + // Window setup stuff + glfwSetErrorCallback(errorCallback); + + if (!glfwInit()) { + std::cout + << "Error: Could not initialize GLFW!" + << " Perhaps OpenGL 3.3 isn't available?" + << std::endl; + return false; + } - glewExperimental = GL_TRUE; - if (glewInit() != GLEW_OK) { - return false; - } + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); + glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); + glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); - // Initialize drawing state - initVAO(); + window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL); + if (!window) { + glfwTerminate(); + return false; + } + glfwMakeContextCurrent(window); + glfwSetKeyCallback(window, keyCallback); + glfwSetCursorPosCallback(window, mousePositionCallback); + glfwSetMouseButtonCallback(window, mouseButtonCallback); + + glewExperimental = GL_TRUE; + if (glewInit() != GLEW_OK) { + return false; + } - // Default to device ID 0. If you have more than one GPU and want to test a non-default one, - // change the device ID. - cudaGLSetGLDevice(0); + // Initialize drawing state + initVAO(); - cudaGLRegisterBufferObject(boidVBO_positions); - cudaGLRegisterBufferObject(boidVBO_velocities); + // Default to device ID 0. If you have more than one GPU and want to test a non-default one, + // change the device ID. + cudaGLSetGLDevice(0); - // Initialize N-body simulation - Boids::initSimulation(N_FOR_VIS); + cudaGLRegisterBufferObject(boidVBO_positions); + cudaGLRegisterBufferObject(boidVBO_velocities); - updateCamera(); + // Initialize N-body simulation + Boids::initSimulation(N_FOR_VIS); - initShaders(program); + updateCamera(); - glEnable(GL_DEPTH_TEST); + initShaders(program); - return true; + glEnable(GL_DEPTH_TEST); + + return true; } void initVAO() { + std::unique_ptr bodies{new GLfloat[4 * (N_FOR_VIS)]}; + std::unique_ptr bindices{new GLuint[N_FOR_VIS]}; + + glm::vec4 ul(-1.0, -1.0, 1.0, 1.0); + glm::vec4 lr(1.0, 1.0, 0.0, 0.0); + + for (int i = 0; i < N_FOR_VIS; i++) { + bodies[4 * i + 0] = 0.0f; + bodies[4 * i + 1] = 0.0f; + bodies[4 * i + 2] = 0.0f; + bodies[4 * i + 3] = 1.0f; + bindices[i] = i; + } - std::unique_ptr bodies{ new GLfloat[4 * (N_FOR_VIS)] }; - std::unique_ptr bindices{ new GLuint[N_FOR_VIS] }; - - glm::vec4 ul(-1.0, -1.0, 1.0, 1.0); - glm::vec4 lr(1.0, 1.0, 0.0, 0.0); - - for (int i = 0; i < N_FOR_VIS; i++) { - bodies[4 * i + 0] = 0.0f; - bodies[4 * i + 1] = 0.0f; - bodies[4 * i + 2] = 0.0f; - bodies[4 * i + 3] = 1.0f; - bindices[i] = i; - } - - - glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this - glGenBuffers(1, &boidVBO_positions); - glGenBuffers(1, &boidVBO_velocities); - glGenBuffers(1, &boidIBO); + glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this + glGenBuffers(1, &boidVBO_positions); + glGenBuffers(1, &boidVBO_velocities); + glGenBuffers(1, &boidIBO); - glBindVertexArray(boidVAO); + glBindVertexArray(boidVAO); - // Bind the positions array to the boidVAO by way of the boidVBO_positions - glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer - glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data + // Bind the positions array to the boidVAO by way of the boidVBO_positions + glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer + glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data - glEnableVertexAttribArray(positionLocation); - glVertexAttribPointer((GLuint)positionLocation, 4, GL_FLOAT, GL_FALSE, 0, 0); + glEnableVertexAttribArray(positionLocation); + glVertexAttribPointer((GLuint)positionLocation, 4, GL_FLOAT, GL_FALSE, 0, 0); - // Bind the velocities array to the boidVAO by way of the boidVBO_velocities - glBindBuffer(GL_ARRAY_BUFFER, boidVBO_velocities); - glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); - glEnableVertexAttribArray(velocitiesLocation); - glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0, 0); + // Bind the velocities array to the boidVAO by way of the boidVBO_velocities + glBindBuffer(GL_ARRAY_BUFFER, boidVBO_velocities); + glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); + glEnableVertexAttribArray(velocitiesLocation); + glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0, 0); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, boidIBO); - glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint), bindices.get(), GL_STATIC_DRAW); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, boidIBO); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint), bindices.get(), GL_STATIC_DRAW); - glBindVertexArray(0); + glBindVertexArray(0); } -void initShaders(GLuint * program) { - GLint location; +void initShaders(GLuint *program) { + GLint location; - program[PROG_BOID] = glslUtility::createProgram( - "shaders/boid.vert.glsl", - "shaders/boid.geom.glsl", - "shaders/boid.frag.glsl", attributeLocations, 2); + program[PROG_BOID] = glslUtility::createProgram( + "shaders/boid.vert.glsl", + "shaders/boid.geom.glsl", + "shaders/boid.frag.glsl", attributeLocations, 2); glUseProgram(program[PROG_BOID]); if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) { - glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); + glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); } if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) { - glUniform3fv(location, 1, &cameraPosition[0]); + glUniform3fv(location, 1, &cameraPosition[0]); } - } +} - //==================================== - // Main loop - //==================================== - void runCUDA() { +//==================================== +// Main loop +//==================================== +void runCUDA() { // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not // use this buffer @@ -192,109 +190,108 @@ void initShaders(GLuint * program) { float *dptrVertPositions = NULL; float *dptrVertVelocities = NULL; - cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); - cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); + cudaGLMapBufferObject((void **)&dptrVertPositions, boidVBO_positions); + cudaGLMapBufferObject((void **)&dptrVertVelocities, boidVBO_velocities); // execute the kernel - #if UNIFORM_GRID && COHERENT_GRID +#if UNIFORM_GRID && COHERENT_GRID Boids::stepSimulationCoherentGrid(DT); - #elif UNIFORM_GRID +#elif UNIFORM_GRID Boids::stepSimulationScatteredGrid(DT); - #else +#else Boids::stepSimulationNaive(DT); - #endif +#endif - #if VISUALIZE +#if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); - #endif +#endif // unmap buffer object cudaGLUnmapBufferObject(boidVBO_positions); cudaGLUnmapBufferObject(boidVBO_velocities); - } +} - void mainLoop() { +void mainLoop() { double fps = 0; double timebase = 0; int frame = 0; Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure - // your CUDA development setup is ready to go. + // your CUDA development setup is ready to go. while (!glfwWindowShouldClose(window)) { - glfwPollEvents(); + glfwPollEvents(); - frame++; - double time = glfwGetTime(); + frame++; + double time = glfwGetTime(); - if (time - timebase > 1.0) { - fps = frame / (time - timebase); - timebase = time; - frame = 0; - } + if (time - timebase > 1.0) { + fps = frame / (time - timebase); + timebase = time; + frame = 0; + } - runCUDA(); + runCUDA(); - std::ostringstream ss; - ss << "["; - ss.precision(1); - ss << std::fixed << fps; - ss << " fps] " << deviceName; - glfwSetWindowTitle(window, ss.str().c_str()); + std::ostringstream ss; + ss << "["; + ss.precision(1); + ss << std::fixed << fps; + ss << " fps] " << deviceName; + glfwSetWindowTitle(window, ss.str().c_str()); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - #if VISUALIZE - glUseProgram(program[PROG_BOID]); - glBindVertexArray(boidVAO); - glPointSize((GLfloat)pointSize); - glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); - glPointSize(1.0f); +#if VISUALIZE + glUseProgram(program[PROG_BOID]); + glBindVertexArray(boidVAO); + glPointSize((GLfloat)pointSize); + glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); + glPointSize(1.0f); - glUseProgram(0); - glBindVertexArray(0); + glUseProgram(0); + glBindVertexArray(0); - glfwSwapBuffers(window); - #endif + glfwSwapBuffers(window); +#endif } glfwDestroyWindow(window); glfwTerminate(); - } +} - void errorCallback(int error, const char *description) { +void errorCallback(int error, const char *description) { fprintf(stderr, "error %d: %s\n", error, description); - } +} - void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) { +void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) { if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) { - glfwSetWindowShouldClose(window, GL_TRUE); + glfwSetWindowShouldClose(window, GL_TRUE); } - } +} - void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) { +void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods) { leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS); rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS); - } +} - void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) { +void mousePositionCallback(GLFWwindow *window, double xpos, double ypos) { if (leftMousePressed) { - // compute new camera parameters - phi += (xpos - lastX) / width; - theta -= (ypos - lastY) / height; - theta = std::fmax(0.01f, std::fmin(theta, 3.14f)); - updateCamera(); - } - else if (rightMousePressed) { - zoom += (ypos - lastY) / height; - zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f)); - updateCamera(); + // compute new camera parameters + phi += (xpos - lastX) / width; + theta -= (ypos - lastY) / height; + theta = std::fmax(0.01f, std::fmin(theta, 3.14f)); + updateCamera(); + } else if (rightMousePressed) { + zoom += (ypos - lastY) / height; + zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f)); + updateCamera(); } - lastX = xpos; - lastY = ypos; - } + lastX = xpos; + lastY = ypos; +} - void updateCamera() { +void updateCamera() { cameraPosition.x = zoom * sin(phi) * sin(theta); cameraPosition.z = zoom * cos(theta); cameraPosition.y = zoom * cos(phi) * sin(theta); @@ -308,6 +305,6 @@ void initShaders(GLuint * program) { glUseProgram(program[PROG_BOID]); if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) { - glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); + glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); } - } +}