diff --git a/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx b/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx
new file mode 100644
index 0000000..be9a94e
Binary files /dev/null and b/.cache/clangd/index/cudaMat4.hpp.59DDD2E3599F7102.idx differ
diff --git a/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx b/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx
new file mode 100644
index 0000000..0c9f8f6
Binary files /dev/null and b/.cache/clangd/index/glslUtility.cpp.6B5C7C5C38F14405.idx differ
diff --git a/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx b/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx
new file mode 100644
index 0000000..dff3dce
Binary files /dev/null and b/.cache/clangd/index/glslUtility.hpp.1C4CD8843D047D6E.idx differ
diff --git a/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx b/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx
new file mode 100644
index 0000000..bb70170
Binary files /dev/null and b/.cache/clangd/index/kernel.h.86AC7F8FD0199582.idx differ
diff --git a/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx b/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx
new file mode 100644
index 0000000..a6661eb
Binary files /dev/null and b/.cache/clangd/index/main.cpp.59D3B8A164E7B469.idx differ
diff --git a/.cache/clangd/index/main.hpp.120473D743DD0E61.idx b/.cache/clangd/index/main.hpp.120473D743DD0E61.idx
new file mode 100644
index 0000000..6336ee5
Binary files /dev/null and b/.cache/clangd/index/main.hpp.120473D743DD0E61.idx differ
diff --git a/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx b/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx
new file mode 100644
index 0000000..edb3a4f
Binary files /dev/null and b/.cache/clangd/index/utilityCore.cpp.D6912F82B554AF4B.idx differ
diff --git a/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx b/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx
new file mode 100644
index 0000000..6e0125f
Binary files /dev/null and b/.cache/clangd/index/utilityCore.hpp.963F8090EB22A8F1.idx differ
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..e295476
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,213 @@
+# 语言: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto
+Language: Cpp
+# BasedOnStyle:	LLVM
+
+# 访问说明符(public、private等)的偏移
+AccessModifierOffset: -4
+
+# 开括号(开圆括号、开尖括号、开方括号)后的对齐: Align, DontAlign, AlwaysBreak(总是在开括号后换行)
+AlignAfterOpenBracket: Align
+
+# 连续赋值时，对齐所有等号
+AlignConsecutiveAssignments: false
+
+# 连续声明时，对齐所有声明的变量名
+AlignConsecutiveDeclarations: false
+
+# 右对齐逃脱换行(使用反斜杠换行)的反斜杠
+AlignEscapedNewlines: Right
+
+# 水平对齐二元和三元表达式的操作数
+AlignOperands: true
+
+# 对齐连续的尾随的注释
+AlignTrailingComments: true
+
+# 不允许函数声明的所有参数在放在下一行
+AllowAllParametersOfDeclarationOnNextLine: false
+
+# 不允许短的块放在同一行
+AllowShortBlocksOnASingleLine: true
+
+# 允许短的case标签放在同一行
+AllowShortCaseLabelsOnASingleLine: true
+
+# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中，空函数), All
+AllowShortFunctionsOnASingleLine: None
+
+# 允许短的if语句保持在同一行
+AllowShortIfStatementsOnASingleLine: true
+
+# 允许短的循环保持在同一行
+AllowShortLoopsOnASingleLine: true
+
+# 总是在返回类型后换行: None, All, TopLevel(顶级函数，不包括在类中的函数), 
+# AllDefinitions(所有的定义，不包括声明), TopLevelDefinitions(所有的顶级函数的定义)
+AlwaysBreakAfterReturnType: None
+
+# 总是在多行string字面量前换行
+AlwaysBreakBeforeMultilineStrings: false
+
+# 总是在template声明后换行
+AlwaysBreakTemplateDeclarations: true
+
+# false表示函数实参要么都在同一行，要么都各自一行
+BinPackArguments: true
+
+# false表示所有形参要么都在同一行，要么都各自一行
+BinPackParameters: true
+
+# 大括号换行，只有当BreakBeforeBraces设置为Custom时才有效
+BraceWrapping:
+  # class定义后面
+  AfterClass: false
+  # 控制语句后面
+  AfterControlStatement: false
+  # enum定义后面
+  AfterEnum: false
+  # 函数定义后面
+  AfterFunction: false
+  # 命名空间定义后面
+  AfterNamespace: false
+  # struct定义后面
+  AfterStruct: false
+  # union定义后面
+  AfterUnion: false
+  # extern之后
+  AfterExternBlock: false
+  # catch之前
+  BeforeCatch: false
+  # else之前
+  BeforeElse: false
+  # 缩进大括号
+  IndentBraces: false
+  # 分离空函数
+  SplitEmptyFunction: false
+  # 分离空语句
+  SplitEmptyRecord: false
+  # 分离空命名空间
+  SplitEmptyNamespace: false
+
+# 在二元运算符前换行: None(在操作符后换行), NonAssignment(在非赋值的操作符前换行), All(在操作符前换行)
+BreakBeforeBinaryOperators: NonAssignment
+
+# 在大括号前换行: Attach(始终将大括号附加到周围的上下文), Linux(除函数、命名空间和类定义，与Attach类似), 
+#   Mozilla(除枚举、函数、记录定义，与Attach类似), Stroustrup(除函数定义、catch、else，与Attach类似), 
+#   Allman(总是在大括号前换行), GNU(总是在大括号前换行，并对于控制语句的大括号增加额外的缩进), WebKit(在函数前换行), Custom
+#   注：这里认为语句块也属于函数
+BreakBeforeBraces: Custom
+
+# 在三元运算符前换行
+BreakBeforeTernaryOperators: false
+
+# 在构造函数的初始化列表的冒号后换行
+BreakConstructorInitializers: AfterColon
+
+#BreakInheritanceList: AfterColon
+
+BreakStringLiterals: false
+
+# 每行字符的限制，0表示没有限制
+ColumnLimit: 0
+
+CompactNamespaces: true
+
+# 构造函数的初始化列表要么都在同一行，要么都各自一行
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+
+# 构造函数的初始化列表的缩进宽度
+ConstructorInitializerIndentWidth: 4
+
+# 延续的行的缩进宽度
+ContinuationIndentWidth: 4
+
+# 去除C++11的列表初始化的大括号{后和}前的空格
+Cpp11BracedListStyle: true
+
+# 继承最常用的指针和引用的对齐方式
+DerivePointerAlignment: false
+
+# 固定命名空间注释
+FixNamespaceComments: true
+
+# 缩进case标签
+IndentCaseLabels: false
+
+IndentPPDirectives: None
+
+# 缩进宽度
+IndentWidth: 4
+
+# 函数返回类型换行时，缩进函数声明或函数定义的函数名
+IndentWrappedFunctionNames: false
+
+# 保留在块开始处的空行
+KeepEmptyLinesAtTheStartOfBlocks: false
+
+# 连续空行的最大数量
+MaxEmptyLinesToKeep: 1
+
+# 命名空间的缩进: None, Inner(缩进嵌套的命名空间中的内容), All
+NamespaceIndentation: None
+
+# 指针和引用的对齐: Left, Right, Middle
+PointerAlignment: Right
+
+# 允许重新排版注释
+ReflowComments: true
+
+# 允许排序#include
+SortIncludes: false
+
+# 允许排序 using 声明
+SortUsingDeclarations: false
+
+# 在C风格类型转换后添加空格
+SpaceAfterCStyleCast: false
+
+# 在Template 关键字后面添加空格
+SpaceAfterTemplateKeyword: true
+
+# 在赋值运算符之前添加空格
+SpaceBeforeAssignmentOperators: true
+
+# SpaceBeforeCpp11BracedList: true
+
+# SpaceBeforeCtorInitializerColon: true
+
+# SpaceBeforeInheritanceColon: true
+
+# 开圆括号之前添加一个空格: Never, ControlStatements, Always
+SpaceBeforeParens: ControlStatements
+
+# SpaceBeforeRangeBasedForLoopColon: true
+
+# 在空的圆括号中添加空格
+SpaceInEmptyParentheses: false
+
+# 在尾随的评论前添加的空格数(只适用于//)
+SpacesBeforeTrailingComments: 1
+
+# 在尖括号的<后和>前添加空格
+SpacesInAngles: false
+
+# 在C风格类型转换的括号中添加空格
+SpacesInCStyleCastParentheses: false
+
+# 在容器(ObjC和JavaScript的数组和字典等)字面量中添加空格
+SpacesInContainerLiterals: true
+
+# 在圆括号的(后和)前添加空格
+SpacesInParentheses: false
+
+# 在方括号的[后和]前添加空格，lamda表达式和未指明大小的数组的声明不受影响
+SpacesInSquareBrackets: false
+
+# 标准: Cpp03, Cpp11, Auto
+Standard: Cpp11
+
+# tab宽度
+TabWidth: 4
+
+# 使用tab字符: Never, ForIndentation, ForContinuationAndIndentation, Always
+UseTab: Never
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..8645ef1
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // 使用 IntelliSense 了解相关属性。 
+    // 悬停以查看现有属性的描述。
+    // 欲了解更多信息，请访问: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "lldb",
+            "request": "launch",
+            "name": "Debug",
+            "program": "${command:cmake.launchTargetPath}",
+            "args": [],
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/NoteBook.pdf b/NoteBook.pdf
new file mode 100644
index 0000000..402b97c
Binary files /dev/null and b/NoteBook.pdf differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..c33ed2f 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,36 +5,36 @@
 #include <glm/glm.hpp>
 #include "utilityCore.hpp"
 #include "kernel.h"
+#include <device_launch_parameters.h>
 
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
-#define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
+#define imax(a, b) (((a) > (b)) ? (a) : (b))
 #endif
 
 #ifndef imin
-#define imin( a, b ) ( ((a) < (b)) ? (a) : (b) )
+#define imin(a, b) (((a) < (b)) ? (a) : (b))
 #endif
 
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 /**
-* Check for CUDA errors; print and exit if there was a problem.
-*/
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
 void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
     }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
 }
 
-
 /*****************
-* Configuration *
-*****************/
+ * Configuration *
+ *****************/
 
 /*! Block size used for CUDA kernel launch. */
 #define blockSize 128
@@ -55,8 +55,8 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define scene_scale 100.0f
 
 /***********************************************
-* Kernel state (pointers are device pointers) *
-***********************************************/
+ * Kernel state (pointers are device pointers) *
+ ***********************************************/
 
 int numObjects;
 dim3 threadsPerBlock(blockSize);
@@ -75,7 +75,7 @@ glm::vec3 *dev_vel2;
 
 // For efficient sorting and the uniform grid. These should always be parallel.
 int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
-int *dev_particleGridIndices; // What grid cell is this particle in?
+int *dev_particleGridIndices;  // What grid cell is this particle in?
 // needed for use with thrust
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_sortedPos;
+glm::vec3 *dev_sortedVel;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -95,181 +97,219 @@ float gridInverseCellWidth;
 glm::vec3 gridMinimum;
 
 /******************
-* initSimulation *
-******************/
+ * initSimulation *
+ ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }
 
 /**
-* LOOK-1.2 - this is a typical helper function for a CUDA kernel.
-* Function for generating a random vec3.
-*/
+ * LOOK-1.2 - this is a typical helper function for a CUDA kernel.
+ * Function for generating a random vec3.
+ */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
-* LOOK-1.2 - This is a basic CUDA kernel.
-* CUDA kernel for generating boids with a specified mass randomly around the star.
-*/
-__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+ * LOOK-1.2 - This is a basic CUDA kernel.
+ * CUDA kernel for generating boids with a specified mass randomly around the star.
+ */
+__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 *arr, float scale) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }
 
 /**
-* Initialize memory, update some globals
-*/
+ * Initialize memory, update some globals
+ */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in  Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaDeviceSynchronize();
-}
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    // LOOK-1.2 - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in  Boids::endSimulation.
+    cudaMalloc((void **)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void **)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void **)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
+    kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+                                                                 dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+    cudaMalloc((void **)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("Malloc dev_particleArrayIndices failed!\n");
+
+    cudaMalloc((void **)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("Malloc dev_particleGridIndices failed!\n");
 
+    cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("Malloc dev_gridCellStartIndices failed!\n");
+
+    cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("Malloc dev_gridCellEndIndices failed!\n");
+
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
+    cudaMalloc((void **)&dev_sortedPos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("Malloc dev_sortedPos failed!\n");
+
+    cudaMalloc((void **)&dev_sortedVel, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("Malloc dev_sortedVel failed!\n");
+
+    cudaDeviceSynchronize();
+}
 
 /******************
-* copyBoidsToVBO *
-******************/
+ * copyBoidsToVBO *
+ ******************/
 
 /**
-* Copy the boid positions into the VBO so that they can be drawn by OpenGL.
-*/
+ * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
+ */
 __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 /**
-* Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
-*/
+ * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
+ */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 }
 
-
 /******************
-* stepSimulation *
-******************/
+ * stepSimulation *
+ ******************/
 
 /**
-* LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce.
-* __device__ code can be called from a __global__ context
-* Compute the new velocity on the body with index `iSelf` due to the `N` boids
-* in the `pos` and `vel` arrays.
-*/
+ * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce.
+ * __device__ code can be called from a __global__ context
+ * Compute the new velocity on the body with index `iSelf` due to the `N` boids
+ * in the `pos` and `vel` arrays.
+ */
+
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    // Rule 2: boids try to stay a distance d away from each other
+    // Rule 3: boids try to match the speed of surrounding boids
+    glm::vec3 boidPos = pos[iSelf];
+    glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f);
+    int n1 = 0, n3 = 0;
+    for (int i = 0; i < N; i++) {
+        if (i == iSelf) continue;
+        float distance = glm::distance(boidPos, pos[i]);
+        if (distance < rule1Distance) perceived_center += pos[i], n1++;
+        if (distance < rule2Distance) c -= pos[i] - boidPos;
+        if (distance < rule3Distance) perceived_velocity += vel[i], n3++;
+    }
+    if (n1 > 0) perceived_center /= static_cast<float>(n1), ret += (perceived_center - boidPos) * rule1Scale;
+    ret += c * rule2Scale;
+    if (n3 > 0) perceived_velocity /= static_cast<float>(n3), ret += perceived_velocity * rule3Scale;
+    return ret;
 }
 
 /**
-* TODO-1.2 implement basic flocking
-* For each of the `N` bodies, update its position based on its current velocity.
-*/
+ * TODO-1.2 implement basic flocking
+ * For each of the `N` bodies, update its position based on its current velocity.
+ */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+                                             glm::vec3 *vel1, glm::vec3 *vel2) {
+    // Compute a new velocity based on pos and vel1
+    // Clamp the speed
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+    glm::vec3 deltaVel = computeVelocityChange(N, index, pos, vel1);
+    glm::vec3 curVel = vel1[index];
+    vel2[index] = glm::clamp(curVel + deltaVel, -maxSpeed, maxSpeed);
 }
 
 /**
-* LOOK-1.2 Since this is pretty trivial, we implemented it for you.
-* For each of the `N` bodies, update its position based on its current velocity.
-*/
+ * LOOK-1.2 Since this is pretty trivial, we implemented it for you.
+ * For each of the `N` bodies, update its position based on its current velocity.
+ */
 __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
 
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
 
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
 
-  pos[index] = thisPos;
+    pos[index] = thisPos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -279,179 +319,342 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 //            for(y)
 //             for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
 __global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
+                                   glm::vec3 gridMin, float inverseCellWidth,
+                                   glm::vec3 *pos, int *indices, int *gridIndices) {
     // TODO-2.1
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int boidIndex = blockIdx.x * blockDim.x + threadIdx.x; // boid index
+    if (boidIndex >= N) return;
+    glm::vec3 offset = pos[boidIndex] - gridMin;
+    float xIndex = offset.x * inverseCellWidth;
+    float yIndex = offset.y * inverseCellWidth;
+    float zIndex = offset.z * inverseCellWidth;
+    int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution);
+    // key
+    indices[boidIndex] = boidIndex;
+    // value
+    gridIndices[boidIndex] = gridIndex;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 //          does not enclose any boids
 __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
 }
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+                                         int *gridCellStartIndices, int *gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+
+    // We must do this after sort
+    int boidIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (boidIndex >= N) return;
+    int gridIndex = particleGridIndices[boidIndex];
+    if (boidIndex > 0) {
+        int preGridIndex = particleGridIndices[boidIndex - 1];
+        if (preGridIndex != gridIndex) {
+            gridCellStartIndices[gridIndex] = boidIndex;
+            gridCellEndIndices[preGridIndex] = boidIndex;
+        }
+        if (preGridIndex == N - 1) {
+            gridCellEndIndices[gridIndex] = boidIndex;
+        }
+    } else {
+        gridCellStartIndices[gridIndex] = boidIndex;
+    }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+    int boidIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    if (boidIndex >= N) return;
+    glm::vec3 curPos = pos[boidIndex];
+    glm::vec3 offset = curPos - gridMin;
+    float xIndex = offset.x * inverseCellWidth;
+    float yIndex = offset.y * inverseCellWidth;
+    float zIndex = offset.z * inverseCellWidth;
+    int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution);
+
+    // find the bounding box of A GRID
+    int X_MAX = imin(xIndex + 1, gridResolution - 1);
+    int X_MIN = imax(xIndex - 1, 0);
+    int Y_MAX = imin(yIndex + 1, gridResolution - 1);
+    int Y_MIN = imax(yIndex - 1, 0);
+    int Z_MAX = imin(zIndex + 1, gridResolution - 1);
+    int Z_MIN = imax(zIndex - 1, 0);
+
+    glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f);
+    int n1 = 0, n3 = 0;
+
+    for (int x = X_MIN; x <= X_MAX; x++) {
+        for (int y = Y_MIN; y <= Y_MAX; y++) {
+            for (int z = Z_MIN; z <= Z_MAX; z++) {
+                int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution);
+                int neighborGridStart = gridCellStartIndices[neighborGridIndex];
+                int neighborGridEnd = gridCellEndIndices[neighborGridIndex];
+                for (int i = neighborGridStart; i <= neighborGridEnd; i++) {
+                    int this_boidIndex = particleArrayIndices[i];
+                    if (this_boidIndex != boidIndex) {
+                        float distance = glm::distance(pos[this_boidIndex], curPos);
+                        if (distance < rule1Distance) perceived_center += pos[this_boidIndex], n1++;
+                        if (distance < rule2Distance) c -= pos[this_boidIndex] - curPos;
+                        if (distance < rule3Distance) perceived_velocity += vel1[this_boidIndex], n3++;
+                    }
+                }
+            }
+        }
+    }
+    if (n1 > 0) perceived_center /= static_cast<float>(n1), ret += (perceived_center - curPos) * rule1Scale;
+    ret += c * rule2Scale;
+    if (n3 > 0) perceived_velocity /= static_cast<float>(n3), ret += perceived_velocity * rule3Scale;
+
+    vel2[boidIndex] = glm::clamp(vel1[boidIndex] + ret, -maxSpeed, maxSpeed);
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection.
+    // This should expect gridCellStartIndices and gridCellEndIndices to refer
+    // directly to pos and vel1.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    //   DIFFERENCE: For best results, consider what order the cells should be
+    //   checked in to maximize the memory benefits of reordering the boids data.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+    int boidIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    if (boidIndex >= N) return;
+    glm::vec3 curPos = pos[boidIndex];
+    glm::vec3 offset = curPos - gridMin;
+    float xIndex = offset.x * inverseCellWidth;
+    float yIndex = offset.y * inverseCellWidth;
+    float zIndex = offset.z * inverseCellWidth;
+    int gridIndex = gridIndex3Dto1D(xIndex, yIndex, zIndex, gridResolution);
+
+    // find the bounding box of A GRID
+    int X_MAX = imin(xIndex + 1, gridResolution - 1);
+    int X_MIN = imax(xIndex - 1, 0);
+    int Y_MAX = imin(yIndex + 1, gridResolution - 1);
+    int Y_MIN = imax(yIndex - 1, 0);
+    int Z_MAX = imin(zIndex + 1, gridResolution - 1);
+    int Z_MIN = imax(zIndex - 1, 0);
+
+    glm::vec3 perceived_center(0.0f), perceived_velocity(0.0f), c(0.0f), ret(0.0f);
+    int n1 = 0, n3 = 0;
+
+    for (int x = X_MIN; x <= X_MAX; x++) {
+        for (int y = Y_MIN; y <= Y_MAX; y++) {
+            for (int z = Z_MIN; z <= Z_MAX; z++) {
+                int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution);
+                int neighborGridStart = gridCellStartIndices[neighborGridIndex];
+                int neighborGridEnd = gridCellEndIndices[neighborGridIndex];
+                for (int i = neighborGridStart; i <= neighborGridEnd; i++) {
+                    if (i != boidIndex) {
+                        float distance = glm::distance(pos[i], curPos);
+                        if (distance < rule1Distance) perceived_center += pos[i], n1++;
+                        if (distance < rule2Distance) c -= pos[i] - curPos;
+                        if (distance < rule3Distance) perceived_velocity += vel1[i], n3++;
+                    }
+                }
+            }
+        }
+    }
+    if (n1 > 0) perceived_center /= static_cast<float>(n1), ret += (perceived_center - curPos) * rule1Scale;
+    ret += c * rule2Scale;
+    if (n3 > 0) perceived_velocity /= static_cast<float>(n3), ret += perceived_velocity * rule3Scale;
+
+    vel2[boidIndex] = glm::clamp(vel1[boidIndex] + ret, -maxSpeed, maxSpeed);
 }
 
 /**
-* Step the entire N-body simulation by `dt` seconds.
-*/
+ * Step the entire N-body simulation by `dt` seconds.
+ */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); // 分块，每块的大小是[n / blockSize]，向上取整，所以(numObjects + blockSize - 1) /  blockSize
+    kernUpdateVelocityBruteForce<<<blocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    kernUpdatePos<<<blocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    // TODO-1.2 ping-pong the velocity buffers
+    std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //   Use 2x width grids.
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    // - Perform velocity updates using neighbor search
+    // - Update positions
+    // - Ping-pong buffers as needed
+    dim3 blocksPerGridBoids((numObjects + blockSize - 1) / blockSize);
+    kernComputeIndices<<<blocksPerGridBoids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_particleArrayIndices);
+
+    dim3 blocksPerGridCells((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<blocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<blocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernIdentifyCellStartEnd<<<blocksPerGridCells, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    kernUpdateVelNeighborSearchScattered<<<blocksPerGridBoids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+                                                                            dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices,
+                                                                            dev_pos, dev_vel1, dev_vel2);
+    kernUpdatePos<<<blocksPerGridBoids, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    std::swap(dev_vel1, dev_vel2);
+}
+
+__global__ void kernSortPosAndVel(int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *sortedPos, glm::vec3 *vel, glm::vec3 *sortedVel) {
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= N) return;
+    int boidIndex = particleArrayIndices[index];
+    sortedPos[index] = pos[boidIndex];
+    sortedVel[index] = vel[boidIndex];
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+    // In Parallel:
+    // - Label each particle with its array index as well as its grid index.
+    //   Use 2x width grids
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //   the particle data in the simulation array.
+    //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+    // - Perform velocity updates using neighbor search
+    // - Update positions
+    // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    dim3 blocksPerGridBoids((numObjects + blockSize - 1) / blockSize);
+    kernComputeIndices<<<blocksPerGridBoids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_particleArrayIndices);
+
+    dim3 blocksPerGridCells((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<blocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<blocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernIdentifyCellStartEnd<<<blocksPerGridCells, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    kernSortPosAndVel<<<blocksPerGridBoids, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_sortedPos, dev_vel1, dev_sortedVel);
+
+    kernUpdateVelNeighborSearchCoherent<<<blocksPerGridBoids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+                                                                            dev_gridCellStartIndices, dev_gridCellEndIndices,
+                                                                            dev_sortedPos, dev_sortedVel, dev_vel2);
+    kernUpdatePos<<<blocksPerGridBoids, blockSize>>>(numObjects, dt, dev_sortedPos, dev_vel2);
+    std::swap(dev_vel1, dev_vel2);
+    std::swap(dev_pos, dev_sortedPos);
 }
 
 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
+
+    // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
+
+    cudaFree(dev_sortedPos);
+    cudaFree(dev_sortedVel);
 }
 
 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  std::unique_ptr<int[]>intKeys{ new int[N] };
-  std::unique_ptr<int[]>intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Feel free to write additional tests here.
+
+    // test unstable sort
+    int *dev_intKeys;
+    int *dev_intValues;
+    int N = 10;
+
+    std::unique_ptr<int[]> intKeys{new int[N]};
+    std::unique_ptr<int[]> intValues{new int[N]};
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void **)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void **)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // How to copy data to the GPU
+    cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap device vectors in thrust iterators for use with thrust.
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+    // How to copy data back to the CPU side from the GPU
+    cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // cleanup
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..f720880 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -24,16 +24,16 @@ const float DT = 0.2f;
 /**
 * C main function.
 */
-int main(int argc, char* argv[]) {
-  projectName = "565 CUDA Intro: Boids";
-
-  if (init(argc, argv)) {
-    mainLoop();
-    Boids::endSimulation();
-    return 0;
-  } else {
-    return 1;
-  }
+int main(int argc, char *argv[]) {
+    projectName = "565 CUDA Intro: Boids";
+
+    if (init(argc, argv)) {
+        mainLoop();
+        Boids::endSimulation();
+        return 0;
+    } else {
+        return 1;
+    }
 }
 
 //-------------------------------
@@ -47,143 +47,141 @@ GLFWwindow *window;
 * Initialization of CUDA and GLFW.
 */
 bool init(int argc, char **argv) {
-  // Set window title to "Student Name: [SM 2.0] GPU Name"
-  cudaDeviceProp deviceProp;
-  int gpuDevice = 0;
-  int device_count = 0;
-  cudaGetDeviceCount(&device_count);
-  if (gpuDevice > device_count) {
-    std::cout
-    << "Error: GPU device number is greater than the number of devices!"
-    << " Perhaps a CUDA-capable GPU is not installed?"
-    << std::endl;
-    return false;
-  }
-  cudaGetDeviceProperties(&deviceProp, gpuDevice);
-  int major = deviceProp.major;
-  int minor = deviceProp.minor;
-
-  std::ostringstream ss;
-  ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]";
-  deviceName = ss.str();
-
-  // Window setup stuff
-  glfwSetErrorCallback(errorCallback);
-
-  if (!glfwInit()) {
-    std::cout
-    << "Error: Could not initialize GLFW!"
-    << " Perhaps OpenGL 3.3 isn't available?"
-    << std::endl;
-    return false;
-  }
-
-  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
-  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
-  glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
-  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
-
-  window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL);
-  if (!window) {
-    glfwTerminate();
-    return false;
-  }
-  glfwMakeContextCurrent(window);
-  glfwSetKeyCallback(window, keyCallback);
-  glfwSetCursorPosCallback(window, mousePositionCallback);
-  glfwSetMouseButtonCallback(window, mouseButtonCallback);
+    // Set window title to "Student Name: [SM 2.0] GPU Name"
+    cudaDeviceProp deviceProp;
+    int gpuDevice = 0;
+    int device_count = 0;
+    cudaGetDeviceCount(&device_count);
+    if (gpuDevice > device_count) {
+        std::cout
+            << "Error: GPU device number is greater than the number of devices!"
+            << " Perhaps a CUDA-capable GPU is not installed?"
+            << std::endl;
+        return false;
+    }
+    cudaGetDeviceProperties(&deviceProp, gpuDevice);
+    int major = deviceProp.major;
+    int minor = deviceProp.minor;
+
+    std::ostringstream ss;
+    ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]";
+    deviceName = ss.str();
+
+    // Window setup stuff
+    glfwSetErrorCallback(errorCallback);
+
+    if (!glfwInit()) {
+        std::cout
+            << "Error: Could not initialize GLFW!"
+            << " Perhaps OpenGL 3.3 isn't available?"
+            << std::endl;
+        return false;
+    }
 
-  glewExperimental = GL_TRUE;
-  if (glewInit() != GLEW_OK) {
-    return false;
-  }
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
 
-  // Initialize drawing state
-  initVAO();
+    window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL);
+    if (!window) {
+        glfwTerminate();
+        return false;
+    }
+    glfwMakeContextCurrent(window);
+    glfwSetKeyCallback(window, keyCallback);
+    glfwSetCursorPosCallback(window, mousePositionCallback);
+    glfwSetMouseButtonCallback(window, mouseButtonCallback);
+
+    glewExperimental = GL_TRUE;
+    if (glewInit() != GLEW_OK) {
+        return false;
+    }
 
-  // Default to device ID 0. If you have more than one GPU and want to test a non-default one,
-  // change the device ID.
-  cudaGLSetGLDevice(0);
+    // Initialize drawing state
+    initVAO();
 
-  cudaGLRegisterBufferObject(boidVBO_positions);
-  cudaGLRegisterBufferObject(boidVBO_velocities);
+    // Default to device ID 0. If you have more than one GPU and want to test a non-default one,
+    // change the device ID.
+    cudaGLSetGLDevice(0);
 
-  // Initialize N-body simulation
-  Boids::initSimulation(N_FOR_VIS);
+    cudaGLRegisterBufferObject(boidVBO_positions);
+    cudaGLRegisterBufferObject(boidVBO_velocities);
 
-  updateCamera();
+    // Initialize N-body simulation
+    Boids::initSimulation(N_FOR_VIS);
 
-  initShaders(program);
+    updateCamera();
 
-  glEnable(GL_DEPTH_TEST);
+    initShaders(program);
 
-  return true;
+    glEnable(GL_DEPTH_TEST);
+
+    return true;
 }
 
 void initVAO() {
+    std::unique_ptr<GLfloat[]> bodies{new GLfloat[4 * (N_FOR_VIS)]};
+    std::unique_ptr<GLuint[]> bindices{new GLuint[N_FOR_VIS]};
+
+    glm::vec4 ul(-1.0, -1.0, 1.0, 1.0);
+    glm::vec4 lr(1.0, 1.0, 0.0, 0.0);
+
+    for (int i = 0; i < N_FOR_VIS; i++) {
+        bodies[4 * i + 0] = 0.0f;
+        bodies[4 * i + 1] = 0.0f;
+        bodies[4 * i + 2] = 0.0f;
+        bodies[4 * i + 3] = 1.0f;
+        bindices[i] = i;
+    }
 
-  std::unique_ptr<GLfloat[]> bodies{ new GLfloat[4 * (N_FOR_VIS)] };
-  std::unique_ptr<GLuint[]> bindices{ new GLuint[N_FOR_VIS] };
-
-  glm::vec4 ul(-1.0, -1.0, 1.0, 1.0);
-  glm::vec4 lr(1.0, 1.0, 0.0, 0.0);
-
-  for (int i = 0; i < N_FOR_VIS; i++) {
-    bodies[4 * i + 0] = 0.0f;
-    bodies[4 * i + 1] = 0.0f;
-    bodies[4 * i + 2] = 0.0f;
-    bodies[4 * i + 3] = 1.0f;
-    bindices[i] = i;
-  }
-
-
-  glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this
-  glGenBuffers(1, &boidVBO_positions);
-  glGenBuffers(1, &boidVBO_velocities);
-  glGenBuffers(1, &boidIBO);
+    glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this
+    glGenBuffers(1, &boidVBO_positions);
+    glGenBuffers(1, &boidVBO_velocities);
+    glGenBuffers(1, &boidIBO);
 
-  glBindVertexArray(boidVAO);
+    glBindVertexArray(boidVAO);
 
-  // Bind the positions array to the boidVAO by way of the boidVBO_positions
-  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer
-  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data
+    // Bind the positions array to the boidVAO by way of the boidVBO_positions
+    glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions);                                                // bind the buffer
+    glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data
 
-  glEnableVertexAttribArray(positionLocation);
-  glVertexAttribPointer((GLuint)positionLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
+    glEnableVertexAttribArray(positionLocation);
+    glVertexAttribPointer((GLuint)positionLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
 
-  // Bind the velocities array to the boidVAO by way of the boidVBO_velocities
-  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_velocities);
-  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW);
-  glEnableVertexAttribArray(velocitiesLocation);
-  glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
+    // Bind the velocities array to the boidVAO by way of the boidVBO_velocities
+    glBindBuffer(GL_ARRAY_BUFFER, boidVBO_velocities);
+    glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW);
+    glEnableVertexAttribArray(velocitiesLocation);
+    glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
 
-  glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, boidIBO);
-  glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint), bindices.get(), GL_STATIC_DRAW);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, boidIBO);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint), bindices.get(), GL_STATIC_DRAW);
 
-  glBindVertexArray(0);
+    glBindVertexArray(0);
 }
 
-void initShaders(GLuint * program) {
-  GLint location;
+void initShaders(GLuint *program) {
+    GLint location;
 
-  program[PROG_BOID] = glslUtility::createProgram(
-    "shaders/boid.vert.glsl",
-    "shaders/boid.geom.glsl",
-    "shaders/boid.frag.glsl", attributeLocations, 2);
+    program[PROG_BOID] = glslUtility::createProgram(
+        "shaders/boid.vert.glsl",
+        "shaders/boid.geom.glsl",
+        "shaders/boid.frag.glsl", attributeLocations, 2);
     glUseProgram(program[PROG_BOID]);
 
     if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
+        glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
     }
     if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) {
-      glUniform3fv(location, 1, &cameraPosition[0]);
+        glUniform3fv(location, 1, &cameraPosition[0]);
     }
-  }
+}
 
-  //====================================
-  // Main loop
-  //====================================
-  void runCUDA() {
+//====================================
+// Main loop
+//====================================
+void runCUDA() {
     // Map OpenGL buffer object for writing from CUDA on a single GPU
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
     // use this buffer
@@ -192,109 +190,108 @@ void initShaders(GLuint * program) {
     float *dptrVertPositions = NULL;
     float *dptrVertVelocities = NULL;
 
-    cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
-    cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);
+    cudaGLMapBufferObject((void **)&dptrVertPositions, boidVBO_positions);
+    cudaGLMapBufferObject((void **)&dptrVertVelocities, boidVBO_velocities);
 
     // execute the kernel
-    #if UNIFORM_GRID && COHERENT_GRID
+#if UNIFORM_GRID && COHERENT_GRID
     Boids::stepSimulationCoherentGrid(DT);
-    #elif UNIFORM_GRID
+#elif UNIFORM_GRID
     Boids::stepSimulationScatteredGrid(DT);
-    #else
+#else
     Boids::stepSimulationNaive(DT);
-    #endif
+#endif
 
-    #if VISUALIZE
+#if VISUALIZE
     Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
-    #endif
+#endif
     // unmap buffer object
     cudaGLUnmapBufferObject(boidVBO_positions);
     cudaGLUnmapBufferObject(boidVBO_velocities);
-  }
+}
 
-  void mainLoop() {
+void mainLoop() {
     double fps = 0;
     double timebase = 0;
     int frame = 0;
 
     Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
-                       // your CUDA development setup is ready to go.
+    // your CUDA development setup is ready to go.
 
     while (!glfwWindowShouldClose(window)) {
-      glfwPollEvents();
+        glfwPollEvents();
 
-      frame++;
-      double time = glfwGetTime();
+        frame++;
+        double time = glfwGetTime();
 
-      if (time - timebase > 1.0) {
-        fps = frame / (time - timebase);
-        timebase = time;
-        frame = 0;
-      }
+        if (time - timebase > 1.0) {
+            fps = frame / (time - timebase);
+            timebase = time;
+            frame = 0;
+        }
 
-      runCUDA();
+        runCUDA();
 
-      std::ostringstream ss;
-      ss << "[";
-      ss.precision(1);
-      ss << std::fixed << fps;
-      ss << " fps] " << deviceName;
-      glfwSetWindowTitle(window, ss.str().c_str());
+        std::ostringstream ss;
+        ss << "[";
+        ss.precision(1);
+        ss << std::fixed << fps;
+        ss << " fps] " << deviceName;
+        glfwSetWindowTitle(window, ss.str().c_str());
 
-      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
-      #if VISUALIZE
-      glUseProgram(program[PROG_BOID]);
-      glBindVertexArray(boidVAO);
-      glPointSize((GLfloat)pointSize);
-      glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
-      glPointSize(1.0f);
+#if VISUALIZE
+        glUseProgram(program[PROG_BOID]);
+        glBindVertexArray(boidVAO);
+        glPointSize((GLfloat)pointSize);
+        glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
+        glPointSize(1.0f);
 
-      glUseProgram(0);
-      glBindVertexArray(0);
+        glUseProgram(0);
+        glBindVertexArray(0);
 
-      glfwSwapBuffers(window);
-      #endif
+        glfwSwapBuffers(window);
+#endif
     }
     glfwDestroyWindow(window);
     glfwTerminate();
-  }
+}
 
 
-  void errorCallback(int error, const char *description) {
+void errorCallback(int error, const char *description) {
     fprintf(stderr, "error %d: %s\n", error, description);
-  }
+}
 
-  void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) {
+void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) {
     if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
-      glfwSetWindowShouldClose(window, GL_TRUE);
+        glfwSetWindowShouldClose(window, GL_TRUE);
     }
-  }
+}
 
-  void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) {
+void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods) {
     leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS);
     rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS);
-  }
+}
 
-  void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) {
+void mousePositionCallback(GLFWwindow *window, double xpos, double ypos) {
     if (leftMousePressed) {
-      // compute new camera parameters
-      phi += (xpos - lastX) / width;
-      theta -= (ypos - lastY) / height;
-      theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
-      updateCamera();
-    }
-    else if (rightMousePressed) {
-      zoom += (ypos - lastY) / height;
-      zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
-      updateCamera();
+        // compute new camera parameters
+        phi += (xpos - lastX) / width;
+        theta -= (ypos - lastY) / height;
+        theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
+        updateCamera();
+    } else if (rightMousePressed) {
+        zoom += (ypos - lastY) / height;
+        zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
+        updateCamera();
     }
 
-	lastX = xpos;
-	lastY = ypos;
-  }
+    lastX = xpos;
+    lastY = ypos;
+}
 
-  void updateCamera() {
+void updateCamera() {
     cameraPosition.x = zoom * sin(phi) * sin(theta);
     cameraPosition.z = zoom * cos(theta);
     cameraPosition.y = zoom * cos(phi) * sin(theta);
@@ -308,6 +305,6 @@ void initShaders(GLuint * program) {
 
     glUseProgram(program[PROG_BOID]);
     if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
+        glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
     }
-  }
+}